From 093c4d58d10a16ddb83f8fecc4a57a9913cc2f28 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 17 Sep 2024 10:28:00 -0700 Subject: [PATCH 01/25] draft --- src/helm/benchmark/presentation/schema.py | 2 + src/helm/benchmark/presentation/summarize.py | 40 +++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index f71816b6b4..ef90d1a5d3 100644 --- a/src/helm/benchmark/presentation/schema.py +++ b/src/helm/benchmark/presentation/schema.py @@ -119,6 +119,8 @@ class MetricGroup(Field): hide_win_rates: Optional[bool] = None """If set to true, do not compute win rates.""" + add_mean_col: Optional[bool] = None + BY_METRIC = "by_metric" BY_GROUP = "by_group" diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 82828ae5ba..ca57f094e2 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -251,6 +251,29 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") -> return aggregate_win_rates +def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: + """ + Computes the aggregate mean of each row across columns. + Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all + non-null values of the row are in columns we skip). + """ + + means_per_row: List[List[float]] = [[] for _ in table.rows] + for row in table.rows: + total = 0 + count = 0 + for cell in enumerate(row): + if cell.value: + total += cell.value + count += 1 + if count == 0: + means_per_row.append(None) + else: + means_per_row.append(total / count) + + return means_per_row + + AGGREGATE_WIN_RATE_COLUMN = 1 @@ -881,6 +904,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, + add_mean_col: bool = False, ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of @@ -1063,7 +1087,20 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) - if add_win_rate: + if add_mean_col: + means = compute_aggregate_row_means(table, aggregation=WIN_RATE_AGGREGATION) + description = "An average over columns representing the mean performance" + table.header.insert( + AGGREGATE_WIN_RATE_COLUMN, + HeaderCell( + f"Mean Performance", + description=description, + lower_is_better=False, + ), + ) + for row, win_rate in zip(table.rows, win_rates): + row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate)) + elif add_win_rate: # add overall win rate as the second column WIN_RATE_AGGREGATION = "mean" win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) @@ -1133,6 +1170,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, + add_mean_col=self.schema.name_to_metric_group[metric_group].add_mean_col, ) tables.append(table) return tables From 2dac8aab1c37d6438bea9d7e1ce6ad9341ac3002 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 17 Sep 2024 10:31:51 -0700 Subject: [PATCH 02/25] fix flake --- src/helm/benchmark/presentation/summarize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index ca57f094e2..fcc95d5da4 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1088,18 +1088,18 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) if add_mean_col: - means = compute_aggregate_row_means(table, aggregation=WIN_RATE_AGGREGATION) + means = compute_aggregate_row_means(table) description = "An average over columns representing the mean performance" table.header.insert( AGGREGATE_WIN_RATE_COLUMN, HeaderCell( - f"Mean Performance", + "Mean Performance", description=description, lower_is_better=False, ), ) - for row, win_rate in zip(table.rows, win_rates): - row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate)) + for row, row_mean in zip(table.rows, means): + row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(row_mean)) elif add_win_rate: # add overall win rate as the second column WIN_RATE_AGGREGATION = "mean" From d737f81ab04ede259abc903aea759d054ad55a5e Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 17 Sep 2024 19:20:59 -0700 Subject: [PATCH 03/25] fix --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index fcc95d5da4..f30d42bdb9 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -258,7 +258,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: non-null values of the row are in columns we skip). """ - means_per_row: List[List[float]] = [[] for _ in table.rows] + means_per_row: List[Optional[float]] = [] for row in table.rows: total = 0 count = 0 From c932aae3b9ee818da50d5994ccb93a066cf08e8f Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 17 Sep 2024 19:23:14 -0700 Subject: [PATCH 04/25] another fix --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index f30d42bdb9..1d5eac81bc 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1170,7 +1170,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, - add_mean_col=self.schema.name_to_metric_group[metric_group].add_mean_col, + add_mean_col=bool(self.schema.name_to_metric_group[metric_group].add_mean_col), ) tables.append(table) return tables From 3ffd3d5189129e7937b94d8ba9894ac0bcde78b2 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 17 Sep 2024 19:34:36 -0700 Subject: [PATCH 05/25] bugfix --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 1d5eac81bc..1760391ac9 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -262,7 +262,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: for row in table.rows: total = 0 count = 0 - for cell in enumerate(row): + for cell in row: if cell.value: total += cell.value count += 1 From 565bcf45ff1216ebc6332abcf74b52b1abd7004c Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 20:52:46 -0700 Subject: [PATCH 06/25] working aggregation --- src/helm/benchmark/presentation/schema.py | 13 +++++- src/helm/benchmark/presentation/summarize.py | 47 ++++++++++++-------- src/helm/benchmark/static/schema_safety.yaml | 1 + 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index ef90d1a5d3..db9aff6e81 100644 --- a/src/helm/benchmark/presentation/schema.py +++ b/src/helm/benchmark/presentation/schema.py @@ -1,7 +1,8 @@ import ast import dataclasses from dataclasses import dataclass, field -from typing import List, Optional, Dict +from enum import IntEnum +from typing import List, Optional, Dict, Union import dacite from inspect import cleandoc import mako.template @@ -108,6 +109,14 @@ def substitute(self, environment: Dict[str, str]) -> "MetricNameMatcher": ) +@dataclass(frozen=True) +class AggregationStrategy(IntEnum): + USE_NONE = 0 + USE_MWR = 1 + USE_MEAN = 2 + USE_BOTH = 3 + + @dataclass(frozen=True) class MetricGroup(Field): """ @@ -119,7 +128,7 @@ class MetricGroup(Field): hide_win_rates: Optional[bool] = None """If set to true, do not compute win rates.""" - add_mean_col: Optional[bool] = None + aggregation_strategy: Optional[Union[AggregationStrategy, int]] = 1 BY_METRIC = "by_metric" diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 1760391ac9..5528a23487 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -263,9 +263,12 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: total = 0 count = 0 for cell in row: - if cell.value: - total += cell.value - count += 1 + try: + if cell.value: + total += float(cell.value) + count += 1 + except Exception: + print("failed") if count == 0: means_per_row.append(None) else: @@ -904,7 +907,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - add_mean_col: bool = False, + aggregation_strategy: int = 0, ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of @@ -1087,20 +1090,10 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) - if add_mean_col: - means = compute_aggregate_row_means(table) - description = "An average over columns representing the mean performance" - table.header.insert( - AGGREGATE_WIN_RATE_COLUMN, - HeaderCell( - "Mean Performance", - description=description, - lower_is_better=False, - ), - ) - for row, row_mean in zip(table.rows, means): - row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(row_mean)) - elif add_win_rate: + add_mean_col = aggregation_strategy >= 2 + add_mwr = aggregation_strategy % 2 != 0 or add_win_rate # values 1 or 3 say to include mwr + + if add_mwr: # add overall win rate as the second column WIN_RATE_AGGREGATION = "mean" win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) @@ -1115,6 +1108,22 @@ def _adapter_spec_sort_key(spec): ) for row, win_rate in zip(table.rows, win_rates): row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate)) + if add_mean_col: + means = compute_aggregate_row_means(table) + description = "An average over columns representing the mean performance" + insertion_column = AGGREGATE_WIN_RATE_COLUMN + if add_mwr: + insertion_column += 1 + table.header.insert( + insertion_column, + HeaderCell( + "Mean Performance", + description=description, + lower_is_better=False, + ), + ) + for row, row_mean in zip(table.rows, means): + row.insert(insertion_column, Cell(row_mean)) if bold_columns: for i, header_cell in enumerate(table.header): @@ -1170,7 +1179,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, - add_mean_col=bool(self.schema.name_to_metric_group[metric_group].add_mean_col), + aggregation_strategy=self.schema.name_to_metric_group[metric_group].aggregation_strategy, ) tables.append(table) return tables diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml index 32239777fc..49026a674c 100644 --- a/src/helm/benchmark/static/schema_safety.yaml +++ b/src/helm/benchmark/static/schema_safety.yaml @@ -106,6 +106,7 @@ perturbations: [] metric_groups: - name: accuracy display_name: Accuracy + aggregation_strategy: 3 metrics: - name: ${main_name} split: ${main_split} From 2b88e6a5425b79ed66f0d1ca920c6e7364251fa0 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 20:56:17 -0700 Subject: [PATCH 07/25] fix --- src/helm/benchmark/presentation/summarize.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 5528a23487..a93fcdc115 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1090,8 +1090,12 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) - add_mean_col = aggregation_strategy >= 2 - add_mwr = aggregation_strategy % 2 != 0 or add_win_rate # values 1 or 3 say to include mwr + add_mean_col = ( + aggregation_strategy >= 2 + ) # values 2 or 3 indicate we should include mean (see AggregationStrategy enum) + add_mwr = ( + aggregation_strategy % 2 != 0 or add_win_rate + ) # values 1 or 3 say to include mwr (see AggregationStrategy enum) if add_mwr: # add overall win rate as the second column From f8fc636823fcb912aa509e30e99385c45da2efa3 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 21:04:03 -0700 Subject: [PATCH 08/25] fix err --- src/helm/benchmark/presentation/summarize.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index a93fcdc115..764307a246 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1176,6 +1176,11 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() + agg_strat = ( + self.schema.name_to_metric_group[metric_group].aggregation_strategy + if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None + else 1 + ) table = self.create_group_table( name=metric_group, title=display_name, @@ -1183,7 +1188,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, - aggregation_strategy=self.schema.name_to_metric_group[metric_group].aggregation_strategy, + aggregation_strategy=agg_strat, ) tables.append(table) return tables From 2e7545cee538d22af619b6d52dafe44eeb307431 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 21:20:59 -0700 Subject: [PATCH 09/25] cast --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 764307a246..b9210747f1 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1176,7 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() - agg_strat = ( + agg_strat: int = ( self.schema.name_to_metric_group[metric_group].aggregation_strategy if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None else 1 From c6328bcfcb805f921524b2c0498f36fa039b10fa Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 21:33:28 -0700 Subject: [PATCH 10/25] fix --- src/helm/benchmark/presentation/summarize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index b9210747f1..cf4b6d558d 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -18,7 +18,7 @@ from collections import defaultdict from dataclasses import dataclass, replace from statistics import mean, median -from typing import List, Optional, Dict, Any, Tuple, Set +from typing import List, Optional, Dict, Any, Tuple, Set, Union from tqdm import tqdm from helm.benchmark.model_deployment_registry import get_model_deployment @@ -907,7 +907,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - aggregation_strategy: int = 0, + aggregation_strategy: Union[int,None] = 0, ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of From e0af08de72abf2368bc773e868a2bb3776e91c54 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 21:39:37 -0700 Subject: [PATCH 11/25] fmt --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index cf4b6d558d..0a1dcc86ed 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -907,7 +907,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - aggregation_strategy: Union[int,None] = 0, + aggregation_strategy: Union[int, None] = 0, ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of From f3ea2a5a65fc88e31c9885d72e10c9d04d5dc06d Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 21:54:22 -0700 Subject: [PATCH 12/25] fmt --- src/helm/benchmark/presentation/summarize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 0a1dcc86ed..763fcdb7f6 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -907,7 +907,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - aggregation_strategy: Union[int, None] = 0, + aggregation_strategy: int = 0, ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of @@ -1176,7 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() - agg_strat: int = ( + agg_strat: int = int( self.schema.name_to_metric_group[metric_group].aggregation_strategy if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None else 1 From b2ec71f5611d6fb9577fd0b09a6c5fd04f4b2db7 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 22:07:25 -0700 Subject: [PATCH 13/25] try --- src/helm/benchmark/presentation/summarize.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 763fcdb7f6..e1b18a4ebd 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -260,7 +260,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: means_per_row: List[Optional[float]] = [] for row in table.rows: - total = 0 + total: float = 0.0 count = 0 for cell in row: try: @@ -907,7 +907,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - aggregation_strategy: int = 0, + selected_agg_strat: int = 0, ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of @@ -1091,10 +1091,10 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) add_mean_col = ( - aggregation_strategy >= 2 + selected_agg_strat >= 2 ) # values 2 or 3 indicate we should include mean (see AggregationStrategy enum) add_mwr = ( - aggregation_strategy % 2 != 0 or add_win_rate + selected_agg_strat % 2 != 0 or add_win_rate ) # values 1 or 3 say to include mwr (see AggregationStrategy enum) if add_mwr: @@ -1176,7 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() - agg_strat: int = int( + agg_strat: int = ( self.schema.name_to_metric_group[metric_group].aggregation_strategy if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None else 1 @@ -1188,7 +1188,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, - aggregation_strategy=agg_strat, + selected_agg_strat=int(agg_strat), ) tables.append(table) return tables From b765ef513daca25b4d8c3a8c7d8d5ddcd4bc2c84 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 22:17:43 -0700 Subject: [PATCH 14/25] fix --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index e1b18a4ebd..9b4c5b5b4f 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1178,7 +1178,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() agg_strat: int = ( self.schema.name_to_metric_group[metric_group].aggregation_strategy - if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None + if self.schema.name_to_metric_group[metric_group].aggregation_strategy is not None else 1 ) table = self.create_group_table( From 506dabae2519cc42065667fb1a5c8dc0bf20f3aa Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 22:18:30 -0700 Subject: [PATCH 15/25] another --- src/helm/benchmark/presentation/summarize.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 9b4c5b5b4f..9333b144be 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1176,11 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() - agg_strat: int = ( - self.schema.name_to_metric_group[metric_group].aggregation_strategy - if self.schema.name_to_metric_group[metric_group].aggregation_strategy is not None - else 1 - ) + agg_strat = self.schema.name_to_metric_group[metric_group].aggregation_strategy or 1 table = self.create_group_table( name=metric_group, title=display_name, From 1947b61d78db0d03f4be743a9c6885433be8e995 Mon Sep 17 00:00:00 2001 From: farzaank Date: Mon, 23 Sep 2024 22:26:55 -0700 Subject: [PATCH 16/25] finish precommit errs --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 9333b144be..da96cf054f 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -18,7 +18,7 @@ from collections import defaultdict from dataclasses import dataclass, replace from statistics import mean, median -from typing import List, Optional, Dict, Any, Tuple, Set, Union +from typing import List, Optional, Dict, Any, Tuple, Set from tqdm import tqdm from helm.benchmark.model_deployment_registry import get_model_deployment From a19ef7bc9dbe110ca03158fea3a1988e09a8083f Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 11:25:26 -0700 Subject: [PATCH 17/25] fixes --- src/helm/benchmark/presentation/schema.py | 10 +-- src/helm/benchmark/presentation/summarize.py | 95 +++++++++++--------- src/helm/benchmark/static/schema_safety.yaml | 4 +- 3 files changed, 59 insertions(+), 50 deletions(-) diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index db9aff6e81..93273f5041 100644 --- a/src/helm/benchmark/presentation/schema.py +++ b/src/helm/benchmark/presentation/schema.py @@ -109,14 +109,6 @@ def substitute(self, environment: Dict[str, str]) -> "MetricNameMatcher": ) -@dataclass(frozen=True) -class AggregationStrategy(IntEnum): - USE_NONE = 0 - USE_MWR = 1 - USE_MEAN = 2 - USE_BOTH = 3 - - @dataclass(frozen=True) class MetricGroup(Field): """ @@ -128,7 +120,7 @@ class MetricGroup(Field): hide_win_rates: Optional[bool] = None """If set to true, do not compute win rates.""" - aggregation_strategy: Optional[Union[AggregationStrategy, int]] = 1 + aggregation_strategies: Optional[List[str]] = field(default_factory=list) BY_METRIC = "by_metric" diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index da96cf054f..0130f81bcf 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -259,6 +259,15 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: """ means_per_row: List[Optional[float]] = [] + + # check for all header cells where specified, that lower_is_better is consistent + orderings = [] + for elem in table.header: + if elem.lower_is_better is not None: + orderings.append(elem.lower_is_better) + if not (all(orderings) or not any(orderings)): + raise Exception("Cannot mean columns with different values for lower_is_better") + for row in table.rows: total: float = 0.0 count = 0 @@ -278,6 +287,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: AGGREGATE_WIN_RATE_COLUMN = 1 +AGGREGATION_STRATEGIES = ["mean", "win_rate"] class Summarizer: @@ -907,7 +917,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - selected_agg_strat: int = 0, + selected_agg_strats: List[str] = [], ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of @@ -1090,44 +1100,49 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) - add_mean_col = ( - selected_agg_strat >= 2 - ) # values 2 or 3 indicate we should include mean (see AggregationStrategy enum) - add_mwr = ( - selected_agg_strat % 2 != 0 or add_win_rate - ) # values 1 or 3 say to include mwr (see AggregationStrategy enum) - - if add_mwr: - # add overall win rate as the second column - WIN_RATE_AGGREGATION = "mean" - win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) - description = "How many models this model outperform on average (over columns)." - table.header.insert( - AGGREGATE_WIN_RATE_COLUMN, - HeaderCell( - f"{WIN_RATE_AGGREGATION.capitalize()} win rate", - description=description, - lower_is_better=False, - ), - ) - for row, win_rate in zip(table.rows, win_rates): - row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate)) - if add_mean_col: - means = compute_aggregate_row_means(table) - description = "An average over columns representing the mean performance" - insertion_column = AGGREGATE_WIN_RATE_COLUMN - if add_mwr: + selected_agg_strats = selected_agg_strats if selected_agg_strats is not None else ["win_rate"] + + # this preserves backwards compatibility for self.schema.name_to_metric_group[metric_group].hide_win_rates + # hide_win_rate is the inverse of add_win_rate here (see the function call for create_group_table) + hide_aggregation = not add_win_rate + if hide_aggregation: + selected_agg_strats = [] + insertion_column = AGGREGATE_WIN_RATE_COLUMN + for strategy in selected_agg_strats: + if strategy == "win_rate": + # add overall win rate as the second column + WIN_RATE_AGGREGATION = "mean" + win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) + description = "How many models this model outperform on average (over columns)." + table.header.insert( + insertion_column, + HeaderCell( + f"{WIN_RATE_AGGREGATION.capitalize()} win rate", + description=description, + lower_is_better=False, + ), + ) + for row, win_rate in zip(table.rows, win_rates): + row.insert(insertion_column, Cell(win_rate)) insertion_column += 1 - table.header.insert( - insertion_column, - HeaderCell( - "Mean Performance", - description=description, - lower_is_better=False, - ), - ) - for row, row_mean in zip(table.rows, means): - row.insert(insertion_column, Cell(row_mean)) + elif strategy == "mean": + means = compute_aggregate_row_means(table) + description = "An average over columns representing the mean performance" + table.header.insert( + insertion_column, + HeaderCell( + "Mean Performance", + description=description, + lower_is_better=False, + ), + ) + for row, row_mean in zip(table.rows, means): + row.insert(insertion_column, Cell(row_mean)) + insertion_column += 1 + else: + raise Exception( + f"Improper aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}" + ) if bold_columns: for i, header_cell in enumerate(table.header): @@ -1176,7 +1191,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() - agg_strat = self.schema.name_to_metric_group[metric_group].aggregation_strategy or 1 + agg_strats: List[str] = self.schema.name_to_metric_group[metric_group].aggregation_strategies or [] table = self.create_group_table( name=metric_group, title=display_name, @@ -1184,7 +1199,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, - selected_agg_strat=int(agg_strat), + selected_agg_strats=agg_strats, ) tables.append(table) return tables diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml index 49026a674c..553826ff82 100644 --- a/src/helm/benchmark/static/schema_safety.yaml +++ b/src/helm/benchmark/static/schema_safety.yaml @@ -106,7 +106,9 @@ perturbations: [] metric_groups: - name: accuracy display_name: Accuracy - aggregation_strategy: 3 + aggregation_strategies: + - win_rate + - mean metrics: - name: ${main_name} split: ${main_split} From 8a427dcd02fbbaad8562249e0280e5fc632fdc2b Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 11:26:00 -0700 Subject: [PATCH 18/25] remove unused import --- src/helm/benchmark/presentation/schema.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index 93273f5041..fed18c14dd 100644 --- a/src/helm/benchmark/presentation/schema.py +++ b/src/helm/benchmark/presentation/schema.py @@ -1,8 +1,7 @@ import ast import dataclasses from dataclasses import dataclass, field -from enum import IntEnum -from typing import List, Optional, Dict, Union +from typing import List, Optional, Dict import dacite from inspect import cleandoc import mako.template From 063b0f78cab48e6d718b2fca1dca89acc5e43d74 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 11:29:44 -0700 Subject: [PATCH 19/25] comment --- src/helm/benchmark/presentation/schema.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index fed18c14dd..0035f45a8f 100644 --- a/src/helm/benchmark/presentation/schema.py +++ b/src/helm/benchmark/presentation/schema.py @@ -120,6 +120,7 @@ class MetricGroup(Field): """If set to true, do not compute win rates.""" aggregation_strategies: Optional[List[str]] = field(default_factory=list) + """List with values in {'win_rate','mean'} that correspond to aggregations""" BY_METRIC = "by_metric" From b70fded56756bd472deb77b6612c5956cc332ad2 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 11:36:09 -0700 Subject: [PATCH 20/25] s --- src/helm/benchmark/presentation/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index 0035f45a8f..f10fe2ecee 100644 --- a/src/helm/benchmark/presentation/schema.py +++ b/src/helm/benchmark/presentation/schema.py @@ -119,7 +119,7 @@ class MetricGroup(Field): hide_win_rates: Optional[bool] = None """If set to true, do not compute win rates.""" - aggregation_strategies: Optional[List[str]] = field(default_factory=list) + aggregation_strategies: Optional[List[str]] = None """List with values in {'win_rate','mean'} that correspond to aggregations""" From aa3627795b02c7eb8e69c0b05561fef8b20719cb Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 14:03:23 -0700 Subject: [PATCH 21/25] fix most coments --- src/helm/benchmark/presentation/summarize.py | 39 ++++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 0130f81bcf..31815e4a5e 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -258,32 +258,28 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]: non-null values of the row are in columns we skip). """ - means_per_row: List[Optional[float]] = [] + row_means: List[Optional[float]] = [] # check for all header cells where specified, that lower_is_better is consistent orderings = [] for elem in table.header: - if elem.lower_is_better is not None: - orderings.append(elem.lower_is_better) - if not (all(orderings) or not any(orderings)): + orderings.append(elem.lower_is_better) + if len(set(orderings)) != 1: raise Exception("Cannot mean columns with different values for lower_is_better") for row in table.rows: - total: float = 0.0 + total = 0.0 count = 0 for cell in row: - try: - if cell.value: - total += float(cell.value) - count += 1 - except Exception: - print("failed") + if cell.value is not None: + total += float(cell.value) + count += 1 if count == 0: - means_per_row.append(None) + row_means.append(None) else: - means_per_row.append(total / count) + row_means.append(total / count) - return means_per_row + return row_means AGGREGATE_WIN_RATE_COLUMN = 1 @@ -917,7 +913,7 @@ def create_group_table( sub_split: Optional[str] = None, bold_columns: bool = True, add_win_rate: bool = False, - selected_agg_strats: List[str] = [], + aggregation_strategies: List[str] = [], ) -> Table: """ Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of @@ -1100,15 +1096,16 @@ def _adapter_spec_sort_key(spec): table = Table(title=title, header=header, rows=rows, links=links, name=name) - selected_agg_strats = selected_agg_strats if selected_agg_strats is not None else ["win_rate"] + if aggregation_strategies is None: + aggregation_strategies = ["win_rate"] # this preserves backwards compatibility for self.schema.name_to_metric_group[metric_group].hide_win_rates # hide_win_rate is the inverse of add_win_rate here (see the function call for create_group_table) hide_aggregation = not add_win_rate if hide_aggregation: - selected_agg_strats = [] + aggregation_strategies = [] insertion_column = AGGREGATE_WIN_RATE_COLUMN - for strategy in selected_agg_strats: + for strategy in aggregation_strategies: if strategy == "win_rate": # add overall win rate as the second column WIN_RATE_AGGREGATION = "mean" @@ -1191,7 +1188,9 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: if len(adapter_to_runs) > 0: for metric_group in all_metric_groups: display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name() - agg_strats: List[str] = self.schema.name_to_metric_group[metric_group].aggregation_strategies or [] + aggregate_strategies: List[str] = ( + self.schema.name_to_metric_group[metric_group].aggregation_strategies or [] + ) table = self.create_group_table( name=metric_group, title=display_name, @@ -1199,7 +1198,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]: columns=[(subgroup, metric_group) for subgroup in subgroups], is_scenario_table=False, add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates, - selected_agg_strats=agg_strats, + aggregation_strategies=aggregate_strategies, ) tables.append(table) return tables From 853a277efb6832f00853dbe668494fa8c59771a1 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 14:06:14 -0700 Subject: [PATCH 22/25] small fix --- src/helm/benchmark/presentation/summarize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 31815e4a5e..c3555c355f 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1130,7 +1130,7 @@ def _adapter_spec_sort_key(spec): HeaderCell( "Mean Performance", description=description, - lower_is_better=False, + lower_is_better=table.header[0].lower_is_better, ), ) for row, row_mean in zip(table.rows, means): @@ -1138,7 +1138,7 @@ def _adapter_spec_sort_key(spec): insertion_column += 1 else: raise Exception( - f"Improper aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}" + f"Unknown aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}" ) if bold_columns: From 5849e6ef5c811fe12e7be1cf0d1ad161b91ca204 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 14:41:35 -0700 Subject: [PATCH 23/25] bigger fix --- src/helm/benchmark/presentation/summarize.py | 35 +++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index c3555c355f..9c2965c872 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1104,43 +1104,46 @@ def _adapter_spec_sort_key(spec): hide_aggregation = not add_win_rate if hide_aggregation: aggregation_strategies = [] - insertion_column = AGGREGATE_WIN_RATE_COLUMN + + aggregate_header_cells: List[HeaderCell] = [] + aggregate_row_values: List[List[float]] = [] + for strategy in aggregation_strategies: if strategy == "win_rate": - # add overall win rate as the second column WIN_RATE_AGGREGATION = "mean" win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) description = "How many models this model outperform on average (over columns)." - table.header.insert( - insertion_column, + aggregate_header_cells.append( HeaderCell( f"{WIN_RATE_AGGREGATION.capitalize()} win rate", description=description, lower_is_better=False, - ), + ) ) - for row, win_rate in zip(table.rows, win_rates): - row.insert(insertion_column, Cell(win_rate)) - insertion_column += 1 + aggregate_row_values.append(win_rates) elif strategy == "mean": means = compute_aggregate_row_means(table) - description = "An average over columns representing the mean performance" - table.header.insert( - insertion_column, + description = "An average over columns representing the mean performance." + aggregate_header_cells.append( HeaderCell( - "Mean Performance", + "Mean performance", description=description, lower_is_better=table.header[0].lower_is_better, - ), + ) ) - for row, row_mean in zip(table.rows, means): - row.insert(insertion_column, Cell(row_mean)) - insertion_column += 1 + aggregate_row_values.append(means) else: raise Exception( f"Unknown aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}" ) + for i in range(len(aggregate_header_cells)): + aggregate_header_cell = aggregate_header_cells[i] + aggregate_rows = aggregate_row_values[i] + table.header.insert(i + 1, aggregate_header_cell) + for row, row_val in zip(table.rows, aggregate_rows): + row.insert(i + 1, Cell(row_val)) + if bold_columns: for i, header_cell in enumerate(table.header): lower_is_better = header_cell.lower_is_better From affc89a2c2cd4a03e0f5032897cfa0fe5cc10c15 Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 14:50:35 -0700 Subject: [PATCH 24/25] err --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 9c2965c872..62a6814aac 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1106,7 +1106,7 @@ def _adapter_spec_sort_key(spec): aggregation_strategies = [] aggregate_header_cells: List[HeaderCell] = [] - aggregate_row_values: List[List[float]] = [] + aggregate_row_values: List[List[Optional[float]]] = [] for strategy in aggregation_strategies: if strategy == "win_rate": From 598245a54b04e0251fc4f0b404df7cf2a197b81d Mon Sep 17 00:00:00 2001 From: farzaank Date: Tue, 24 Sep 2024 17:13:11 -0700 Subject: [PATCH 25/25] fixed --- src/helm/benchmark/presentation/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 62a6814aac..0a0d3c50f6 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -1112,7 +1112,7 @@ def _adapter_spec_sort_key(spec): if strategy == "win_rate": WIN_RATE_AGGREGATION = "mean" win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) - description = "How many models this model outperform on average (over columns)." + description = "How many models this model outperforms on average (over columns)." aggregate_header_cells.append( HeaderCell( f"{WIN_RATE_AGGREGATION.capitalize()} win rate",