From 093c4d58d10a16ddb83f8fecc4a57a9913cc2f28 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 17 Sep 2024 10:28:00 -0700
Subject: [PATCH 01/25] Draft: add optional mean-performance column to group tables

---
 src/helm/benchmark/presentation/schema.py    |  2 +
 src/helm/benchmark/presentation/summarize.py | 40 +++++++++++++++++++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index f71816b6b4..ef90d1a5d3 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -119,6 +119,8 @@ class MetricGroup(Field):
     hide_win_rates: Optional[bool] = None
     """If set to true, do not compute win rates."""
 
+    add_mean_col: Optional[bool] = None
+
 
 BY_METRIC = "by_metric"
 BY_GROUP = "by_group"
diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 82828ae5ba..ca57f094e2 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -251,6 +251,29 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates
 
 
+def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
+    """
+    Computes the aggregate mean of each row across columns.
+    Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
+    non-null values of the row are in columns we skip).
+    """
+
+    means_per_row: List[List[float]] = [[] for _ in table.rows]
+    for row in table.rows:
+        total = 0
+        count = 0
+        for cell in enumerate(row):
+            if cell.value:
+                total += cell.value
+                count += 1
+        if count == 0:
+            means_per_row.append(None)
+        else:
+            means_per_row.append(total / count)
+
+    return means_per_row
+
+
 AGGREGATE_WIN_RATE_COLUMN = 1
 
 
@@ -881,6 +904,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
+        add_mean_col: bool = False,
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1063,7 +1087,20 @@ def _adapter_spec_sort_key(spec):
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-        if add_win_rate:
+        if add_mean_col:
+            means = compute_aggregate_row_means(table, aggregation=WIN_RATE_AGGREGATION)
+            description = "An average over columns representing the mean performance"
+            table.header.insert(
+                AGGREGATE_WIN_RATE_COLUMN,
+                HeaderCell(
+                    f"Mean Performance",
+                    description=description,
+                    lower_is_better=False,
+                ),
+            )
+            for row, win_rate in zip(table.rows, win_rates):
+                row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
+        elif add_win_rate:
             # add overall win rate as the second column
             WIN_RATE_AGGREGATION = "mean"
             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
@@ -1133,6 +1170,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
+                    add_mean_col=self.schema.name_to_metric_group[metric_group].add_mean_col,
                 )
                 tables.append(table)
         return tables

From 2dac8aab1c37d6438bea9d7e1ce6ad9341ac3002 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 17 Sep 2024 10:31:51 -0700
Subject: [PATCH 02/25] Fix flake errors: use computed row means, drop placeholder-less f-string

---
 src/helm/benchmark/presentation/summarize.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index ca57f094e2..fcc95d5da4 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1088,18 +1088,18 @@ def _adapter_spec_sort_key(spec):
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
         if add_mean_col:
-            means = compute_aggregate_row_means(table, aggregation=WIN_RATE_AGGREGATION)
+            means = compute_aggregate_row_means(table)
             description = "An average over columns representing the mean performance"
             table.header.insert(
                 AGGREGATE_WIN_RATE_COLUMN,
                 HeaderCell(
-                    f"Mean Performance",
+                    "Mean Performance",
                     description=description,
                     lower_is_better=False,
                 ),
             )
-            for row, win_rate in zip(table.rows, win_rates):
-                row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
+            for row, row_mean in zip(table.rows, means):
+                row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(row_mean))
         elif add_win_rate:
             # add overall win rate as the second column
             WIN_RATE_AGGREGATION = "mean"

From d737f81ab04ede259abc903aea759d054ad55a5e Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 17 Sep 2024 19:20:59 -0700
Subject: [PATCH 03/25] Initialize means_per_row as an empty List[Optional[float]]

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index fcc95d5da4..f30d42bdb9 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -258,7 +258,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     non-null values of the row are in columns we skip).
     """
 
-    means_per_row: List[List[float]] = [[] for _ in table.rows]
+    means_per_row: List[Optional[float]] = []
     for row in table.rows:
         total = 0
         count = 0

From c932aae3b9ee818da50d5994ccb93a066cf08e8f Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 17 Sep 2024 19:23:14 -0700
Subject: [PATCH 04/25] Cast add_mean_col to bool when calling create_group_table

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index f30d42bdb9..1d5eac81bc 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1170,7 +1170,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
-                    add_mean_col=self.schema.name_to_metric_group[metric_group].add_mean_col,
+                    add_mean_col=bool(self.schema.name_to_metric_group[metric_group].add_mean_col),
                 )
                 tables.append(table)
         return tables

From 3ffd3d5189129e7937b94d8ba9894ac0bcde78b2 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 17 Sep 2024 19:34:36 -0700
Subject: [PATCH 05/25] Iterate over row cells directly instead of enumerate() tuples

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 1d5eac81bc..1760391ac9 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -262,7 +262,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     for row in table.rows:
         total = 0
         count = 0
-        for cell in enumerate(row):
+        for cell in row:
             if cell.value:
                 total += cell.value
                 count += 1

From 565bcf45ff1216ebc6332abcf74b52b1abd7004c Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 20:52:46 -0700
Subject: [PATCH 06/25] working aggregation

---
 src/helm/benchmark/presentation/schema.py    | 13 +++++-
 src/helm/benchmark/presentation/summarize.py | 47 ++++++++++++--------
 src/helm/benchmark/static/schema_safety.yaml |  1 +
 3 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index ef90d1a5d3..db9aff6e81 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -1,7 +1,8 @@
 import ast
 import dataclasses
 from dataclasses import dataclass, field
-from typing import List, Optional, Dict
+from enum import IntEnum
+from typing import List, Optional, Dict, Union
 import dacite
 from inspect import cleandoc
 import mako.template
@@ -108,6 +109,14 @@ def substitute(self, environment: Dict[str, str]) -> "MetricNameMatcher":
         )
 
 
+@dataclass(frozen=True)
+class AggregationStrategy(IntEnum):
+    USE_NONE = 0
+    USE_MWR = 1
+    USE_MEAN = 2
+    USE_BOTH = 3
+
+
 @dataclass(frozen=True)
 class MetricGroup(Field):
     """
@@ -119,7 +128,7 @@ class MetricGroup(Field):
     hide_win_rates: Optional[bool] = None
     """If set to true, do not compute win rates."""
 
-    add_mean_col: Optional[bool] = None
+    aggregation_strategy: Optional[Union[AggregationStrategy, int]] = 1
 
 
 BY_METRIC = "by_metric"
diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 1760391ac9..5528a23487 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -263,9 +263,12 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
         total = 0
         count = 0
         for cell in row:
-            if cell.value:
-                total += cell.value
-                count += 1
+            try:
+                if cell.value:
+                    total += float(cell.value)
+                    count += 1
+            except Exception:
+                print("failed")
         if count == 0:
             means_per_row.append(None)
         else:
@@ -904,7 +907,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        add_mean_col: bool = False,
+        aggregation_strategy: int = 0,
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1087,20 +1090,10 @@ def _adapter_spec_sort_key(spec):
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-        if add_mean_col:
-            means = compute_aggregate_row_means(table)
-            description = "An average over columns representing the mean performance"
-            table.header.insert(
-                AGGREGATE_WIN_RATE_COLUMN,
-                HeaderCell(
-                    "Mean Performance",
-                    description=description,
-                    lower_is_better=False,
-                ),
-            )
-            for row, row_mean in zip(table.rows, means):
-                row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(row_mean))
-        elif add_win_rate:
+        add_mean_col = aggregation_strategy >= 2
+        add_mwr = aggregation_strategy % 2 != 0 or add_win_rate  # values 1 or 3 say to include mwr
+
+        if add_mwr:
             # add overall win rate as the second column
             WIN_RATE_AGGREGATION = "mean"
             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
@@ -1115,6 +1108,22 @@ def _adapter_spec_sort_key(spec):
             )
             for row, win_rate in zip(table.rows, win_rates):
                 row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
+        if add_mean_col:
+            means = compute_aggregate_row_means(table)
+            description = "An average over columns representing the mean performance"
+            insertion_column = AGGREGATE_WIN_RATE_COLUMN
+            if add_mwr:
+                insertion_column += 1
+            table.header.insert(
+                insertion_column,
+                HeaderCell(
+                    "Mean Performance",
+                    description=description,
+                    lower_is_better=False,
+                ),
+            )
+            for row, row_mean in zip(table.rows, means):
+                row.insert(insertion_column, Cell(row_mean))
 
         if bold_columns:
             for i, header_cell in enumerate(table.header):
@@ -1170,7 +1179,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
-                    add_mean_col=bool(self.schema.name_to_metric_group[metric_group].add_mean_col),
+                    aggregation_strategy=self.schema.name_to_metric_group[metric_group].aggregation_strategy,
                 )
                 tables.append(table)
         return tables
diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml
index 32239777fc..49026a674c 100644
--- a/src/helm/benchmark/static/schema_safety.yaml
+++ b/src/helm/benchmark/static/schema_safety.yaml
@@ -106,6 +106,7 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
+    aggregation_strategy: 3
     metrics:
       - name: ${main_name}
         split: ${main_split}

From 2b88e6a5425b79ed66f0d1ca920c6e7364251fa0 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 20:56:17 -0700
Subject: [PATCH 07/25] Reformat aggregation flag assignments and expand their comments

---
 src/helm/benchmark/presentation/summarize.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 5528a23487..a93fcdc115 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1090,8 +1090,12 @@ def _adapter_spec_sort_key(spec):
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-        add_mean_col = aggregation_strategy >= 2
-        add_mwr = aggregation_strategy % 2 != 0 or add_win_rate  # values 1 or 3 say to include mwr
+        add_mean_col = (
+            aggregation_strategy >= 2
+        )  # values 2 or 3 indicate we should include mean (see AggregationStrategy enum)
+        add_mwr = (
+            aggregation_strategy % 2 != 0 or add_win_rate
+        )  # values 1 or 3 say to include mwr (see AggregationStrategy enum)
 
         if add_mwr:
             # add overall win rate as the second column

From f8fc636823fcb912aa509e30e99385c45da2efa3 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 21:04:03 -0700
Subject: [PATCH 08/25] Fall back to aggregation strategy 1 when the metric group's value is None

---
 src/helm/benchmark/presentation/summarize.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index a93fcdc115..764307a246 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1176,6 +1176,11 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
+                agg_strat = (
+                    self.schema.name_to_metric_group[metric_group].aggregation_strategy
+                    if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None
+                    else 1
+                )
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,
@@ -1183,7 +1188,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
-                    aggregation_strategy=self.schema.name_to_metric_group[metric_group].aggregation_strategy,
+                    aggregation_strategy=agg_strat,
                 )
                 tables.append(table)
         return tables

From 2e7545cee538d22af619b6d52dafe44eeb307431 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 21:20:59 -0700
Subject: [PATCH 09/25] Annotate agg_strat as int

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 764307a246..b9210747f1 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1176,7 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
-                agg_strat = (
+                agg_strat: int = (
                     self.schema.name_to_metric_group[metric_group].aggregation_strategy
                     if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None
                     else 1

From c6328bcfcb805f921524b2c0498f36fa039b10fa Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 21:33:28 -0700
Subject: [PATCH 10/25] Type aggregation_strategy parameter as Union[int, None]

---
 src/helm/benchmark/presentation/summarize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index b9210747f1..cf4b6d558d 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -18,7 +18,7 @@
 from collections import defaultdict
 from dataclasses import dataclass, replace
 from statistics import mean, median
-from typing import List, Optional, Dict, Any, Tuple, Set
+from typing import List, Optional, Dict, Any, Tuple, Set, Union
 
 from tqdm import tqdm
 from helm.benchmark.model_deployment_registry import get_model_deployment
@@ -907,7 +907,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        aggregation_strategy: int = 0,
+        aggregation_strategy: Union[int,None] = 0,
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of

From e0af08de72abf2368bc773e868a2bb3776e91c54 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 21:39:37 -0700
Subject: [PATCH 11/25] fmt

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index cf4b6d558d..0a1dcc86ed 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -907,7 +907,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        aggregation_strategy: Union[int,None] = 0,
+        aggregation_strategy: Union[int, None] = 0,
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of

From f3ea2a5a65fc88e31c9885d72e10c9d04d5dc06d Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 21:54:22 -0700
Subject: [PATCH 12/25] Revert aggregation_strategy parameter to int; wrap agg_strat in int()

---
 src/helm/benchmark/presentation/summarize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 0a1dcc86ed..763fcdb7f6 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -907,7 +907,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        aggregation_strategy: Union[int, None] = 0,
+        aggregation_strategy: int = 0,
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1176,7 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
-                agg_strat: int = (
+                agg_strat: int = int(
                     self.schema.name_to_metric_group[metric_group].aggregation_strategy
                     if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None
                     else 1

From b2ec71f5611d6fb9577fd0b09a6c5fd04f4b2db7 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 22:07:25 -0700
Subject: [PATCH 13/25] Rename parameter to selected_agg_strat, cast at call site, type total as float

---
 src/helm/benchmark/presentation/summarize.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 763fcdb7f6..e1b18a4ebd 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -260,7 +260,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
 
     means_per_row: List[Optional[float]] = []
     for row in table.rows:
-        total = 0
+        total: float = 0.0
         count = 0
         for cell in row:
             try:
@@ -907,7 +907,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        aggregation_strategy: int = 0,
+        selected_agg_strat: int = 0,
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1091,10 +1091,10 @@ def _adapter_spec_sort_key(spec):
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
         add_mean_col = (
-            aggregation_strategy >= 2
+            selected_agg_strat >= 2
         )  # values 2 or 3 indicate we should include mean (see AggregationStrategy enum)
         add_mwr = (
-            aggregation_strategy % 2 != 0 or add_win_rate
+            selected_agg_strat % 2 != 0 or add_win_rate
         )  # values 1 or 3 say to include mwr (see AggregationStrategy enum)
 
         if add_mwr:
@@ -1176,7 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
-                agg_strat: int = int(
+                agg_strat: int = (
                     self.schema.name_to_metric_group[metric_group].aggregation_strategy
                     if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None
                     else 1
@@ -1188,7 +1188,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
-                    aggregation_strategy=agg_strat,
+                    selected_agg_strat=int(agg_strat),
                 )
                 tables.append(table)
         return tables

From b765ef513daca25b4d8c3a8c7d8d5ddcd4bc2c84 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 22:17:43 -0700
Subject: [PATCH 14/25] Use "is not None" for the aggregation_strategy check

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index e1b18a4ebd..9b4c5b5b4f 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1178,7 +1178,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
                 agg_strat: int = (
                     self.schema.name_to_metric_group[metric_group].aggregation_strategy
-                    if self.schema.name_to_metric_group[metric_group].aggregation_strategy != None
+                    if self.schema.name_to_metric_group[metric_group].aggregation_strategy is not None
                     else 1
                 )
                 table = self.create_group_table(

From 506dabae2519cc42065667fb1a5c8dc0bf20f3aa Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 22:18:30 -0700
Subject: [PATCH 15/25] Simplify agg_strat default to "aggregation_strategy or 1"

---
 src/helm/benchmark/presentation/summarize.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 9b4c5b5b4f..9333b144be 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1176,11 +1176,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
-                agg_strat: int = (
-                    self.schema.name_to_metric_group[metric_group].aggregation_strategy
-                    if self.schema.name_to_metric_group[metric_group].aggregation_strategy is not None
-                    else 1
-                )
+                agg_strat = self.schema.name_to_metric_group[metric_group].aggregation_strategy or 1
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,

From 1947b61d78db0d03f4be743a9c6885433be8e995 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Mon, 23 Sep 2024 22:26:55 -0700
Subject: [PATCH 16/25] finish precommit errs

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 9333b144be..da96cf054f 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -18,7 +18,7 @@
 from collections import defaultdict
 from dataclasses import dataclass, replace
 from statistics import mean, median
-from typing import List, Optional, Dict, Any, Tuple, Set, Union
+from typing import List, Optional, Dict, Any, Tuple, Set
 
 from tqdm import tqdm
 from helm.benchmark.model_deployment_registry import get_model_deployment

From a19ef7bc9dbe110ca03158fea3a1988e09a8083f Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 11:25:26 -0700
Subject: [PATCH 17/25] Replace integer aggregation_strategy with a list of strategy names

---
 src/helm/benchmark/presentation/schema.py    | 10 +--
 src/helm/benchmark/presentation/summarize.py | 95 +++++++++++---------
 src/helm/benchmark/static/schema_safety.yaml |  4 +-
 3 files changed, 59 insertions(+), 50 deletions(-)

diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index db9aff6e81..93273f5041 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -109,14 +109,6 @@ def substitute(self, environment: Dict[str, str]) -> "MetricNameMatcher":
         )
 
 
-@dataclass(frozen=True)
-class AggregationStrategy(IntEnum):
-    USE_NONE = 0
-    USE_MWR = 1
-    USE_MEAN = 2
-    USE_BOTH = 3
-
-
 @dataclass(frozen=True)
 class MetricGroup(Field):
     """
@@ -128,7 +120,7 @@ class MetricGroup(Field):
     hide_win_rates: Optional[bool] = None
     """If set to true, do not compute win rates."""
 
-    aggregation_strategy: Optional[Union[AggregationStrategy, int]] = 1
+    aggregation_strategies: Optional[List[str]] = field(default_factory=list)
 
 
 BY_METRIC = "by_metric"
diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index da96cf054f..0130f81bcf 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -259,6 +259,15 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """
 
     means_per_row: List[Optional[float]] = []
+
+    # check for all header cells where specified, that lower_is_better is consistent
+    orderings = []
+    for elem in table.header:
+        if elem.lower_is_better is not None:
+            orderings.append(elem.lower_is_better)
+    if not (all(orderings) or not any(orderings)):
+        raise Exception("Cannot mean columns with different values for lower_is_better")
+
     for row in table.rows:
         total: float = 0.0
         count = 0
@@ -278,6 +287,7 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
 
 
 AGGREGATE_WIN_RATE_COLUMN = 1
+AGGREGATION_STRATEGIES = ["mean", "win_rate"]
 
 
 class Summarizer:
@@ -907,7 +917,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        selected_agg_strat: int = 0,
+        selected_agg_strats: List[str] = [],
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1090,44 +1100,49 @@ def _adapter_spec_sort_key(spec):
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-        add_mean_col = (
-            selected_agg_strat >= 2
-        )  # values 2 or 3 indicate we should include mean (see AggregationStrategy enum)
-        add_mwr = (
-            selected_agg_strat % 2 != 0 or add_win_rate
-        )  # values 1 or 3 say to include mwr (see AggregationStrategy enum)
-
-        if add_mwr:
-            # add overall win rate as the second column
-            WIN_RATE_AGGREGATION = "mean"
-            win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-            description = "How many models this model outperform on average (over columns)."
-            table.header.insert(
-                AGGREGATE_WIN_RATE_COLUMN,
-                HeaderCell(
-                    f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                    description=description,
-                    lower_is_better=False,
-                ),
-            )
-            for row, win_rate in zip(table.rows, win_rates):
-                row.insert(AGGREGATE_WIN_RATE_COLUMN, Cell(win_rate))
-        if add_mean_col:
-            means = compute_aggregate_row_means(table)
-            description = "An average over columns representing the mean performance"
-            insertion_column = AGGREGATE_WIN_RATE_COLUMN
-            if add_mwr:
+        selected_agg_strats = selected_agg_strats if selected_agg_strats is not None else ["win_rate"]
+
+        # this preserves backwards compatibility for self.schema.name_to_metric_group[metric_group].hide_win_rates
+        # hide_win_rate is the inverse of add_win_rate here (see the function call for create_group_table)
+        hide_aggregation = not add_win_rate
+        if hide_aggregation:
+            selected_agg_strats = []
+        insertion_column = AGGREGATE_WIN_RATE_COLUMN
+        for strategy in selected_agg_strats:
+            if strategy == "win_rate":
+                # add overall win rate as the second column
+                WIN_RATE_AGGREGATION = "mean"
+                win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
+                description = "How many models this model outperform on average (over columns)."
+                table.header.insert(
+                    insertion_column,
+                    HeaderCell(
+                        f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
+                        description=description,
+                        lower_is_better=False,
+                    ),
+                )
+                for row, win_rate in zip(table.rows, win_rates):
+                    row.insert(insertion_column, Cell(win_rate))
                 insertion_column += 1
-            table.header.insert(
-                insertion_column,
-                HeaderCell(
-                    "Mean Performance",
-                    description=description,
-                    lower_is_better=False,
-                ),
-            )
-            for row, row_mean in zip(table.rows, means):
-                row.insert(insertion_column, Cell(row_mean))
+            elif strategy == "mean":
+                means = compute_aggregate_row_means(table)
+                description = "An average over columns representing the mean performance"
+                table.header.insert(
+                    insertion_column,
+                    HeaderCell(
+                        "Mean Performance",
+                        description=description,
+                        lower_is_better=False,
+                    ),
+                )
+                for row, row_mean in zip(table.rows, means):
+                    row.insert(insertion_column, Cell(row_mean))
+                insertion_column += 1
+            else:
+                raise Exception(
+                    f"Improper aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}"
+                )
 
         if bold_columns:
             for i, header_cell in enumerate(table.header):
@@ -1176,7 +1191,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
-                agg_strat = self.schema.name_to_metric_group[metric_group].aggregation_strategy or 1
+                agg_strats: List[str] = self.schema.name_to_metric_group[metric_group].aggregation_strategies or []
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,
@@ -1184,7 +1199,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
-                    selected_agg_strat=int(agg_strat),
+                    selected_agg_strats=agg_strats,
                 )
                 tables.append(table)
         return tables
diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml
index 49026a674c..553826ff82 100644
--- a/src/helm/benchmark/static/schema_safety.yaml
+++ b/src/helm/benchmark/static/schema_safety.yaml
@@ -106,7 +106,9 @@ perturbations: []
 metric_groups:
   - name: accuracy
     display_name: Accuracy
-    aggregation_strategy: 3
+    aggregation_strategies: 
+      - win_rate
+      - mean
     metrics:
       - name: ${main_name}
         split: ${main_split}

From 8a427dcd02fbbaad8562249e0280e5fc632fdc2b Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 11:26:00 -0700
Subject: [PATCH 18/25] remove unused import

---
 src/helm/benchmark/presentation/schema.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index 93273f5041..fed18c14dd 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -1,8 +1,7 @@
 import ast
 import dataclasses
 from dataclasses import dataclass, field
-from enum import IntEnum
-from typing import List, Optional, Dict, Union
+from typing import List, Optional, Dict
 import dacite
 from inspect import cleandoc
 import mako.template

From 063b0f78cab48e6d718b2fca1dca89acc5e43d74 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 11:29:44 -0700
Subject: [PATCH 19/25] comment

---
 src/helm/benchmark/presentation/schema.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index fed18c14dd..0035f45a8f 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -120,6 +120,7 @@ class MetricGroup(Field):
     """If set to true, do not compute win rates."""
 
     aggregation_strategies: Optional[List[str]] = field(default_factory=list)
+    """List with values in {'win_rate','mean'} that correspond to aggregations"""
 
 
 BY_METRIC = "by_metric"

From b70fded56756bd472deb77b6612c5956cc332ad2 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 11:36:09 -0700
Subject: [PATCH 20/25] s

---
 src/helm/benchmark/presentation/schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py
index 0035f45a8f..f10fe2ecee 100644
--- a/src/helm/benchmark/presentation/schema.py
+++ b/src/helm/benchmark/presentation/schema.py
@@ -119,7 +119,7 @@ class MetricGroup(Field):
     hide_win_rates: Optional[bool] = None
     """If set to true, do not compute win rates."""
 
-    aggregation_strategies: Optional[List[str]] = field(default_factory=list)
+    aggregation_strategies: Optional[List[str]] = None
     """List with values in {'win_rate','mean'} that correspond to aggregations"""
 
 

From aa3627795b02c7eb8e69c0b05561fef8b20719cb Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 14:03:23 -0700
Subject: [PATCH 21/25] fix most comments

---
 src/helm/benchmark/presentation/summarize.py | 39 ++++++++++----------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 0130f81bcf..31815e4a5e 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -258,32 +258,28 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     non-null values of the row are in columns we skip).
     """
 
-    means_per_row: List[Optional[float]] = []
+    row_means: List[Optional[float]] = []
 
     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
     for elem in table.header:
-        if elem.lower_is_better is not None:
-            orderings.append(elem.lower_is_better)
-    if not (all(orderings) or not any(orderings)):
+        orderings.append(elem.lower_is_better)
+    if len(set(orderings)) != 1:
         raise Exception("Cannot mean columns with different values for lower_is_better")
 
     for row in table.rows:
-        total: float = 0.0
+        total = 0.0
         count = 0
         for cell in row:
-            try:
-                if cell.value:
-                    total += float(cell.value)
-                    count += 1
-            except Exception:
-                print("failed")
+            if cell.value is not None:
+                total += float(cell.value)
+                count += 1
         if count == 0:
-            means_per_row.append(None)
+            row_means.append(None)
         else:
-            means_per_row.append(total / count)
+            row_means.append(total / count)
 
-    return means_per_row
+    return row_means
 
 
 AGGREGATE_WIN_RATE_COLUMN = 1
@@ -917,7 +913,7 @@ def create_group_table(
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
         add_win_rate: bool = False,
-        selected_agg_strats: List[str] = [],
+        aggregation_strategies: List[str] = [],
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
@@ -1100,15 +1096,16 @@ def _adapter_spec_sort_key(spec):
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-        selected_agg_strats = selected_agg_strats if selected_agg_strats is not None else ["win_rate"]
+        if aggregation_strategies is None:
+            aggregation_strategies = ["win_rate"]
 
         # this preserves backwards compatibility for self.schema.name_to_metric_group[metric_group].hide_win_rates
         # hide_win_rate is the inverse of add_win_rate here (see the function call for create_group_table)
         hide_aggregation = not add_win_rate
         if hide_aggregation:
-            selected_agg_strats = []
+            aggregation_strategies = []
         insertion_column = AGGREGATE_WIN_RATE_COLUMN
-        for strategy in selected_agg_strats:
+        for strategy in aggregation_strategies:
             if strategy == "win_rate":
                 # add overall win rate as the second column
                 WIN_RATE_AGGREGATION = "mean"
@@ -1191,7 +1188,9 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
                 display_name = self.schema.name_to_metric_group[metric_group].get_short_display_name()
-                agg_strats: List[str] = self.schema.name_to_metric_group[metric_group].aggregation_strategies or []
+                aggregate_strategies: List[str] = (
+                    self.schema.name_to_metric_group[metric_group].aggregation_strategies or []
+                )
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,
@@ -1199,7 +1198,7 @@ def create_group_tables_by_metric_group(self, group: RunGroup) -> List[Table]:
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
                     is_scenario_table=False,
                     add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
-                    selected_agg_strats=agg_strats,
+                    aggregation_strategies=aggregate_strategies,
                 )
                 tables.append(table)
         return tables

From 853a277efb6832f00853dbe668494fa8c59771a1 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 14:06:14 -0700
Subject: [PATCH 22/25] small fix

---
 src/helm/benchmark/presentation/summarize.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 31815e4a5e..c3555c355f 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1130,7 +1130,7 @@ def _adapter_spec_sort_key(spec):
                     HeaderCell(
                         "Mean Performance",
                         description=description,
-                        lower_is_better=False,
+                        lower_is_better=table.header[0].lower_is_better,
                     ),
                 )
                 for row, row_mean in zip(table.rows, means):
@@ -1138,7 +1138,7 @@ def _adapter_spec_sort_key(spec):
                 insertion_column += 1
             else:
                 raise Exception(
-                    f"Improper aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}"
+                    f"Unknown aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}"
                 )
 
         if bold_columns:

From 5849e6ef5c811fe12e7be1cf0d1ad161b91ca204 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 14:41:35 -0700
Subject: [PATCH 23/25] bigger fix

---
 src/helm/benchmark/presentation/summarize.py | 35 +++++++++++---------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index c3555c355f..9c2965c872 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1104,43 +1104,46 @@ def _adapter_spec_sort_key(spec):
         hide_aggregation = not add_win_rate
         if hide_aggregation:
             aggregation_strategies = []
-        insertion_column = AGGREGATE_WIN_RATE_COLUMN
+
+        aggregate_header_cells: List[HeaderCell] = []
+        aggregate_row_values: List[List[float]] = []
+
         for strategy in aggregation_strategies:
             if strategy == "win_rate":
-                # add overall win rate as the second column
                 WIN_RATE_AGGREGATION = "mean"
                 win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
                 description = "How many models this model outperform on average (over columns)."
-                table.header.insert(
-                    insertion_column,
+                aggregate_header_cells.append(
                     HeaderCell(
                         f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
                         description=description,
                         lower_is_better=False,
-                    ),
+                    )
                 )
-                for row, win_rate in zip(table.rows, win_rates):
-                    row.insert(insertion_column, Cell(win_rate))
-                insertion_column += 1
+                aggregate_row_values.append(win_rates)
             elif strategy == "mean":
                 means = compute_aggregate_row_means(table)
-                description = "An average over columns representing the mean performance"
-                table.header.insert(
-                    insertion_column,
+                description = "An average over columns representing the mean performance."
+                aggregate_header_cells.append(
                     HeaderCell(
-                        "Mean Performance",
+                        "Mean performance",
                         description=description,
                         lower_is_better=table.header[0].lower_is_better,
-                    ),
+                    )
                 )
-                for row, row_mean in zip(table.rows, means):
-                    row.insert(insertion_column, Cell(row_mean))
-                insertion_column += 1
+                aggregate_row_values.append(means)
             else:
                 raise Exception(
                     f"Unknown aggregation strategy found: {strategy}. Please use one of: {AGGREGATION_STRATEGIES}"
                 )
 
+        for i in range(len(aggregate_header_cells)):
+            aggregate_header_cell = aggregate_header_cells[i]
+            aggregate_rows = aggregate_row_values[i]
+            table.header.insert(i + 1, aggregate_header_cell)
+            for row, row_val in zip(table.rows, aggregate_rows):
+                row.insert(i + 1, Cell(row_val))
+
         if bold_columns:
             for i, header_cell in enumerate(table.header):
                 lower_is_better = header_cell.lower_is_better

From affc89a2c2cd4a03e0f5032897cfa0fe5cc10c15 Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 14:50:35 -0700
Subject: [PATCH 24/25] err

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 9c2965c872..62a6814aac 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1106,7 +1106,7 @@ def _adapter_spec_sort_key(spec):
             aggregation_strategies = []
 
         aggregate_header_cells: List[HeaderCell] = []
-        aggregate_row_values: List[List[float]] = []
+        aggregate_row_values: List[List[Optional[float]]] = []
 
         for strategy in aggregation_strategies:
             if strategy == "win_rate":

From 598245a54b04e0251fc4f0b404df7cf2a197b81d Mon Sep 17 00:00:00 2001
From: farzaank <fkaiyom@gmail.com>
Date: Tue, 24 Sep 2024 17:13:11 -0700
Subject: [PATCH 25/25] fixed

---
 src/helm/benchmark/presentation/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py
index 62a6814aac..0a0d3c50f6 100644
--- a/src/helm/benchmark/presentation/summarize.py
+++ b/src/helm/benchmark/presentation/summarize.py
@@ -1112,7 +1112,7 @@ def _adapter_spec_sort_key(spec):
             if strategy == "win_rate":
                 WIN_RATE_AGGREGATION = "mean"
                 win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-                description = "How many models this model outperform on average (over columns)."
+                description = "How many models this model outperforms on average (over columns)."
                 aggregate_header_cells.append(
                     HeaderCell(
                         f"{WIN_RATE_AGGREGATION.capitalize()} win rate",