[Update] Add MATH500 & AIME2024 to LiveMathBench (#1741)

* upload dataset definitions & configs * add single dataset split specific metrics * add k-pass@threshold & MATH500 * update std computation & k-pass computation * add AIME2024 * update README
open-compass · Dec 6, 2024 · f333be1 · f333be1
1 parent 08d63b5
commit f333be1
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 25 deletions.
diff --git a/opencompass/configs/datasets/livemathbench/README.md b/opencompass/configs/datasets/livemathbench/README.md
@@ -11,6 +11,7 @@
 | CMO | cn | 0 | 0 | 0 | 18 |
 | CMO | en | 0 | 0 | 0 | 18 |
 | MATH500 | en | 0 | 0 | 0 | 500 |
+| AIME2024 | en | 0 | 0 | 0 | 44 |
 
 
 ## How to use

diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py
@@ -1,6 +1,7 @@
 import concurrent.futures
 import os
 import re
+from collections import OrderedDict
 from copy import deepcopy
 from itertools import product
 from typing import Any, Dict, List
@@ -21,7 +22,7 @@
 
 @LOAD_DATASET.register_module()
 class LiveMathBenchDataset(BaseDataset):
-    dataset_splits = ['AIMC', 'CEE', 'CMO', 'MATH500']
+    dataset_splits = ['AIMC', 'CEE', 'CMO', 'MATH500', 'AIME2024']
     dataset_languages = ['cn', 'en']
 
     @staticmethod
@@ -276,12 +277,11 @@ def score(self, predictions, references, origin_prompt, test_set):
         details = []
         all_dataset = set()
         for key, examples in key2example.items():
-            detail = {
-                'question': examples[0][0]['question'],
-                'answer': examples[0][0]['answer'],
-                'responses': [],
-                'dataset': '_'.join(key.split('_')[:-1])
-            }
+            detail = OrderedDict()
+            detail['question'] = examples[0][0]['question']
+            detail['answer'] = examples[0][0]['answer']
+            detail['responses'] = []
+            detail['dataset'] = '_'.join(key.split('_')[:-1])
             all_dataset.add('_'.join(key.split('_')[:-1]))
             if_pass_list = []
             for single_run_examples in examples:
@@ -308,9 +308,11 @@ def score(self, predictions, references, origin_prompt, test_set):
                     f'pass-rate@{i}/std':
                     if_pass_list[:, :i].mean(axis=1).std(axis=0).item(),
                     f'pass@{i}':
-                    if_pass_list[:, :1].mean(axis=1).mean(axis=0).item(),
+                    np.ceil(
+                        if_pass_list[:, :i].mean(axis=1)).mean(axis=0).item(),
                     f'pass@{i}/std':
-                    if_pass_list[:, :1].mean(axis=1).std(axis=0).item(),
+                    np.ceil(
+                        if_pass_list[:, :i].mean(axis=1)).std(axis=0).item(),
                 })
                 i = i * 2
 
@@ -328,7 +330,8 @@ def score(self, predictions, references, origin_prompt, test_set):
 
             details.append(detail)
 
-        detailed_result = {'details': details}
+        detailed_result = OrderedDict()
+        detailed_result['details'] = details
 
         i = 1
         while i <= K:
@@ -378,24 +381,26 @@ def score(self, predictions, references, origin_prompt, test_set):
                 })
                 detailed_result.update({
                     f'{K}-pass@{threshold}/std':
-                    100. * np.std([
+                    100. * np.mean([
                         detail[f'{K}-pass@{threshold}'] for detail in details
                     ])
                 })
             for d in sorted(list(all_dataset)):
-                detailed_result.update({
-                    f'{d}/{K}-pass@{threshold}':
-                    100. * np.mean([
-                        detail[f'{K}-pass@{threshold}']
-                        for detail in details if detail['dataset'] == d
-                    ])
-                })
-                detailed_result.update({
-                    f'{d}/{K}-pass@{threshold}/std':
-                    100. * np.std([
-                        detail[f'{K}-pass@{threshold}']
-                        for detail in details if detail['dataset'] == d
-                    ])
-                })
+
+                for threshold in [0.5, 0.75, 1.0]:
+                    detailed_result.update({
+                        f'{d}/{K}-pass@{threshold}':
+                        100. * np.mean([
+                            detail[f'{K}-pass@{threshold}']
+                            for detail in details if detail['dataset'] == d
+                        ])
+                    })
+                    detailed_result.update({
+                        f'{d}/{K}-pass@{threshold}/std':
+                        100. * np.mean([
+                            detail[f'{K}-pass@{threshold}']
+                            for detail in details if detail['dataset'] == d
+                        ])
+                    })
 
         return detailed_result