Skip to content

Commit

Permalink
[Update] Add MATH500 & AIME2024 to LiveMathBench (#1741)
Browse files Browse the repository at this point in the history
* upload dataset definitions & configs

* add metrics specific to a single dataset split

* add k-pass@threshold & MATH500

* update std computation & k-pass computation

* add AIME2024

* update README
  • Loading branch information
jnanliu authored Dec 6, 2024
1 parent 08d63b5 commit f333be1
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 25 deletions.
1 change: 1 addition & 0 deletions opencompass/configs/datasets/livemathbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
| CMO | cn | 0 | 0 | 0 | 18 |
| CMO | en | 0 | 0 | 0 | 18 |
| MATH500 | en | 0 | 0 | 0 | 500 |
| AIME2024 | en | 0 | 0 | 0 | 44 |


## How to use
Expand Down
55 changes: 30 additions & 25 deletions opencompass/datasets/livemathbench/livemathbench.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import concurrent.futures
import os
import re
from collections import OrderedDict
from copy import deepcopy
from itertools import product
from typing import Any, Dict, List
Expand All @@ -21,7 +22,7 @@

@LOAD_DATASET.register_module()
class LiveMathBenchDataset(BaseDataset):
dataset_splits = ['AIMC', 'CEE', 'CMO', 'MATH500']
dataset_splits = ['AIMC', 'CEE', 'CMO', 'MATH500', 'AIME2024']
dataset_languages = ['cn', 'en']

@staticmethod
Expand Down Expand Up @@ -276,12 +277,11 @@ def score(self, predictions, references, origin_prompt, test_set):
details = []
all_dataset = set()
for key, examples in key2example.items():
detail = {
'question': examples[0][0]['question'],
'answer': examples[0][0]['answer'],
'responses': [],
'dataset': '_'.join(key.split('_')[:-1])
}
detail = OrderedDict()
detail['question'] = examples[0][0]['question']
detail['answer'] = examples[0][0]['answer']
detail['responses'] = []
detail['dataset'] = '_'.join(key.split('_')[:-1])
all_dataset.add('_'.join(key.split('_')[:-1]))
if_pass_list = []
for single_run_examples in examples:
Expand All @@ -308,9 +308,11 @@ def score(self, predictions, references, origin_prompt, test_set):
f'pass-rate@{i}/std':
if_pass_list[:, :i].mean(axis=1).std(axis=0).item(),
f'pass@{i}':
if_pass_list[:, :1].mean(axis=1).mean(axis=0).item(),
np.ceil(
if_pass_list[:, :i].mean(axis=1)).mean(axis=0).item(),
f'pass@{i}/std':
if_pass_list[:, :1].mean(axis=1).std(axis=0).item(),
np.ceil(
if_pass_list[:, :i].mean(axis=1)).std(axis=0).item(),
})
i = i * 2

Expand All @@ -328,7 +330,8 @@ def score(self, predictions, references, origin_prompt, test_set):

details.append(detail)

detailed_result = {'details': details}
detailed_result = OrderedDict()
detailed_result['details'] = details

i = 1
while i <= K:
Expand Down Expand Up @@ -378,24 +381,26 @@ def score(self, predictions, references, origin_prompt, test_set):
})
detailed_result.update({
f'{K}-pass@{threshold}/std':
100. * np.std([
100. * np.mean([
detail[f'{K}-pass@{threshold}'] for detail in details
])
})
for d in sorted(list(all_dataset)):
detailed_result.update({
f'{d}/{K}-pass@{threshold}':
100. * np.mean([
detail[f'{K}-pass@{threshold}']
for detail in details if detail['dataset'] == d
])
})
detailed_result.update({
f'{d}/{K}-pass@{threshold}/std':
100. * np.std([
detail[f'{K}-pass@{threshold}']
for detail in details if detail['dataset'] == d
])
})

for threshold in [0.5, 0.75, 1.0]:
detailed_result.update({
f'{d}/{K}-pass@{threshold}':
100. * np.mean([
detail[f'{K}-pass@{threshold}']
for detail in details if detail['dataset'] == d
])
})
detailed_result.update({
f'{d}/{K}-pass@{threshold}/std':
100. * np.mean([
detail[f'{K}-pass@{threshold}']
for detail in details if detail['dataset'] == d
])
})

return detailed_result

0 comments on commit f333be1

Please sign in to comment.