
Commit

Merge branch 'main' into g-passk
jnanliu authored Dec 21, 2024
2 parents 3fdc500 + ebefffe commit 6ca63ca
Showing 25 changed files with 230 additions and 30 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -79,6 +79,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.

You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
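For a quick look at what that config pulls in, a minimal sketch (an assumption about the usual workflow, not part of the official docs) is to load it with mmengine and list the bundled dataset abbreviations; launching the full evaluation would normally go through the repository's `run.py` entry point instead.

```python
# Sketch only: assumes opencompass and mmengine are installed and that this
# is run from the repository root, where the config file lives.
from mmengine.config import Config

cfg = Config.fromfile('configs/eval_academic_leaderboard_202412.py')

# Each entry in `datasets` is a dict-style config; `abbr` is its short name.
print([d['abbr'] for d in cfg['datasets']])
```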

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation
2 changes: 2 additions & 0 deletions README_zh-CN.md
@@ -77,6 +77,8 @@

We will continue to publish detailed performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please send the model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.

You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation Guide
4 changes: 4 additions & 0 deletions configs/datasets/ruler/ruler_128k_gen.py
@@ -1,3 +1,5 @@
import os

from mmengine.config import read_base

with read_base():
@@ -12,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k']
@@ -25,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
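Taken together, the hidden context around this hunk follows the pattern sketched below: each base RULER dataset imported via `read_base()` is copied, suffixed, and stamped with the sample count, the context length, and now the tokenizer model read from the `TOKENIZER_MODEL` environment variable. This is a paraphrase rather than the exact file contents (`base_datasets` stands in for the imported niah/vt/cwe/fwe/qa lists), and the same edit is mirrored in the 16k through 1m variants below.

```python
# Condensed sketch of the ruler_*_gen.py pattern; `base_datasets` is a
# placeholder for the dataset lists pulled in through read_base().
import os

NUM_SAMPLES = 100                     # samples evaluated per subtask
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
max_seq_lens = [1024 * 128]           # context lengths under test
abbr_suffixs = ['128k']

base_datasets = []                    # e.g. niah_datasets + vt_datasets + ...
ruler_datasets = []
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for dataset in base_datasets:
        tmp_dataset = dataset.copy()  # the real configs may deep-copy here
        tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
        tmp_dataset['num_samples'] = NUM_SAMPLES
        tmp_dataset['max_seq_length'] = max_seq_len
        tmp_dataset['tokenizer_model'] = tokenizer_model
        ruler_datasets.append(tmp_dataset)
```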
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_16k_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_1m_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_32k_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
4 changes: 4 additions & 0 deletions configs/datasets/ruler/ruler_4k_gen.py
@@ -1,3 +1,5 @@
import os

from mmengine.config import read_base

with read_base():
@@ -12,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k']
@@ -25,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
4 changes: 4 additions & 0 deletions configs/datasets/ruler/ruler_64k_gen.py
@@ -1,3 +1,5 @@
import os

from mmengine.config import read_base

with read_base():
@@ -12,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 64]
abbr_suffixs = ['64k']
@@ -25,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_8k_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
9 changes: 2 additions & 7 deletions configs/datasets/ruler/ruler_niah_gen.py
@@ -1,9 +1,7 @@
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator


# Ruler Dataset settings
niah_configurations = [
@@ -92,10 +90,7 @@
'type': RulerNiahDataset,
'base_path': base_path,
'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'],
13 changes: 3 additions & 10 deletions configs/eval_academic_leaderboard_202412.py
@@ -10,35 +10,30 @@
#######################################################################
with read_base():
# Datasets Part
## Core Set
# Knowledge
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
mmlu_pro_datasets,
)

# General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
gpqa_datasets,
)
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
bbh_datasets,
)
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
humaneval_datasets,
)

# Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
ifeval_datasets,
)
from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
LCBCodeGeneration_dataset,
)

# Math
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
cmo_fib_datasets,
)
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
aime2024_datasets,
)
@@ -77,7 +72,6 @@
['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'],
@@ -101,7 +95,6 @@
'',
'Math Calculation',
['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
'',
'Knowledge',
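For context, the dataset/metric pairs edited above normally sit inside the config's summarizer section, roughly as sketched below: plain strings act as section labels in the report and `['abbr', 'metric']` pairs select one metric per dataset. This is an illustrative reconstruction built from the entries visible in the diff, not the exact file contents; the grouping and the empty `summary_groups` list are assumptions.

```python
# Illustrative summarizer layout; entries mirror the diff above.
summarizer = dict(
    dataset_abbrs=[
        ['IFEval', 'Prompt-level-strict-accuracy'],
        ['bbh', 'naive_average'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
    ],
    summary_groups=[],  # aggregate groups (e.g. a core average) would go here
)
```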
164 changes: 164 additions & 0 deletions opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py
@@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)

# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)

LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]
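A minimal usage sketch, assuming the `read_base` mechanism the other configs in this commit use: import the code-generation split defined above and place it in an eval config's `datasets` list. A complete config would also define models and a summarizer, which are omitted here.

```python
# Sketch of wiring the new config into an evaluation; model and summarizer
# sections are left out for brevity.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,
    )

datasets = [LCBCodeGeneration_dataset]
```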