
Commit

Merge branch 'main' into g-passk
jnanliu authored Dec 21, 2024
2 parents 3fdc500 + ebefffe commit 6ca63ca
Showing 25 changed files with 230 additions and 30 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -79,6 +79,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.

You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).
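For a quick look at what that config pulls in, a minimal sketch (an assumption about the usual workflow, not part of the official docs) is to load it with mmengine and list the bundled dataset abbreviations; launching the full evaluation would normally go through the repository's `run.py` entry point instead.

```python
# Sketch only: assumes opencompass and mmengine are installed and that this
# is run from the repository root, where the config file lives.
from mmengine.config import Config

cfg = Config.fromfile('configs/eval_academic_leaderboard_202412.py')

# Each entry in `datasets` is a dict-style config; `abbr` is its short name.
print([d['abbr'] for d in cfg['datasets']])
```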

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation
2 changes: 2 additions & 0 deletions README_zh-CN.md
@@ -77,6 +77,8 @@

We will continue to publish detailed performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please send the model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.

You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval).

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation Guide
4 changes: 4 additions & 0 deletions configs/datasets/ruler/ruler_128k_gen.py
@@ -1,3 +1,5 @@
import os

from mmengine.config import read_base

with read_base():
@@ -12,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 128]
abbr_suffixs = ['128k']
@@ -25,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
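Taken together, the hidden context around this hunk follows the pattern sketched below: each base RULER dataset imported via `read_base()` is copied, suffixed, and stamped with the sample count, the context length, and now the tokenizer model read from the `TOKENIZER_MODEL` environment variable. This is a paraphrase rather than the exact file contents (`base_datasets` stands in for the imported niah/vt/cwe/fwe/qa lists), and the same edit is mirrored in the 16k through 1m variants below.

```python
# Condensed sketch of the ruler_*_gen.py pattern; `base_datasets` is a
# placeholder for the dataset lists pulled in through read_base().
import os

NUM_SAMPLES = 100                     # samples evaluated per subtask
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
max_seq_lens = [1024 * 128]           # context lengths under test
abbr_suffixs = ['128k']

base_datasets = []                    # e.g. niah_datasets + vt_datasets + ...
ruler_datasets = []
for max_seq_len, abbr_suffix in zip(max_seq_lens, abbr_suffixs):
    for dataset in base_datasets:
        tmp_dataset = dataset.copy()  # the real configs may deep-copy here
        tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
        tmp_dataset['num_samples'] = NUM_SAMPLES
        tmp_dataset['max_seq_length'] = max_seq_len
        tmp_dataset['tokenizer_model'] = tokenizer_model
        ruler_datasets.append(tmp_dataset)
```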
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_16k_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 16]
abbr_suffixs = ['16k']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_1m_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 1024]
abbr_suffixs = ['1m']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_32k_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 32]
abbr_suffixs = ['32k']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
4 changes: 4 additions & 0 deletions configs/datasets/ruler/ruler_4k_gen.py
@@ -1,3 +1,5 @@
import os

from mmengine.config import read_base

with read_base():
@@ -12,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 4]
abbr_suffixs = ['4k']
@@ -25,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
4 changes: 4 additions & 0 deletions configs/datasets/ruler/ruler_64k_gen.py
@@ -1,3 +1,5 @@
import os

from mmengine.config import read_base

with read_base():
@@ -12,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 64]
abbr_suffixs = ['64k']
@@ -25,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
3 changes: 3 additions & 0 deletions configs/datasets/ruler/ruler_8k_gen.py
@@ -1,3 +1,4 @@
import os

from mmengine.config import read_base

@@ -13,6 +14,7 @@

# Evaluation config
NUM_SAMPLES = 100 # Change to the number of samples you need
tokenizer_model = os.environ.get('TOKENIZER_MODEL', 'gpt-4')
# Change the context lengths to be tested
max_seq_lens = [1024 * 8]
abbr_suffixs = ['8k']
@@ -26,4 +28,5 @@
tmp_dataset['abbr'] = tmp_dataset['abbr'] + '_' + abbr_suffix
tmp_dataset['num_samples'] = NUM_SAMPLES
tmp_dataset['max_seq_length'] = max_seq_len
tmp_dataset['tokenizer_model'] = tokenizer_model
ruler_datasets.append(tmp_dataset)
9 changes: 2 additions & 7 deletions configs/datasets/ruler/ruler_niah_gen.py
@@ -1,9 +1,7 @@
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset, RulerNiahEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.ruler.ruler_niah import RulerNiahDataset
from opencompass.datasets.ruler.ruler_niah import RulerNiahEvaluator


# Ruler Dataset settings
niah_configurations = [
@@ -92,10 +90,7 @@
'type': RulerNiahDataset,
'base_path': base_path,
'file_path': file_path,
# 'tokenizer_model': model_path,
'tokens_to_generate': 128,
# 'max_seq_length': max_seq_len,
# 'num_samples': NUM_SAMPLES,
'type_haystack': config['type_haystack'],
'type_needle_k': config['type_needle_k'],
'type_needle_v': config['type_needle_v'],
13 changes: 3 additions & 10 deletions configs/eval_academic_leaderboard_202412.py
@@ -10,35 +10,30 @@
#######################################################################
with read_base():
# Datasets Part
## Core Set
# Knowledge
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
mmlu_pro_datasets,
)

# General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import (
gpqa_datasets,
)
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_gen_925fc4 import (
bbh_datasets,
)
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
humaneval_datasets,
)

# Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import (
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import (
ifeval_datasets,
)
from opencompass.configs.datasets.livecodebench.livecodebench_gen_6966bc import (
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
LCBCodeGeneration_dataset,
)

# Math
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import (
cmo_fib_datasets,
)
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import (
aime2024_datasets,
)
@@ -77,7 +72,6 @@
['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'],
@@ -101,7 +95,6 @@
'',
'Math Calculation',
['math_prm800k_500', 'accuracy'],
['cmo_fib', 'accuracy'],
['aime2024', 'accuracy'],
'',
'Knowledge',
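For context, the dataset/metric pairs edited above normally sit inside the config's summarizer section, roughly as sketched below: plain strings act as section labels in the report and `['abbr', 'metric']` pairs select one metric per dataset. This is an illustrative reconstruction built from the entries visible in the diff, not the exact file contents; the grouping and the empty `summary_groups` list are assumptions.

```python
# Illustrative summarizer layout; entries mirror the diff above.
summarizer = dict(
    dataset_abbrs=[
        ['IFEval', 'Prompt-level-strict-accuracy'],
        ['bbh', 'naive_average'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
    ],
    summary_groups=[],  # aggregate groups (e.g. a core average) would go here
)
```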
164 changes: 164 additions & 0 deletions opencompass/configs/datasets/livecodebench/livecodebench_gen_a4f90b.py
@@ -0,0 +1,164 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
LCBCodeGenerationDataset,
LCBCodeExecutionDataset,
LCBTestOutputPredictionDataset,
LCBCodeGenerationEvaluator,
LCBCodeExecutionEvaluator,
LCBTestOutputEvaluator
)
from opencompass.datasets.livecodebench import TestOutputPromptConstants


lcb_code_generation_reader_cfg = dict(
input_columns=[
'question_content',
'format_prompt',
],
# output_column='evaluation_sample',
output_column='question_id',
)

SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \
'### Answer: (use the provided format with backticks)\n\n'


# Code Generation Tasks
lcb_code_generation_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=prompt_template
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)

lcb_code_generation_eval_cfg = dict(
evaluator=dict(
type=LCBCodeGenerationEvaluator,
num_process_evaluate=4,
timeout=6,
),
pred_role='BOT',
)

LCBCodeGeneration_dataset = dict(
type=LCBCodeGenerationDataset,
abbr='lcb_code_generation',
path='opencompass/code_generation_lite',
reader_cfg=lcb_code_generation_reader_cfg,
infer_cfg=lcb_code_generation_infer_cfg,
eval_cfg=lcb_code_generation_eval_cfg
)

# Code Execution Dataset
lcb_code_execution_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)

lcb_code_execution_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.'
),
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)

lcb_code_execution_eval_cfg = dict(
evaluator=dict(
type=LCBCodeExecutionEvaluator,
),
pred_role='BOT',
)

LCBCodeExecution_dataset = dict(
type=LCBCodeExecutionDataset,
abbr='lcb_code_execution',
path='opencompass/execution-v2',
reader_cfg=lcb_code_execution_reader_cfg,
infer_cfg=lcb_code_execution_infer_cfg,
eval_cfg=lcb_code_execution_eval_cfg,
)

# Test Output Prediction Dataset
lcb_test_output_reader_cfg = dict(
input_columns=[
'prompt',
],
output_column='evaluation_sample',
)

system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.'

lcb_test_output_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
# begin=[
# dict(
# role='SYSTEM',
# prompt=system_prompt
# ),
# ],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
)
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)

lcb_test_output_eval_cfg = dict(
evaluator=dict(
type=LCBTestOutputEvaluator,
),
pred_role='BOT',
)

LCBTestOutput_dataset = dict(
type=LCBTestOutputPredictionDataset,
abbr='lcb_test_output',
path='opencompass/test_generation',
reader_cfg=lcb_test_output_reader_cfg,
infer_cfg=lcb_test_output_infer_cfg,
eval_cfg=lcb_test_output_eval_cfg,
)

LCB_datasets = [
LCBCodeGeneration_dataset,
LCBCodeExecution_dataset,
LCBTestOutput_dataset,
]
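A minimal usage sketch, assuming the `read_base` mechanism the other configs in this commit use: import the code-generation split defined above and place it in an eval config's `datasets` list. A complete config would also define models and a summarizer, which are omitted here.

```python
# Sketch of wiring the new config into an evaluation; model and summarizer
# sections are left out for brevity.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,
    )

datasets = [LCBCodeGeneration_dataset]
```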