Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ci] benchmark react #2183

Merged
merged 159 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
159 commits
Select commit Hold shift + click to select a range
5987d68
Create env.sh
zhulinJulia24 Mar 29, 2024
e04ba82
Create test.yml
zhulinJulia24 Mar 29, 2024
81f9e7a
Update test.yml
zhulinJulia24 Mar 29, 2024
6617cb8
Update test.yml
zhulinJulia24 Mar 29, 2024
1c7f3ff
Update test.yml
zhulinJulia24 Mar 29, 2024
d7274f4
Update test.yml
zhulinJulia24 Mar 29, 2024
750ed34
Merge branch 'InternLM:main' into main
zhulinJulia24 Mar 29, 2024
a4cc1a7
Create benchmark.yml
zhulinJulia24 Mar 29, 2024
7ec0995
Update daily_ete_test.yml
zhulinJulia24 Mar 30, 2024
653d7d3
Update test.yml
zhulinJulia24 Apr 1, 2024
e10e0fe
Update env.sh
zhulinJulia24 Apr 1, 2024
b36e74b
Update test.yml
zhulinJulia24 Apr 1, 2024
e1e635a
Merge branch 'InternLM:main' into main
zhulinJulia24 Apr 1, 2024
35b8d84
Update benchmark.yml
zhulinJulia24 Apr 2, 2024
84a26b9
update
Jul 19, 2024
a18ad00
update
Jul 19, 2024
27c834a
Update benchmark_utils.py
zhulinJulia24 Jul 19, 2024
e6feeb4
update
Jul 19, 2024
b9f7f3e
update
Jul 22, 2024
41f4536
update
Jul 22, 2024
004ce4d
updatre
Jul 22, 2024
f6cbb9e
update
Jul 22, 2024
6002e4a
update
Jul 22, 2024
10273ee
update
Jul 22, 2024
e7ad329
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Jul 23, 2024
c97ace8
update
Jul 23, 2024
b955215
update
Jul 23, 2024
d20a2c0
update
Jul 23, 2024
db309aa
update
Jul 23, 2024
9142ee2
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Jul 23, 2024
a145bff
update
Jul 24, 2024
3f87f97
update
Jul 25, 2024
6bb4ab3
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Jul 25, 2024
aa424e7
update
Jul 25, 2024
ef9435c
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Jul 25, 2024
3724c10
update
Jul 26, 2024
f07ac2c
update
Jul 26, 2024
5c9c38a
update
Jul 26, 2024
5410785
update
Jul 26, 2024
f1aabd3
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Jul 26, 2024
32bd428
update
Jul 26, 2024
72098e0
update
Jul 26, 2024
a7036f6
update
Jul 29, 2024
1630f53
update
Jul 29, 2024
5f95828
update
Jul 30, 2024
6ed31bf
update
Jul 30, 2024
54889a7
u[date
Jul 31, 2024
a60df50
update
Jul 31, 2024
f963caf
update
Jul 31, 2024
34eb93d
Update pr_ete_test.yml
zhulinJulia24 Jul 31, 2024
99b6dfc
Update pr_ete_test.yml
zhulinJulia24 Jul 31, 2024
b905dbd
Update pr_ete_test.yml
zhulinJulia24 Jul 31, 2024
1f0cc6d
update
Aug 1, 2024
b5c82ce
update
Aug 1, 2024
144ce00
update
Aug 1, 2024
a0613c1
update
Aug 1, 2024
9b17b69
update
Aug 1, 2024
f0317d5
update
Aug 1, 2024
7d45f5e
update
Aug 1, 2024
ed6bf6a
update
Aug 1, 2024
ff3a447
update
Aug 1, 2024
f37096c
Merge branch 'InternLM:main' into update_pr_image
zhulinJulia24 Aug 1, 2024
ed9870b
update
Aug 1, 2024
184f8a9
updatr
Aug 1, 2024
4c43434
update
Aug 2, 2024
db4d615
update
Aug 2, 2024
e988ef5
update
Aug 2, 2024
4a4c719
updaet
Aug 2, 2024
228291d
update
Aug 2, 2024
342e62b
update
Aug 2, 2024
8a03323
update
Aug 2, 2024
6dc0ea4
updaste
Aug 5, 2024
a335b1a
update
Aug 5, 2024
0c622c4
update
Aug 5, 2024
40b19d2
update
Aug 5, 2024
e09ad05
update
Aug 5, 2024
a437f16
update
Aug 5, 2024
ade7fc1
update
Aug 6, 2024
6bf5da1
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 6, 2024
25e3e2e
update
Aug 6, 2024
add29f6
update
Aug 6, 2024
0d6241e
update
Aug 6, 2024
95c39c3
update
Aug 6, 2024
e701dca
update
Aug 6, 2024
e4fba0c
Merge branch 'InternLM:main' into update_pr_image
zhulinJulia24 Aug 6, 2024
be238e4
update
Aug 7, 2024
b507ae8
update
Aug 7, 2024
65637ff
update
Aug 7, 2024
3cf107d
updaste
Aug 7, 2024
c0de5c4
update
Aug 7, 2024
a50bffd
update
Aug 7, 2024
fab464b
update
Aug 7, 2024
48d3343
update
Aug 7, 2024
c366dff
update
Aug 7, 2024
c8bf936
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 8, 2024
5cd358e
update
Aug 8, 2024
494772f
merge main
Aug 8, 2024
10e649c
update
Aug 9, 2024
34255ee
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 9, 2024
6c67be5
Update benchmark.yml
zhulinJulia24 Aug 10, 2024
63e3b04
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 13, 2024
74a0be0
update
Aug 14, 2024
d36579f
update
Aug 14, 2024
3483fbd
update
Aug 14, 2024
823b224
update
Aug 14, 2024
810647c
update
Aug 14, 2024
06fa00b
update
Aug 14, 2024
380fe48
update
Aug 15, 2024
651623f
update
Aug 15, 2024
8253e0e
update
Aug 15, 2024
28133a2
update
Aug 15, 2024
65520e5
update
Aug 15, 2024
259b514
Update stable.yml
zhulinJulia24 Aug 15, 2024
36b4f23
update
Aug 15, 2024
020ce7b
update
Aug 15, 2024
9c62f57
update
Aug 15, 2024
6c23926
update
Aug 15, 2024
8b3d61f
update
Aug 15, 2024
42fb224
update
Aug 15, 2024
0cb1cce
updaet
Aug 15, 2024
1c73201
update
Aug 16, 2024
3d0310b
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 16, 2024
9a9a147
update
Aug 16, 2024
db21618
update
Aug 16, 2024
26d82d5
update
Aug 19, 2024
fdc730c
update
Aug 19, 2024
2b6f172
update
Aug 19, 2024
94e200c
update
Aug 19, 2024
ce42738
update
Aug 19, 2024
78f158e
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 20, 2024
034f645
update
Aug 20, 2024
e07f9c4
update
Aug 20, 2024
ed13f74
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 20, 2024
6601eb6
Update stable.yml
zhulinJulia24 Aug 20, 2024
e6cab7e
Update stable.yml
zhulinJulia24 Aug 20, 2024
b777022
update
Aug 20, 2024
37630b5
update
Aug 20, 2024
168269e
Update stable.yml
zhulinJulia24 Aug 20, 2024
3e92a42
Update stable.yml
zhulinJulia24 Aug 20, 2024
1491c93
Update stable.yml
zhulinJulia24 Aug 20, 2024
4ebe9cc
update
Aug 21, 2024
9748e94
update
Aug 21, 2024
532c87e
update
Aug 21, 2024
12c9162
update
Aug 21, 2024
6270a1a
update
Aug 22, 2024
7430b83
update
Aug 22, 2024
8786434
update
Aug 22, 2024
415925e
update
Aug 22, 2024
f4a352c
update
Aug 22, 2024
306caf6
update
Aug 22, 2024
cf857b0
update
Aug 22, 2024
d904333
update
Aug 23, 2024
30224d4
updaste
Aug 23, 2024
2a5e53d
Merge branch 'InternLM:main' into benchmark_react
zhulinJulia24 Aug 23, 2024
d4642ac
update
Aug 26, 2024
0d6a4ae
update
Aug 26, 2024
1494e6c
update
Aug 26, 2024
bbf9e7a
update
Aug 26, 2024
f5196d2
update
Aug 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/resources/opencompass-hf-results.json
Original file line number Diff line number Diff line change
Expand Up @@ -129,5 +129,9 @@
"race-middle": "88.63",
"race-high": "81.22",
"crows_pairs": "86.07"
},
"internlm/internlm2_5-7b-chat": {
"mmlu": "72.8",
"gsm8k": "86.0"
}
}
96 changes: 47 additions & 49 deletions .github/scripts/action_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import shutil
import subprocess
import time
from collections import OrderedDict
from typing import List

Expand Down Expand Up @@ -166,7 +167,11 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
f'python3 {opencompass_dir}/run.py {config_path_new} -w {work_dir} --reuse --max-num-workers 8' # noqa: E501
]
eval_log = os.path.join(workspace, f'eval.{ori_model}.txt')
start_time = time.time()
ret = run_cmd(cmd_eval, log_path=eval_log, cwd=lmdeploy_dir)
end_time = time.time()
task_duration_seconds = round(end_time - start_time, 2)
logging.info(f'task_duration_seconds: {task_duration_seconds}\n')
if ret != 0:
continue
csv_files = glob.glob(f'{work_dir}/*/summary/summary_*.csv')
Expand Down Expand Up @@ -204,6 +209,7 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
prec = precision if do_lite else '-'

row = ','.join([model, engine_type, prec] +
[str(task_duration_seconds)] +
[model_results[_] for _ in dataset_names])
hf_res_row = None
if hf_model_path not in test_model_names:
Expand All @@ -213,11 +219,11 @@ def evaluate(models: List[str], datasets: List[str], workspace: str):
hf_metrics = [
hf_res[d] if d in hf_res else '-' for d in dataset_names
]
hf_res_row = ','.join([model, 'hf', '-'] + hf_metrics)
hf_res_row = ','.join([model, 'hf', '-', '-'] + hf_metrics)
if not os.path.exists(output_csv):
with open(output_csv, 'w') as f:
header = ','.join(['Model', 'Engine', 'Precision'] +
dataset_names)
['task_duration_secs'] + dataset_names)
f.write(header + '\n')
if hf_res_row:
f.write(hf_res_row + '\n')
Expand Down Expand Up @@ -264,53 +270,45 @@ def generate_benchmark_report(report_path: str):
benchmark_subfolders = [
f.path for f in os.scandir(sec_dir_path) if f.is_dir()
]
for benchmark_subfolder in benchmark_subfolders:
backend_subfolders = [
f.path for f in os.scandir(benchmark_subfolder)
if f.is_dir()
]
for backend_subfolder in backend_subfolders:
benchmark_type = backend_subfolder.replace(
sec_dir_path + '/', '')
print('*' * 10, benchmark_type, '*' * 10)
_append_summary('-' * 10 + benchmark_type + '-' * 10 +
'\n')
merged_csv_path = os.path.join(backend_subfolder,
'summary.csv')
csv_files = glob.glob(
os.path.join(backend_subfolder, '*.csv'))
average_csv_path = os.path.join(backend_subfolder,
'average.csv')
if merged_csv_path in csv_files:
csv_files.remove(merged_csv_path)
if average_csv_path in csv_files:
csv_files.remove(average_csv_path)
merged_df = pd.DataFrame()

if len(csv_files) > 0:
for f in csv_files:
df = pd.read_csv(f)
merged_df = pd.concat([merged_df, df],
ignore_index=True)

merged_df = merged_df.sort_values(
by=merged_df.columns[0])

grouped_df = merged_df.groupby(merged_df.columns[0])
if 'generation' not in benchmark_subfolder:
average_values = grouped_df.pipe(
(lambda group: {
'mean': group.mean().round(decimals=3)
}))['mean']
average_values.to_csv(average_csv_path, index=True)
avg_df = pd.read_csv(average_csv_path)
merged_df = pd.concat([merged_df, avg_df],
ignore_index=True)
add_summary(average_csv_path)
merged_df.to_csv(merged_csv_path, index=False)
if 'generation' in benchmark_subfolder:
add_summary(merged_csv_path)
print(merged_df)
for backend_subfolder in benchmark_subfolders:
benchmark_type = backend_subfolder.replace(
sec_dir_path + '/', '')
print('*' * 10, benchmark_type, '*' * 10)
_append_summary('-' * 10 + benchmark_type + '-' * 10 + '\n')
merged_csv_path = os.path.join(backend_subfolder,
'summary.csv')
csv_files = glob.glob(os.path.join(backend_subfolder, '*.csv'))
average_csv_path = os.path.join(backend_subfolder,
'average.csv')
if merged_csv_path in csv_files:
csv_files.remove(merged_csv_path)
if average_csv_path in csv_files:
csv_files.remove(average_csv_path)
merged_df = pd.DataFrame()

if len(csv_files) > 0:
for f in csv_files:
df = pd.read_csv(f)
merged_df = pd.concat([merged_df, df],
ignore_index=True)

merged_df = merged_df.sort_values(by=merged_df.columns[0])

grouped_df = merged_df.groupby(merged_df.columns[0])
if 'generation' not in backend_subfolder:
average_values = grouped_df.pipe(
(lambda group: {
'mean': group.mean().round(decimals=3)
}))['mean']
average_values.to_csv(average_csv_path, index=True)
avg_df = pd.read_csv(average_csv_path)
merged_df = pd.concat([merged_df, avg_df],
ignore_index=True)
add_summary(average_csv_path)
merged_df.to_csv(merged_csv_path, index=False)
if 'generation' in backend_subfolder:
add_summary(merged_csv_path)

_append_summary('## Benchmark Results End')


Expand Down
76 changes: 68 additions & 8 deletions .github/scripts/eval_opencompass_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,10 @@
MAX_NEW_TOKENS = 1024

tb_engine_config_template_max_bs_128 = dict(session_len=MAX_SESSION_LEN,
max_batch_size=128,
rope_scaling_factor=1.0)
max_batch_size=128)
tb_engine_config_template_max_bs_128_tp2 = dict(session_len=MAX_SESSION_LEN,
max_batch_size=128,
tp=2,
rope_scaling_factor=1.0)
tp=2)

pt_engine_config_template_max_bs_16 = dict(session_len=MAX_SESSION_LEN,
max_batch_size=16)
Expand Down Expand Up @@ -192,12 +190,10 @@
tp=2)
tb_awq_engine_config_template_max_bs_8 = dict(session_len=MAX_SESSION_LEN,
max_batch_size=8,
model_format='awq',
rope_scaling_factor=1.0)
model_format='awq')
tb_awq_engine_config_template_max_bs_32 = dict(session_len=MAX_SESSION_LEN,
max_batch_size=32,
model_format='awq',
rope_scaling_factor=1.0)
model_format='awq')

gen_config_template = dict(top_k=1,
top_p=0.8,
Expand Down Expand Up @@ -315,6 +311,38 @@
run_cfg=run_cfg_tp1_template,
end_str='<|im_end|>')

# ===== Configs for internlm/internlm2_5-20b-chat =====
# TurboMind-backend evaluation config. The 20B model is sharded across two
# GPUs: the engine template name carries "tp2" and run_cfg requests num_gpus=2.
tb_internlm2_5_20b_chat = dict(
    type=TurboMindModelwithChatTemplate,
    abbr='tb_internlm2_5_20b_chat',
    path='internlm/internlm2_5-20b-chat',
    engine_config=engine_config_template_max_bs_128_tp2,
    gen_config=gen_config_template,
    max_seq_len=MAX_SESSION_LEN,
    max_out_len=MAX_NEW_TOKENS,
    batch_size=128,  # matches the engine template's max_batch_size of 128
    run_cfg=dict(num_gpus=2),
    stop_words=['</s>', '<|im_end|>'],
)

# Quantized variants start as plain copies of the base TurboMind config;
# presumably the 4-bit / kv-int4 / kv-int8 settings are patched in elsewhere
# before use — TODO confirm against the code that consumes these dicts.
tb_internlm2_5_20b_chat_4bits = deepcopy(tb_internlm2_5_20b_chat)
tb_internlm2_5_20b_chat_kvint4 = deepcopy(tb_internlm2_5_20b_chat)
tb_internlm2_5_20b_chat_kvint8 = deepcopy(tb_internlm2_5_20b_chat)

# PyTorch-backend counterpart: smaller batch (64) with matching concurrency,
# same two-GPU tensor-parallel run config via run_cfg_tp2_template.
pt_internlm2_5_20b_chat = dict(
    type=LmdeployPytorchModel,
    abbr='pt_internlm2_5_20b_chat',
    path='internlm/internlm2_5-20b-chat',
    engine_config=pt_engine_config_template_max_bs_64_tp2,
    gen_config=gen_config_template,
    max_out_len=MAX_NEW_TOKENS,
    max_seq_len=MAX_SESSION_LEN,
    batch_size=64,
    concurrency=64,
    meta_template=internlm2_meta_template,
    run_cfg=run_cfg_tp2_template,
    end_str='<|im_end|>')

# ===== Configs for internlm/internlm2_chat_20b =====
tb_internlm2_chat_20b = dict(
type=TurboMindModelwithChatTemplate,
Expand Down Expand Up @@ -473,6 +501,38 @@
tb_llama_3_8b_instruct_kvint4 = deepcopy(tb_llama_3_8b_instruct)
tb_llama_3_8b_instruct_kvint8 = deepcopy(tb_llama_3_8b_instruct)

# ===== Configs for meta-llama/Meta-Llama-3.1-8B-Instruct =====
# TurboMind-backend evaluation config for Llama 3.1 8B Instruct on one GPU.
tb_llama_3d1_8b_instruct = dict(
    type=TurboMindModelwithChatTemplate,
    abbr='tb_llama_3d1_8b_instruct',
    # Fixed: the Hugging Face repo id is "Meta-Llama-3.1-8B-Instruct" (dot,
    # not hyphen, between 3 and 1); the previous "Meta-Llama-3-1-8B-Instruct"
    # path does not exist on the hub, so model resolution would fail.
    path='meta-llama/Meta-Llama-3.1-8B-Instruct',
    engine_config=engine_config_template_max_bs_128,
    gen_config=gen_config_template,
    max_seq_len=MAX_SESSION_LEN,
    max_out_len=MAX_NEW_TOKENS,
    batch_size=128,  # matches the engine template's max_batch_size of 128
    run_cfg=dict(num_gpus=1),
    stop_words=['<|eot_id|>', '<|end_of_text|>'],
)

# PyTorch-backend counterpart (single GPU, batch 128).
pt_llama_3d1_8b_instruct = dict(
    type=LmdeployPytorchModel,
    abbr='pt_llama_3d1_8b_instruct',
    path='meta-llama/Meta-Llama-3.1-8B-Instruct',  # fixed repo id, see above
    engine_config=pt_engine_config_template_max_bs_128,
    gen_config=gen_config_template,
    max_out_len=MAX_NEW_TOKENS,
    max_seq_len=MAX_SESSION_LEN,
    batch_size=128,
    concurrency=128,
    meta_template=llama3_meta_template,
    run_cfg=run_cfg_tp1_template,
    # NOTE(review): '[INST]' is a Llama-2-style marker; Llama-3.x chat
    # normally terminates on '<|eot_id|>' (as in the TurboMind config above).
    # Left unchanged because other pt_* configs may rely on this convention —
    # confirm against LmdeployPytorchModel's end_str handling.
    end_str='[INST]')

# Quantized variants start as copies of the base TurboMind config; presumably
# the 4-bit / kv-int4 / kv-int8 settings are applied elsewhere — TODO confirm.
tb_llama_3d1_8b_instruct_4bits = deepcopy(tb_llama_3d1_8b_instruct)
tb_llama_3d1_8b_instruct_kvint4 = deepcopy(tb_llama_3d1_8b_instruct)
tb_llama_3d1_8b_instruct_kvint8 = deepcopy(tb_llama_3d1_8b_instruct)

# ===== Configs for Qwen/Qwen2-7B-Instruct =====
tb_qwen2_7b_instruct = dict(
type=TurboMindModelwithChatTemplate,
Expand Down
42 changes: 0 additions & 42 deletions .github/scripts/set_benchmark_param.sh

This file was deleted.

Loading
Loading