From 693e6331d914f58a6f8f3056aeadb1981874cff4 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 25 Sep 2024 20:27:57 +0800 Subject: [PATCH 1/8] updaste --- .github/scripts/eval_stable_config.py | 70 +++++++++++++++++++++++++++ .github/workflows/stable.yml | 18 +++++-- autotest/config.yaml | 6 +++ 3 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 .github/scripts/eval_stable_config.py diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py new file mode 100644 index 0000000000..9f5ff55e8a --- /dev/null +++ b/.github/scripts/eval_stable_config.py @@ -0,0 +1,70 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \ + ceval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \ + gpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ + ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_gen_69ee4f import \ + race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ + winogrande_datasets # noqa: F401, E501 + from opencompass.configs.summarizers.medium import \ + summarizer # noqa: F401, E501 + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + abbr='lmdeploy-api-test', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23333/v1', + path='internlm2_20b_api', + tokenizer_path='internlm/internlm2_5-20b-chat', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=50, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + retry=3, + ) +] diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 0bfd32d4ee..8f45b9d74c 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -120,6 +120,12 @@ jobs: run: | python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . + cd .. 
- name: Check env run: | python3 -m pip list @@ -130,16 +136,16 @@ jobs: CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/${{matrix.model}} --tp 2 --max-batch-size 256 --cache-max-entry-count 0.9 --server-port 23344 > ${{env.REPORT_DIR}}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" sleep 120s + - name: Run OC result + if: always() + run: | + opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details -w ${{env.REPORT_DIR}} - name: Test lmdeploy - restful api run: | python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log - - name: Kill api server - if: always() - run: | - kill -15 "$restful_pid" - name: Attach result if: always() run: | @@ -147,6 +153,10 @@ jobs: python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-1.csv python3 .github/scripts/action_tools.py 
add_summary ${{env.REPORT_DIR}}/stable-internal-2.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-3.csv + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" - name: Clear workfile if: always() run: | diff --git a/autotest/config.yaml b/autotest/config.yaml index 152bfdeca5..07505718c6 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -42,6 +42,7 @@ turbomind_chat_model: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ - Qwen/Qwen-VL-Chat + - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 @@ -79,6 +80,8 @@ pytorch_chat_model: - Qwen/Qwen2-1.5B-Instruct - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 @@ -120,6 +123,8 @@ vl_model: - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - OpenGVLab/InternVL2-40B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - internlm/internlm-xcomposer2-vl-7b - internlm/internlm-xcomposer2d5-7b - internlm/internlm-xcomposer2-4khd-7b @@ -152,6 +157,7 @@ turbomind_quatization: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen-VL-Chat - liuhaotian/llava-v1.5-13b - liuhaotian/llava-v1.6-vicuna-7b From 24cc94421fce03849313643991efa51e5742057a Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Wed, 25 Sep 2024 20:34:46 +0800 Subject: [PATCH 2/8] update --- .github/scripts/eval_stable_config.py | 24 ++++++++++++++++++++---- .github/workflows/stable.yml | 3 ++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py index 9f5ff55e8a..486457e5cf 100644 --- a/.github/scripts/eval_stable_config.py +++ b/.github/scripts/eval_stable_config.py @@ -31,14 
+31,28 @@ nq_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_gen_69ee4f import \ race_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \ + alignbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ + alpacav2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ + compassarena_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ + fofo_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \ + followbench_llmeval_dataset # noqa: F401, E501 + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ + mtbench101_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ + wildbench_datasets # noqa: F401, E501 from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ - TheoremQA_datasets # noqa: F401, E501 + TheoremQA_datasets # noqa: F401, E501 # noqa: F401, E501 from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ - triviaqa_datasets # noqa: F401, E501 + triviaqa_datasets # noqa: F401, E501 # noqa: F401, E501 from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ winogrande_datasets # noqa: F401, E501 - from opencompass.configs.summarizers.medium import \ - summarizer # noqa: F401, E501 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) @@ -68,3 +82,5 @@ retry=3, ) ] + +judge_models = models diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 8f45b9d74c..1a60376a8a 100644 --- 
a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -33,6 +33,7 @@ env: REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} + COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset/data jobs: linux-build: @@ -137,7 +138,7 @@ jobs: echo "restful_pid=$!" >> "$GITHUB_ENV" sleep 120s - name: Run OC result - if: always() + continue-on-error: true run: | opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details -w ${{env.REPORT_DIR}} - name: Test lmdeploy - restful api From 8a4c303987057e7cfa8b13797d261f6af146605d Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 26 Sep 2024 19:14:32 +0800 Subject: [PATCH 3/8] update --- .github/scripts/eval_stable_config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py index 486457e5cf..a39e023548 100644 --- a/.github/scripts/eval_stable_config.py +++ b/.github/scripts/eval_stable_config.py @@ -41,8 +41,6 @@ compassarena_datasets # noqa: F401, E501 from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ fofo_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \ - followbench_llmeval_dataset # noqa: F401, E501 from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ mtbench101_datasets # noqa: F401, E501 from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ From c988a273662eaf5af60d64d994100f1d15a36acf Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 26 Sep 2024 19:41:21 +0800 Subject: [PATCH 4/8] update --- .github/scripts/eval_stable_config.py | 2 +- .github/workflows/stable.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/eval_stable_config.py 
b/.github/scripts/eval_stable_config.py index a39e023548..dec667e2bb 100644 --- a/.github/scripts/eval_stable_config.py +++ b/.github/scripts/eval_stable_config.py @@ -27,7 +27,7 @@ sanitized_mbpp_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \ mmlu_datasets # noqa: F401, E501 - from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \ + from opencompass.configs.datasets.nq.nq_gen_3dcea1 import \ nq_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_gen_69ee4f import \ race_datasets # noqa: F401, E501 diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 1a60376a8a..110fb5c3fb 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -33,7 +33,7 @@ env: REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} - COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset/data + COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset jobs: linux-build: From f1d2a46fd25cc5e221ff44bf4f3fb959a3abc865 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 26 Sep 2024 20:03:43 +0800 Subject: [PATCH 5/8] update --- .github/scripts/eval_stable_config.py | 6 +++--- .github/workflows/stable.yml | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py index dec667e2bb..57d3bb15e1 100644 --- a/.github/scripts/eval_stable_config.py +++ b/.github/scripts/eval_stable_config.py @@ -67,9 +67,9 @@ abbr='lmdeploy-api-test', type=OpenAISDK, key='EMPTY', - openai_api_base='http://localhost:23333/v1', - path='internlm2_20b_api', - tokenizer_path='internlm/internlm2_5-20b-chat', + openai_api_base='http://localhost:23344/v1', + path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + 
tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', rpm_verbose=True, meta_template=api_meta_template, query_per_second=50, diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 110fb5c3fb..8273cabd93 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -24,7 +24,7 @@ on: type: string default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath' schedule: - - cron: '00 8 * * 1,5' + - cron: '00 8 * * 1' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -140,7 +140,8 @@ jobs: - name: Run OC result continue-on-error: true run: | - opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details -w ${{env.REPORT_DIR}} + ln -s /nvme/qa_test_models/dataset/data . + opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-oc - name: Test lmdeploy - restful api run: | python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log From 2c72cf5ee2118088454583d4424210179dfde5ea Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Thu, 26 Sep 2024 20:28:40 +0800 Subject: [PATCH 6/8] update --- .github/workflows/stable.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 8273cabd93..0e4dce924e 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -22,7 +22,7 @@ on: required: true description: 'Dependency packages, you can also set a specific version' type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath' + default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5' schedule: - cron: '00 
8 * * 1' @@ -32,7 +32,7 @@ env: OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} + dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5'}} COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset jobs: From 10cc2839a73fb11ed9bde30ef35b9ef7757c03d8 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Fri, 27 Sep 2024 13:09:02 +0800 Subject: [PATCH 7/8] update --- ...config.py => eval_stable_object_config.py} | 22 ------ .github/scripts/eval_stable_subject_config.py | 70 +++++++++++++++++++ .github/workflows/stable.yml | 11 ++- 3 files changed, 80 insertions(+), 23 deletions(-) rename .github/scripts/{eval_stable_config.py => eval_stable_object_config.py} (64%) create mode 100644 .github/scripts/eval_stable_subject_config.py diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_object_config.py similarity index 64% rename from .github/scripts/eval_stable_config.py rename to .github/scripts/eval_stable_object_config.py index 57d3bb15e1..53e46b87ae 100644 --- a/.github/scripts/eval_stable_config.py +++ b/.github/scripts/eval_stable_object_config.py @@ -31,26 +31,6 @@ nq_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_gen_69ee4f import \ race_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \ - alignbench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ - alpacav2_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ - arenahard_datasets # noqa: F401, E501 - from 
opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ - compassarena_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ - fofo_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ - mtbench101_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ - wildbench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ - TheoremQA_datasets # noqa: F401, E501 # noqa: F401, E501 - from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ - triviaqa_datasets # noqa: F401, E501 # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ - winogrande_datasets # noqa: F401, E501 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) @@ -80,5 +60,3 @@ retry=3, ) ] - -judge_models = models diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py new file mode 100644 index 0000000000..9517fe384d --- /dev/null +++ b/.github/scripts/eval_stable_subject_config.py @@ -0,0 +1,70 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \ + alignbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ + alpacav2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ + compassarena_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ + fofo_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ + mtbench101_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ + wildbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ + triviaqa_datasets # noqa: F401, E501 # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ + winogrande_datasets # noqa: F401, E501 + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + abbr='lmdeploy-api-test', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23344/v1', + path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=50, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + retry=3, + ) +] + +judge_models = models + +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 0e4dce924e..97a9df7826 100644 --- 
a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -141,13 +141,20 @@ jobs: continue-on-error: true run: | ln -s /nvme/qa_test_models/dataset/data . - opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-oc + opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-1 + opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-1 + opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-2 + opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-2 + opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-3 + opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-3 - name: Test lmdeploy - restful api run: | python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log python3 
/nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-4.csv > ${{env.REPORT_DIR}}/stable-internal-4.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-5.csv > ${{env.REPORT_DIR}}/stable-internal-5.log - name: Attach result if: always() run: | @@ -155,6 +162,8 @@ jobs: python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-1.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-2.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-3.csv + python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-4.csv + python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-5.csv - name: Kill api server if: always() run: | From 49cf2b80355f4531af457555d88d0b9d1ecdebe5 Mon Sep 17 00:00:00 2001 From: zhulin1 Date: Sun, 29 Sep 2024 10:13:07 +0800 Subject: [PATCH 8/8] update --- .github/scripts/eval_stable_subject_config.py | 4 +++- .github/workflows/daily_ete_test.yml | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py index 9517fe384d..abcfba3db4 100644 --- a/.github/scripts/eval_stable_subject_config.py +++
b/.github/scripts/eval_stable_subject_config.py @@ -27,7 +27,9 @@ from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ winogrande_datasets # noqa: F401, E501 -datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) +datasets = sum((v for k, v in locals().items() + if k.endswith('_datasets') and 'wildbench' not in k), []) +datasets += wildbench_datasets api_meta_template = dict( round=[ diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 063bf1a9e0..16202656ec 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -32,7 +32,7 @@ on: required: true description: 'Dependency packages, you can also set a specific version' type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq mmengine-lite==0.10.5' + default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5' regression_func: required: true description: 'regression functions' @@ -43,7 +43,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq mmengine-lite==0.10.5'}} + dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5'}} HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true