diff --git a/.github/scripts/eval_stable_object_config.py b/.github/scripts/eval_stable_object_config.py new file mode 100644 index 0000000000..53e46b87ae --- /dev/null +++ b/.github/scripts/eval_stable_object_config.py @@ -0,0 +1,62 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import \ + bbh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \ + ceval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \ + cmmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ + GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \ + gpqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \ + gsm8k_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \ + humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ + ifeval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ + math_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \ + sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \ + mmlu_datasets # noqa: F401, E501 + from opencompass.configs.datasets.nq.nq_gen_3dcea1 import \ + nq_datasets # noqa: F401, E501 + from opencompass.configs.datasets.race.race_gen_69ee4f import \ + race_datasets # noqa: F401, E501 + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) + +api_meta_template = dict( 
+ round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + abbr='lmdeploy-api-test', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23344/v1', + path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=50, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + retry=3, + ) +] diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py new file mode 100644 index 0000000000..abcfba3db4 --- /dev/null +++ b/.github/scripts/eval_stable_subject_config.py @@ -0,0 +1,72 @@ +from mmengine.config import read_base +from opencompass.models import OpenAISDK +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + +with read_base(): + # choose a list of datasets + from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \ + alignbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ + alpacav2_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ + arenahard_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ + compassarena_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \ + fofo_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ + mtbench101_datasets # noqa: F401, E501 + from 
opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ + wildbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \ + TheoremQA_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \ + triviaqa_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ + winogrande_datasets # noqa: F401, E501 + +datasets = sum((v for k, v in locals().items() + if k.endswith('_datasets') and 'wildbench' not in k), []) +datasets += wildbench_datasets + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + abbr='lmdeploy-api-test', + type=OpenAISDK, + key='EMPTY', + openai_api_base='http://localhost:23344/v1', + path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=50, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=128, + retry=3, + ) +] + +judge_models = models + +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=SubjectiveEvalTask)), +) diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml index 063bf1a9e0..16202656ec 100644 --- a/.github/workflows/daily_ete_test.yml +++ b/.github/workflows/daily_ete_test.yml @@ -32,7 +32,7 @@ on: required: true description: 'Dependency packages, you can also set a specific version' type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope 
jmespath decord auto_gptq mmengine-lite==0.10.5' + default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5' regression_func: required: true description: 'regression functions' @@ -43,7 +43,7 @@ on: env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq mmengine-lite==0.10.5'}} + dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5'}} HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml index 0bfd32d4ee..97a9df7826 100644 --- a/.github/workflows/stable.yml +++ b/.github/workflows/stable.yml @@ -22,9 +22,9 @@ on: required: true description: 'Dependency packages, you can also set a specific version' type: string - default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath' + default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5' schedule: - - cron: '00 8 * * 1,5' + - cron: '00 8 * * 1' env: HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache @@ -32,7 +32,8 @@ env: OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }} REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }} ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true - dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}} + dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers 
datasets matplotlib jmespath mmengine-lite==0.10.5'}} + COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset jobs: linux-build: @@ -120,6 +121,12 @@ jobs: run: | python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps python3 -m pip install -r requirements/test.txt + - name: Install opencompass + run: | + git clone --depth=1 https://github.com/open-compass/opencompass.git + cd opencompass + python3 -m pip install -e . + cd .. - name: Check env run: | python3 -m pip list @@ -130,16 +137,24 @@ jobs: CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/${{matrix.model}} --tp 2 --max-batch-size 256 --cache-max-entry-count 0.9 --server-port 23344 > ${{env.REPORT_DIR}}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" sleep 120s + - name: Run OC result + continue-on-error: true + run: | + ln -s /nvme/qa_test_models/dataset/data . + opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-1 + opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-1 + opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-2 + opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-2 + opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-3 + opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-3 - name: Test lmdeploy - restful api run: | python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log python3 
/nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log - - name: Kill api server - if: always() - run: | - kill -15 "$restful_pid" + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-4.csv > ${{env.REPORT_DIR}}/stable-internal-4.log + python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-5.csv > ${{env.REPORT_DIR}}/stable-internal-5.log - name: Attach result if: always() run: | @@ -147,6 +162,12 @@ jobs: python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-1.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-2.csv python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-3.csv + python3 .github/scripts/action_tools.py add_summary 
${{env.REPORT_DIR}}/stable-internal-4.csv + python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-5.csv + - name: Kill api server + if: always() + run: | + kill -15 "$restful_pid" - name: Clear workfile if: always() run: | diff --git a/autotest/config.yaml b/autotest/config.yaml index 152bfdeca5..07505718c6 100644 --- a/autotest/config.yaml +++ b/autotest/config.yaml @@ -42,6 +42,7 @@ turbomind_chat_model: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-4B-Chat-AWQ - Qwen/Qwen-VL-Chat + - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen2-7B-Instruct-GPTQ-Int4 - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 @@ -79,6 +80,8 @@ pytorch_chat_model: - Qwen/Qwen2-1.5B-Instruct - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen1.5-MoE-A2.7B-Chat + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - mistralai/Mistral-7B-Instruct-v0.1 - mistralai/Mistral-7B-Instruct-v0.2 - mistralai/Mixtral-8x7B-Instruct-v0.1 @@ -120,6 +123,8 @@ vl_model: - OpenGVLab/InternVL2-8B - OpenGVLab/InternVL2-26B - OpenGVLab/InternVL2-40B + - Qwen/Qwen2-VL-2B-Instruct + - Qwen/Qwen2-VL-7B-Instruct - internlm/internlm-xcomposer2-vl-7b - internlm/internlm-xcomposer2d5-7b - internlm/internlm-xcomposer2-4khd-7b @@ -152,6 +157,7 @@ turbomind_quatization: - Qwen/Qwen1.5-7B-Chat - Qwen/Qwen2-7B-Instruct - Qwen/Qwen2-1.5B-Instruct + - Qwen/Qwen2.5-7B-Instruct - Qwen/Qwen-VL-Chat - liuhaotian/llava-v1.5-13b - liuhaotian/llava-v1.6-vicuna-7b