From 693e6331d914f58a6f8f3056aeadb1981874cff4 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Wed, 25 Sep 2024 20:27:57 +0800
Subject: [PATCH 1/8] update

---
 .github/scripts/eval_stable_config.py | 70 +++++++++++++++++++++++++++
 .github/workflows/stable.yml          | 18 +++++--
 autotest/config.yaml                  |  6 +++
 3 files changed, 90 insertions(+), 4 deletions(-)
 create mode 100644 .github/scripts/eval_stable_config.py

diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py
new file mode 100644
index 0000000000..9f5ff55e8a
--- /dev/null
+++ b/.github/scripts/eval_stable_config.py
@@ -0,0 +1,70 @@
+from mmengine.config import read_base
+from opencompass.models import OpenAISDK
+
+with read_base():
+    # choose a list of datasets
+    from opencompass.configs.datasets.bbh.bbh_gen_2879b0 import \
+        bbh_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \
+        ceval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \
+        cmmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
+        GaokaoBench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import \
+        gpqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import \
+        gsm8k_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
+        hellaswag_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \
+        humaneval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \
+        ifeval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.math.math_0shot_gen_393424 import \
+        math_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_830460 import \
+        sanitized_mbpp_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
+        mmlu_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
+        nq_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.race.race_gen_69ee4f import \
+        race_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
+        TheoremQA_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
+        triviaqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
+        winogrande_datasets  # noqa: F401, E501
+    from opencompass.configs.summarizers.medium import \
+        summarizer  # noqa: F401, E501
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+models = [
+    dict(
+        abbr='lmdeploy-api-test',
+        type=OpenAISDK,
+        key='EMPTY',
+        openai_api_base='http://localhost:23333/v1',
+        path='internlm2_20b_api',
+        tokenizer_path='internlm/internlm2_5-20b-chat',
+        rpm_verbose=True,
+        meta_template=api_meta_template,
+        query_per_second=50,
+        max_out_len=1024,
+        max_seq_len=4096,
+        temperature=0.01,
+        batch_size=128,
+        retry=3,
+    )
+]
diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml
index 0bfd32d4ee..8f45b9d74c 100644
--- a/.github/workflows/stable.yml
+++ b/.github/workflows/stable.yml
@@ -120,6 +120,12 @@ jobs:
         run: |
           python3 -m pip install /nvme/qa_test_models/offline_pkg/py310/lmdeploy-*.whl --no-deps
           python3 -m pip install -r requirements/test.txt
+      - name: Install opencompass
+        run: |
+          git clone --depth=1 https://github.com/open-compass/opencompass.git
+          cd opencompass
+          python3 -m pip install -e .
+          cd ..
       - name: Check env
         run: |
           python3 -m pip list
@@ -130,16 +136,16 @@ jobs:
           CUDA_VISIBLE_DEVICES=6,7 lmdeploy serve api_server /nvme/qa_test_models/${{matrix.model}} --tp 2 --max-batch-size 256 --cache-max-entry-count 0.9 --server-port 23344 > ${{env.REPORT_DIR}}/restful.log 2>&1  &
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 120s
+      - name: Run OC result
+        if: always()
+        run: |
+          opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details -w ${{env.REPORT_DIR}}
       - name: Test lmdeploy - restful api
         run: |
           python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log
           python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log
           python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log
           python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log
-      - name: Kill api server
-        if: always()
-        run: |
-          kill -15 "$restful_pid"
       - name: Attach result
         if: always()
         run: |
@@ -147,6 +153,10 @@ jobs:
           python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-1.csv
           python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-2.csv
           python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-3.csv
+      - name: Kill api server
+        if: always()
+        run: |
+          kill -15 "$restful_pid"
       - name: Clear workfile
         if: always()
         run: |
diff --git a/autotest/config.yaml b/autotest/config.yaml
index 152bfdeca5..07505718c6 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -42,6 +42,7 @@ turbomind_chat_model:
     - Qwen/Qwen1.5-7B-Chat
     - Qwen/Qwen1.5-4B-Chat-AWQ
     - Qwen/Qwen-VL-Chat
+    - Qwen/Qwen2.5-7B-Instruct
     - Qwen/Qwen2-7B-Instruct-GPTQ-Int4
     - mistralai/Mistral-7B-Instruct-v0.1
     - mistralai/Mistral-7B-Instruct-v0.2
@@ -79,6 +80,8 @@ pytorch_chat_model:
     - Qwen/Qwen2-1.5B-Instruct
     - Qwen/Qwen1.5-7B-Chat
     - Qwen/Qwen1.5-MoE-A2.7B-Chat
+    - Qwen/Qwen2-VL-2B-Instruct
+    - Qwen/Qwen2-VL-7B-Instruct
     - mistralai/Mistral-7B-Instruct-v0.1
     - mistralai/Mistral-7B-Instruct-v0.2
     - mistralai/Mixtral-8x7B-Instruct-v0.1
@@ -120,6 +123,8 @@ vl_model:
     - OpenGVLab/InternVL2-8B
     - OpenGVLab/InternVL2-26B
     - OpenGVLab/InternVL2-40B
+    - Qwen/Qwen2-VL-2B-Instruct
+    - Qwen/Qwen2-VL-7B-Instruct
     - internlm/internlm-xcomposer2-vl-7b
     - internlm/internlm-xcomposer2d5-7b
     - internlm/internlm-xcomposer2-4khd-7b
@@ -152,6 +157,7 @@ turbomind_quatization:
         - Qwen/Qwen1.5-7B-Chat
         - Qwen/Qwen2-7B-Instruct
         - Qwen/Qwen2-1.5B-Instruct
+        - Qwen/Qwen2.5-7B-Instruct
         - Qwen/Qwen-VL-Chat
         - liuhaotian/llava-v1.5-13b
         - liuhaotian/llava-v1.6-vicuna-7b

From 24cc94421fce03849313643991efa51e5742057a Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Wed, 25 Sep 2024 20:34:46 +0800
Subject: [PATCH 2/8] update

---
 .github/scripts/eval_stable_config.py | 24 ++++++++++++++++++++----
 .github/workflows/stable.yml          |  3 ++-
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py
index 9f5ff55e8a..486457e5cf 100644
--- a/.github/scripts/eval_stable_config.py
+++ b/.github/scripts/eval_stable_config.py
@@ -31,14 +31,28 @@
         nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_gen_69ee4f import \
         race_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \
+        alignbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
+        alpacav2_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
+        arenahard_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
+        compassarena_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
+        fofo_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
+        followbench_llmeval_dataset  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
+        mtbench101_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
+        wildbench_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
-        TheoremQA_datasets  # noqa: F401, E501
+        TheoremQA_datasets  # noqa: F401, E501 # noqa: F401, E501
     from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
-        triviaqa_datasets  # noqa: F401, E501
+        triviaqa_datasets  # noqa: F401, E501 # noqa: F401, E501
     from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
         winogrande_datasets  # noqa: F401, E501
-    from opencompass.configs.summarizers.medium import \
-        summarizer  # noqa: F401, E501
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
 
@@ -68,3 +82,5 @@
         retry=3,
     )
 ]
+
+judge_models = models
diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml
index 8f45b9d74c..1a60376a8a 100644
--- a/.github/workflows/stable.yml
+++ b/.github/workflows/stable.yml
@@ -33,6 +33,7 @@ env:
   REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}}
+  COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset/data
 
 jobs:
   linux-build:
@@ -137,7 +138,7 @@ jobs:
           echo "restful_pid=$!" >> "$GITHUB_ENV"
           sleep 120s
       - name: Run OC result
-        if: always()
+        continue-on-error: true
         run: |
           opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details -w ${{env.REPORT_DIR}}
       - name: Test lmdeploy - restful api

From 8a4c303987057e7cfa8b13797d261f6af146605d Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 26 Sep 2024 19:14:32 +0800
Subject: [PATCH 3/8] update

---
 .github/scripts/eval_stable_config.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py
index 486457e5cf..a39e023548 100644
--- a/.github/scripts/eval_stable_config.py
+++ b/.github/scripts/eval_stable_config.py
@@ -41,8 +41,6 @@
         compassarena_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
         fofo_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval import \
-        followbench_llmeval_dataset  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
         mtbench101_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \

From c988a273662eaf5af60d64d994100f1d15a36acf Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 26 Sep 2024 19:41:21 +0800
Subject: [PATCH 4/8] update

---
 .github/scripts/eval_stable_config.py | 2 +-
 .github/workflows/stable.yml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py
index a39e023548..dec667e2bb 100644
--- a/.github/scripts/eval_stable_config.py
+++ b/.github/scripts/eval_stable_config.py
@@ -27,7 +27,7 @@
         sanitized_mbpp_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import \
         mmlu_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
+    from opencompass.configs.datasets.nq.nq_gen_3dcea1 import \
         nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_gen_69ee4f import \
         race_datasets  # noqa: F401, E501
diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml
index 1a60376a8a..110fb5c3fb 100644
--- a/.github/workflows/stable.yml
+++ b/.github/workflows/stable.yml
@@ -33,7 +33,7 @@ env:
   REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
   dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}}
-  COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset/data
+  COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset
 
 jobs:
   linux-build:

From f1d2a46fd25cc5e221ff44bf4f3fb959a3abc865 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 26 Sep 2024 20:03:43 +0800
Subject: [PATCH 5/8] update

---
 .github/scripts/eval_stable_config.py | 6 +++---
 .github/workflows/stable.yml          | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_config.py
index dec667e2bb..57d3bb15e1 100644
--- a/.github/scripts/eval_stable_config.py
+++ b/.github/scripts/eval_stable_config.py
@@ -67,9 +67,9 @@
         abbr='lmdeploy-api-test',
         type=OpenAISDK,
         key='EMPTY',
-        openai_api_base='http://localhost:23333/v1',
-        path='internlm2_20b_api',
-        tokenizer_path='internlm/internlm2_5-20b-chat',
+        openai_api_base='http://localhost:23344/v1',
+        path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat',
+        tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat',
         rpm_verbose=True,
         meta_template=api_meta_template,
         query_per_second=50,
diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml
index 110fb5c3fb..8273cabd93 100644
--- a/.github/workflows/stable.yml
+++ b/.github/workflows/stable.yml
@@ -24,7 +24,7 @@ on:
         type: string
         default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'
   schedule:
-    - cron:  '00 8 * * 1,5'
+    - cron:  '00 8 * * 1'
 
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
@@ -140,7 +140,8 @@ jobs:
       - name: Run OC result
         continue-on-error: true
         run: |
-          opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details -w ${{env.REPORT_DIR}}
+          ln -s /nvme/qa_test_models/dataset/data .
+          opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-oc
       - name: Test lmdeploy - restful api
         run: |
           python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log

From 2c72cf5ee2118088454583d4424210179dfde5ea Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Thu, 26 Sep 2024 20:28:40 +0800
Subject: [PATCH 6/8] update

---
 .github/workflows/stable.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml
index 8273cabd93..0e4dce924e 100644
--- a/.github/workflows/stable.yml
+++ b/.github/workflows/stable.yml
@@ -22,7 +22,7 @@ on:
         required: true
         description: 'Dependency packages, you can also set a specific version'
         type: string
-        default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'
+        default: 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5'
   schedule:
     - cron:  '00 8 * * 1'
 
@@ -32,7 +32,7 @@ env:
   OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
   REPORT_DIR: /nvme/qa_test_models/stable_reports/${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
-  dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath'}}
+  dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib jmespath mmengine-lite==0.10.5'}}
   COMPASS_DATA_CACHE: /nvme/qa_test_models/dataset
 
 jobs:

From 10cc2839a73fb11ed9bde30ef35b9ef7757c03d8 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Fri, 27 Sep 2024 13:09:02 +0800
Subject: [PATCH 7/8] update

---
 ...config.py => eval_stable_object_config.py} | 22 ------
 .github/scripts/eval_stable_subject_config.py | 70 +++++++++++++++++++
 .github/workflows/stable.yml                  | 11 ++-
 3 files changed, 80 insertions(+), 23 deletions(-)
 rename .github/scripts/{eval_stable_config.py => eval_stable_object_config.py} (64%)
 create mode 100644 .github/scripts/eval_stable_subject_config.py

diff --git a/.github/scripts/eval_stable_config.py b/.github/scripts/eval_stable_object_config.py
similarity index 64%
rename from .github/scripts/eval_stable_config.py
rename to .github/scripts/eval_stable_object_config.py
index 57d3bb15e1..53e46b87ae 100644
--- a/.github/scripts/eval_stable_config.py
+++ b/.github/scripts/eval_stable_object_config.py
@@ -31,26 +31,6 @@
         nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_gen_69ee4f import \
         race_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \
-        alignbench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
-        alpacav2_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
-        arenahard_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
-        compassarena_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
-        fofo_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
-        mtbench101_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
-        wildbench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
-        TheoremQA_datasets  # noqa: F401, E501 # noqa: F401, E501
-    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
-        triviaqa_datasets  # noqa: F401, E501 # noqa: F401, E501
-    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
-        winogrande_datasets  # noqa: F401, E501
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
 
@@ -80,5 +60,3 @@
         retry=3,
     )
 ]
-
-judge_models = models
diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py
new file mode 100644
index 0000000000..9517fe384d
--- /dev/null
+++ b/.github/scripts/eval_stable_subject_config.py
@@ -0,0 +1,70 @@
+from mmengine.config import read_base
+from opencompass.models import OpenAISDK
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+
+with read_base():
+    # choose a list of datasets
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \
+        alignbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
+        alpacav2_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
+        arenahard_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
+        compassarena_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import \
+        fofo_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
+        mtbench101_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
+        wildbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
+        TheoremQA_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
+        triviaqa_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
+        winogrande_datasets  # noqa: F401, E501
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+models = [
+    dict(
+        abbr='lmdeploy-api-test',
+        type=OpenAISDK,
+        key='EMPTY',
+        openai_api_base='http://localhost:23344/v1',
+        path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat',
+        tokenizer_path='/nvme/qa_test_models/internlm/internlm2_5-20b-chat',
+        rpm_verbose=True,
+        meta_template=api_meta_template,
+        query_per_second=50,
+        max_out_len=1024,
+        max_seq_len=4096,
+        temperature=0.01,
+        batch_size=128,
+        retry=3,
+    )
+]
+
+judge_models = models
+
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner,
+        models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(type=LocalRunner,
+                max_num_workers=16,
+                task=dict(type=SubjectiveEvalTask)),
+)
diff --git a/.github/workflows/stable.yml b/.github/workflows/stable.yml
index 0e4dce924e..97a9df7826 100644
--- a/.github/workflows/stable.yml
+++ b/.github/workflows/stable.yml
@@ -141,13 +141,20 @@ jobs:
         continue-on-error: true
         run: |
           ln -s /nvme/qa_test_models/dataset/data .
-          opencompass .github/scripts/eval_stable_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-oc
+          opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-1
+          opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-1
+          opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-2
+          opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-2
+          opencompass .github/scripts/eval_stable_object_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-object-3
+          opencompass .github/scripts/eval_stable_subject_config.py --reuse --dump-eval-details --work-dir ${{env.REPORT_DIR}}-subject-3
       - name: Test lmdeploy - restful api
         run: |
           python3 benchmark/profile_restful_api.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json --stream-output True --num-prompts 10000 --csv ${{env.REPORT_DIR}}/stable.csv > ${{env.REPORT_DIR}}/stable.log
           python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-1.csv > ${{env.REPORT_DIR}}/stable-internal-1.log
           python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-2.csv > ${{env.REPORT_DIR}}/stable-internal-2.log
           python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-3.csv > ${{env.REPORT_DIR}}/stable-internal-3.log
+          python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-4.csv > ${{env.REPORT_DIR}}/stable-internal-4.log
+          python3 /nvme/qa_test_models/offline_pkg/profile_restful_api_internal.py localhost:23344 /nvme/qa_test_models/${{matrix.model}} /nvme/qa_test_models/datasets/Mixed.json --stream-output True --num-prompts 100000 --csv ${{env.REPORT_DIR}}/stable-internal-5.csv > ${{env.REPORT_DIR}}/stable-internal-5.log
       - name: Attach result
         if: always()
         run: |
@@ -155,6 +162,8 @@ jobs:
           python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-1.csv
           python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-2.csv
           python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-3.csv
+          python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-4.csv
+          python3 .github/scripts/action_tools.py add_summary ${{env.REPORT_DIR}}/stable-internal-5.csv
       - name: Kill api server
         if: always()
         run: |

From 49cf2b80355f4531af457555d88d0b9d1ecdebe5 Mon Sep 17 00:00:00 2001
From: zhulin1 <zhulin1@pjlab.org.cn>
Date: Sun, 29 Sep 2024 10:13:07 +0800
Subject: [PATCH 8/8] update

---
 .github/scripts/eval_stable_subject_config.py | 4 +++-
 .github/workflows/daily_ete_test.yml          | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py
index 9517fe384d..abcfba3db4 100644
--- a/.github/scripts/eval_stable_subject_config.py
+++ b/.github/scripts/eval_stable_subject_config.py
@@ -27,7 +27,9 @@
     from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
         winogrande_datasets  # noqa: F401, E501
 
-datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+datasets = sum((v for k, v in locals().items()
+                if k.endswith('_datasets') and 'wildbench' not in k), [])
+datasets += wildbench_datasets
 
 api_meta_template = dict(
     round=[
diff --git a/.github/workflows/daily_ete_test.yml b/.github/workflows/daily_ete_test.yml
index 063bf1a9e0..16202656ec 100644
--- a/.github/workflows/daily_ete_test.yml
+++ b/.github/workflows/daily_ete_test.yml
@@ -32,7 +32,7 @@ on:
         required: true
         description: 'Dependency packages, you can also set a specific version'
         type: string
-        default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq mmengine-lite==0.10.5'
+        default: 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5'
       regression_func:
         required: true
         description: 'regression functions'
@@ -43,7 +43,7 @@ on:
 
 env:
   HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
-  dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq mmengine-lite==0.10.5'}}
+  dependency_pkgs: ${{inputs.dependency_pkgs || 'packaging transformers_stream_generator transformers datasets matplotlib openai attrdict timm modelscope jmespath decord auto_gptq qwen_vl_utils mmengine-lite==0.10.5'}}
   HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
   OUTPUT_FOLDER: cuda11.8_dist_${{ github.run_id }}
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true