diff --git a/.github/workflows/nightly-eval.yml b/.github/workflows/nightly-eval.yml index 809120c0c18..7b77c63a54c 100644 --- a/.github/workflows/nightly-eval.yml +++ b/.github/workflows/nightly-eval.yml @@ -27,14 +27,14 @@ jobs: bash scripts/ci_install_dependency.sh pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus" - - name: Test human eval + - name: Test gsm8k timeout-minutes: 120 run: | cd test/srt - python3 test_nightly_human_eval.py + python3 test_nightly_gsm8k_eval.py - - name: Test gsm8k + - name: Test human eval timeout-minutes: 120 run: | cd test/srt - python3 test_nightly_gsm8k_eval.py + python3 test_nightly_human_eval.py diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 5aba58d7a5a..d673d59ff54 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -439,18 +439,22 @@ def popen_launch_server( process = subprocess.Popen(command, stdout=None, stderr=None, env=env) start_time = time.time() - while time.time() - start_time < timeout: - try: - headers = { - "Content-Type": "application/json; charset=utf-8", - "Authorization": f"Bearer {api_key}", - } - response = requests.get(f"{base_url}/health_generate", headers=headers) - if response.status_code == 200: - return process - except requests.RequestException: - pass - time.sleep(10) + with requests.Session() as session: + while time.time() - start_time < timeout: + try: + headers = { + "Content-Type": "application/json; charset=utf-8", + "Authorization": f"Bearer {api_key}", + } + response = session.get( + f"{base_url}/health_generate", + headers=headers, + ) + if response.status_code == 200: + return process + except requests.RequestException: + pass + time.sleep(10) raise TimeoutError("Server failed to start within the timeout period.") diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index ede25b1d4b2..7c208e84b93 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -1,6 +1,8 @@ import json import os +import subprocess import unittest +import warnings from datetime import datetime from types import SimpleNamespace @@ -18,23 +20,23 @@ ) MODEL_SCORE_THRESHOLDS = { - "meta-llama/Llama-3.1-8B-Instruct": 0.8316, - "mistralai/Mistral-7B-Instruct-v0.3": 0.5861, - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672, - "google/gemma-2-27b-it": 0.9227, - "meta-llama/Llama-3.1-70B-Instruct": 0.9623, - "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415, - "Qwen/Qwen2-57B-A14B-Instruct": 0.8791, - "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672, - "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544, - "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356, - "neuralmagic/gemma-2-2b-it-FP8": 0.6059, - "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504, - "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138, - "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504, - "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197, - "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395, - "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435, + "meta-llama/Llama-3.1-8B-Instruct": 0.83, + "mistralai/Mistral-7B-Instruct-v0.3": 0.58, + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84, + "google/gemma-2-27b-it": 0.92, + "meta-llama/Llama-3.1-70B-Instruct": 0.96, + "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64, + "Qwen/Qwen2-57B-A14B-Instruct": 0.87, + "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84, + "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54, + "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83, + "neuralmagic/gemma-2-2b-it-FP8": 0.60, + "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95, + "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61, + "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95, + "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82, + "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84, + "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84, } @@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2): base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=other_args, + return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL), ) return process @@ -132,6 +135,9 @@ def tearDown(self): kill_child_process(self.process.pid, include_self=True) def test_mgsm_en_all_models(self): + warnings.filterwarnings( + "ignore", category=ResourceWarning, message="unclosed.*socket" + ) is_first = True all_results = []