Commit 9478965
eval: support nightly human eval
zhyncs committed Nov 1, 2024
1 parent 16eb33f commit 9478965
Showing 2 changed files with 129 additions and 3 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/nightly-eval.yml
@@ -25,9 +25,10 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
+          pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-      - name: Nightly gsm8k Accuracy
-        timeout-minutes: 60
+      - name: Nightly gsm8k and human eval Accuracy
+        timeout-minutes: 120
         run: |
           cd test/srt
           python3 test_nightly_gsm8k_eval.py
+          python3 test_nightly_human_eval.py
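
The updated step installs evalplus from source and runs both evaluations under a single 120-minute budget. A minimal sketch of reproducing the run step locally, assuming a repository checkout where scripts/ci_install_dependency.sh has already been run:

    import subprocess

    # Run both nightly evaluations from test/srt, capping each at the
    # workflow's 120-minute budget (script names taken from the diff above).
    for script in ["test_nightly_gsm8k_eval.py", "test_nightly_human_eval.py"]:
        subprocess.run(["python3", script], cwd="test/srt", timeout=120 * 60, check=True)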
125 changes: 125 additions & 0 deletions test/srt/test_nightly_human_eval.py
@@ -0,0 +1,125 @@
import os
import shutil
import signal
import subprocess
import unittest
from types import SimpleNamespace

from test_nightly_gsm8k_eval import parse_models

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
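
# NOTE: parse_models is defined in test_nightly_gsm8k_eval.py and is not part
# of this diff; presumably it splits the comma-separated model-name constants
# into a list, along the lines of:
#     def parse_models(model_string):
#         return [m.strip() for m in model_string.split(",") if m.strip()]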


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Each group is (model names, is_fp8, is_tp2).
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None
        cls.eval_process = None

    @classmethod
    def tearDownClass(cls):
        if cls.process:
            kill_child_process(cls.process.pid)
        if cls.eval_process:
            kill_child_process(cls.eval_process.pid)

    def launch_server(self, model, is_fp8, is_tp2):
        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
        if is_fp8:
            if "Llama-3" in model or "gemma-2" in model:
                # compressed-tensors checkpoints: quantization is read from the
                # model config, so only the kv-cache dtype needs to be set
                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
            elif "Qwen2-72B-Instruct-FP8" in model:
                # known issue: skip the fp8 kv-cache dtype for this model
                other_args.extend(["--quantization", "fp8"])
            else:
                other_args.extend(
                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
                )
        if is_tp2:
            other_args.extend(["--tp", "2"])
        if "DeepSeek" in model:
            other_args.extend(["--mem-frac", "0.85"])
        if "AWQ" in model:
            other_args.extend(["--quantization", "awq"])
        elif "GPTQ" in model:
            other_args.extend(["--quantization", "gptq"])

        self.process = popen_launch_server(
            model,
            self.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    def run_evalplus(self, model):
        print("Deleting old evalplus results")
        shutil.rmtree("evalplus_results", ignore_errors=True)
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            "http://localhost:6157/v1",
            "--greedy",
        ]

        try:
            self.eval_process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,
            )

            stdout, stderr = self.eval_process.communicate(timeout=600)

            if self.eval_process.returncode != 0:
                print(f"Failed to run human eval for model={model} err={stderr}")

            print("=" * 42)
            print(stdout)
            print("=" * 42)
        except subprocess.TimeoutExpired:
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
            print(f"Timeout during evaluation for model={model}")
        except Exception as e:
            print(f"Error running evalplus for model={model}: {e}")
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
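
    # NOTE: run_evalplus starts the evaluator in its own process group
    # (preexec_fn=os.setsid) so that on timeout or error, os.killpg can
    # terminate evalplus together with any worker subprocesses it spawned;
    # killing only the parent PID could leave workers running.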

    def test_human_eval_all_models(self):
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama models are evaluated for now
                if "Llama" in model:
                    with self.subTest(model=model):
                        self.launch_server(model, is_fp8, is_tp2)
                        self.run_evalplus(model)
                        # explicitly kill the server and evaluator before
                        # moving on to the next model
                        self.tearDownClass()


if __name__ == "__main__":
    unittest.main()
