minor: add human eval #1754

Merged · merged 1 commit on Nov 1, 2024
7 changes: 4 additions & 3 deletions .github/workflows/nightly-eval.yml
@@ -25,9 +25,10 @@ jobs:
      - name: Install dependencies
        run: |
          bash scripts/ci_install_dependency.sh
+         pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"

-     - name: Nightly gsm8k Accuracy
-       timeout-minutes: 60
+     - name: Nightly gsm8k and human eval Accuracy
+       timeout-minutes: 120
        run: |
          cd test/srt
          python3 test_nightly_gsm8k_eval.py
+         python3 test_nightly_human_eval.py
125 changes: 125 additions & 0 deletions test/srt/test_nightly_human_eval.py
@@ -0,0 +1,125 @@
import os
import shutil
import signal
import subprocess
import unittest
from types import SimpleNamespace

from test_nightly_gsm8k_eval import parse_models

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None
        cls.eval_process = None

    @classmethod
    def tearDownClass(cls):
        if cls.process:
            kill_child_process(cls.process.pid)
        if cls.eval_process:
            kill_child_process(cls.eval_process.pid)

    def launch_server(self, model, is_fp8, is_tp2):
        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
        if is_fp8:
            if "Llama-3" in model or "gemma-2" in model:
                # compressed-tensors
                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
            elif "Qwen2-72B-Instruct-FP8" in model:
                # bug
                other_args.extend(["--quantization", "fp8"])
            else:
                other_args.extend(
                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
                )
        if is_tp2:
            other_args.extend(["--tp", "2"])
        if "DeepSeek" in model:
            other_args.extend(["--mem-frac", "0.85"])
        if "AWQ" in model:
            other_args.extend(["--quantization", "awq"])
        elif "GPTQ" in model:
            other_args.extend(["--quantization", "gptq"])

        self.process = popen_launch_server(
            model,
            self.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    def run_evalplus(self, model):
        print("Delete evalplus results")
        shutil.rmtree("evalplus_results", ignore_errors=True)
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            "http://localhost:6157/v1",
            "--greedy",
        ]

        try:
            self.eval_process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,
            )

            stdout, stderr = self.eval_process.communicate(timeout=600)

            if self.eval_process.returncode != 0:
                print(f"Fail to human eval model={model} err={stderr}")

            print("=" * 42)
            print(stdout)
            print("=" * 42)
        except subprocess.TimeoutExpired:
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
            print(f"Timeout during evaluation for model={model}")
        except Exception as e:
            print(f"Error running evalplus for model={model} {str(e)}")
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)

    def test_human_eval_all_models(self):
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama for now
                if "Llama" in model:
                    with self.subTest(model=model):
                        self.launch_server(model, is_fp8, is_tp2)
                        self.run_evalplus(model)
                        self.tearDownClass()


if __name__ == "__main__":
    unittest.main()
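
For reference, a minimal sketch of running the same evalplus check by hand, outside the unittest harness. It assumes evalplus[vllm] is installed (as in the workflow change above) and that an SGLang server with an OpenAI-compatible endpoint is already listening on localhost:6157, the base URL hardcoded in run_evalplus; the model id below is a placeholder.

import subprocess

# Hypothetical standalone invocation mirroring run_evalplus(); the model id is a placeholder.
subprocess.run(
    [
        "evalplus.evaluate",
        "--model", "meta-llama/Llama-3.1-8B-Instruct",
        "--dataset", "humaneval",
        "--backend", "openai",
        "--base-url", "http://localhost:6157/v1",
        "--greedy",
    ],
    check=True,
)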