From dcbb0d14d198835322021c93c5c0d7b8eb9cf0ea Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 27 Oct 2024 01:27:20 -0700 Subject: [PATCH 1/4] Add a watch dog thread --- python/sglang/bench_latency.py | 2 +- python/sglang/bench_server_latency.py | 5 ++-- python/sglang/launch_server.py | 2 +- python/sglang/srt/managers/scheduler.py | 38 +++++++++++++++++++++---- python/sglang/srt/server.py | 10 +++---- python/sglang/srt/server_args.py | 7 +++++ python/sglang/srt/utils.py | 22 ++++++++++---- python/sglang/test/test_utils.py | 10 +++---- 8 files changed, 70 insertions(+), 26 deletions(-) diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index 43cb7bc3fb..d97b641ea1 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -550,4 +550,4 @@ def main(server_args, bench_args): except Exception as e: raise e finally: - kill_child_process(os.getpid(), including_parent=False) + kill_child_process() diff --git a/python/sglang/bench_server_latency.py b/python/sglang/bench_server_latency.py index 57506913f5..f76682c9fa 100644 --- a/python/sglang/bench_server_latency.py +++ b/python/sglang/bench_server_latency.py @@ -15,7 +15,6 @@ import itertools import json import multiprocessing -import os import time from typing import Tuple @@ -70,7 +69,7 @@ def launch_server_internal(server_args): except Exception as e: raise e finally: - kill_child_process(os.getpid(), including_parent=False) + kill_child_process() def launch_server_process(server_args: ServerArgs): @@ -176,7 +175,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): ) finally: if proc: - kill_child_process(proc.pid) + kill_child_process(proc.pid, include_self=True) print(f"\nResults are saved to {bench_args.result_filename}") diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py index ce4cb07c2b..57f1dd10e9 100644 --- a/python/sglang/launch_server.py +++ b/python/sglang/launch_server.py @@ -15,4 +15,4 @@ except Exception as e: raise e finally: - kill_child_process(os.getpid(), including_parent=False) + kill_child_process() diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py index 4677568c40..f876847e1d 100644 --- a/python/sglang/srt/managers/scheduler.py +++ b/python/sglang/srt/managers/scheduler.py @@ -18,6 +18,7 @@ import json import logging import os +import threading import time import warnings from collections import deque @@ -222,10 +223,11 @@ def __init__( self.waiting_queue: List[Req] = [] self.running_batch: Optional[ScheduleBatch] = None self.cur_batch: Optional[ScheduleBatch] = None - self.decode_forward_ct = 0 - self.stream_interval = server_args.stream_interval + self.forward_ct = 0 + self.forward_ct_decode = 0 self.num_generated_tokens = 0 self.last_stats_tic = time.time() + self.stream_interval = server_args.stream_interval # Init chunked prefill self.chunked_prefill_size = server_args.chunked_prefill_size @@ -272,6 +274,11 @@ def __init__( self.batch_is_full = False + # Init watchdog thread + self.watchdog_timeout = server_args.watchdog_timeout + t = threading.Thread(target=self.watchdog_thread, daemon=True) + t.start() + # Init profiler if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "": self.profiler = None @@ -289,6 +296,23 @@ def __init__( with_stack=True, ) + def watchdog_thread(self): + self.watchdog_last_forward_ct = 0 + self.watchdog_last_time = time.time() + + while True: + if self.cur_batch is not None: + if self.watchdog_last_forward_ct == self.forward_ct: + if time.time() > self.watchdog_last_time + self.watchdog_timeout: + logger.error(f"Watchdog timeout ({self.watchdog_timeout=})") + break + else: + self.watchdog_last_forward_ct = self.forward_ct + self.watchdog_last_time = time.time() + time.sleep(self.watchdog_timeout / 2) + + kill_parent_process() + @torch.inference_mode() def event_loop_normal(self): """A normal blocking scheduler loop.""" @@ -299,6 +323,7 @@ def event_loop_normal(self): self.process_input_requests(recv_reqs) batch = self.get_next_batch_to_run() + self.cur_batch = batch if batch: result = self.run_batch(batch) @@ -746,6 +771,8 @@ def update_running_batch(self): def run_batch(self, batch: ScheduleBatch): """Run a batch.""" + self.forward_ct += 1 + if self.is_generation: if batch.forward_mode.is_decode() or batch.extend_num_tokens != 0: model_worker_batch = batch.get_model_worker_batch() @@ -778,6 +805,7 @@ def process_batch_result(self, batch: ScheduleBatch, result): self.process_batch_result_prefill(batch, result) def process_batch_result_prefill(self, batch: ScheduleBatch, result): + if self.is_generation: logits_output, next_token_ids, bid = result @@ -890,8 +918,8 @@ def process_batch_result_decode(self, batch: ScheduleBatch, result): self.token_to_kv_pool.free_group_end() - self.decode_forward_ct = (self.decode_forward_ct + 1) % (1 << 30) - if self.tp_rank == 0 and self.decode_forward_ct % 40 == 0: + self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30) + if self.tp_rank == 0 and self.forward_ct_decode % 40 == 0: self.print_decode_stats() def add_logprob_return_values( @@ -984,7 +1012,7 @@ def stream_output(self, reqs: List[Req]): else: # embedding or reward model output_embeddings = [] - is_stream_iter = self.decode_forward_ct % self.stream_interval == 0 + is_stream_iter = self.forward_ct_decode % self.stream_interval == 0 for req in reqs: if req.finished() or ( diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 8912c5583a..56eb80d9ec 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -441,7 +441,7 @@ def launch_server( # Send a warmup request t = threading.Thread( - target=_wait_and_warmup, args=(server_args, pipe_finish_writer, os.getpid()) + target=_wait_and_warmup, args=(server_args, pipe_finish_writer) ) t.start() @@ -519,7 +519,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid): if pipe_finish_writer is not None: pipe_finish_writer.send(last_traceback) logger.error(f"Initialization failed. warmup error: {last_traceback}") - kill_child_process(pid, including_parent=False) + kill_child_process(include_self=True) return model_info = res.json() @@ -551,7 +551,7 @@ def _wait_and_warmup(server_args, pipe_finish_writer, pid): if pipe_finish_writer is not None: pipe_finish_writer.send(last_traceback) logger.error(f"Initialization failed. warmup error: {last_traceback}") - kill_child_process(pid, including_parent=False) + kill_child_process(include_self=True) return # logger.info(f"{res.json()=}") @@ -617,7 +617,7 @@ def __init__( def shutdown(self): if self.pid is not None: - kill_child_process(self.pid) + kill_child_process(self.pid, include_self=True) self.pid = None def cache_prefix(self, prefix: str): @@ -834,7 +834,7 @@ async def generator_wrapper(): return ret def shutdown(self): - kill_child_process(os.getpid(), including_parent=False) + kill_child_process(include_self=True) def get_tokenizer(self): global tokenizer_manager diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 753debb66d..7d23cb8bd5 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -74,6 +74,7 @@ class ServerArgs: api_key: Optional[str] = None file_storage_pth: str = "SGLang_storage" enable_cache_report: bool = False + watchdog_timeout: float = 600 # Data parallelism dp_size: int = 1 @@ -429,6 +430,12 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.", ) + parser.add_argument( + "--watchdog-timeout", + type=float, + default=ServerArgs.watchdog_timeout, + help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.", + ) # Data parallelism parser.add_argument( diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 6ad39647f2..7ca3adbd7a 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -24,6 +24,7 @@ import random import resource import socket +import sys import time import warnings from importlib.metadata import PackageNotFoundError, version @@ -398,17 +399,26 @@ def kill_parent_process(): """Kill the parent process and all children of the parent process.""" current_process = psutil.Process() parent_process = current_process.parent() - kill_child_process(parent_process.pid, skip_pid=current_process.pid) + kill_child_process( + parent_process.pid, include_self=True, skip_pid=current_process.pid + ) + try: + current_process.kill() + except psutil.NoSuchProcess: + pass -def kill_child_process(pid, including_parent=True, skip_pid=None): +def kill_child_process(pid=None, include_self=False, skip_pid=None): """Kill the process and all its children process.""" + if pid is None: + pid = os.getpid() + try: - parent = psutil.Process(pid) + itself = psutil.Process(pid) except psutil.NoSuchProcess: return - children = parent.children(recursive=True) + children = itself.children(recursive=True) for child in children: if child.pid == skip_pid: continue @@ -417,9 +427,9 @@ def kill_child_process(pid, including_parent=True, skip_pid=None): except psutil.NoSuchProcess: pass - if including_parent: + if include_self: try: - parent.kill() + itself.kill() except psutil.NoSuchProcess: pass diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 4a5a894c08..d6a4c1a292 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -495,7 +495,7 @@ def run_one_file(filename): ) assert ret_code == 0 except TimeoutError: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", @@ -563,7 +563,7 @@ def run_bench_serving( try: res = run_benchmark(args) finally: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) assert res["completed"] == num_prompts return res @@ -596,7 +596,7 @@ def run_bench_latency(model, other_args): lastline = output.split("\n")[-3] output_throughput = float(lastline.split(" ")[-2]) finally: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) return output_throughput @@ -707,8 +707,8 @@ def run_mmlu_test( pass # Clean up everything - kill_child_process(process.pid) - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid, include_self=True) stdout.close() stderr.close() if os.path.exists(STDOUT_FILENAME): From c919bd152bfc48e75eb0f897dda2cc424bd60b9a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 27 Oct 2024 01:39:51 -0700 Subject: [PATCH 2/4] Fix kill itself --- python/sglang/srt/utils.py | 1 - .../sampling/penaltylib/test_srt_endpoint_with_penalizers.py | 2 +- test/srt/test_cache_report.py | 2 +- test/srt/test_data_parallelism.py | 2 +- test/srt/test_double_sparsity.py | 2 +- test/srt/test_embedding_openai_server.py | 2 +- test/srt/test_eval_accuracy_large.py | 2 +- test/srt/test_eval_accuracy_large_chunked_prefill.py | 2 +- test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py | 2 +- test/srt/test_eval_accuracy_mini.py | 2 +- test/srt/test_json_constrained.py | 2 +- test/srt/test_large_max_new_tokens.py | 2 +- test/srt/test_matched_stop.py | 2 +- test/srt/test_mla.py | 2 +- test/srt/test_mla_fp8.py | 2 +- test/srt/test_moe_eval_accuracy_large.py | 2 +- test/srt/test_nightly_gsm8k_eval.py | 2 +- test/srt/test_openai_server.py | 2 +- test/srt/test_pytorch_sampling_backend.py | 2 +- test/srt/test_retract_decode.py | 2 +- test/srt/test_skip_tokenizer_init.py | 2 +- test/srt/test_srt_endpoint.py | 2 +- test/srt/test_torch_compile.py | 2 +- test/srt/test_torchao.py | 2 +- test/srt/test_triton_attn_backend.py | 2 +- test/srt/test_update_weights.py | 2 +- test/srt/test_vision_openai_server.py | 2 +- 27 files changed, 26 insertions(+), 27 deletions(-) diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 7ca3adbd7a..2be3a298ec 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -24,7 +24,6 @@ import random import resource import socket -import sys import time import warnings from importlib.metadata import PackageNotFoundError, version diff --git a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py index e3496102cb..689d52a1c5 100644 --- a/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py +++ b/test/srt/sampling/penaltylib/test_srt_endpoint_with_penalizers.py @@ -31,7 +31,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_decode( self, diff --git a/test/srt/test_cache_report.py b/test/srt/test_cache_report.py index 1d8e9a4a02..dfc140d587 100644 --- a/test/srt/test_cache_report.py +++ b/test/srt/test_cache_report.py @@ -45,7 +45,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): response = requests.post( diff --git a/test/srt/test_data_parallelism.py b/test/srt/test_data_parallelism.py index a921a6b572..5f17994a2d 100644 --- a/test/srt/test_data_parallelism.py +++ b/test/srt/test_data_parallelism.py @@ -25,7 +25,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_double_sparsity.py b/test/srt/test_double_sparsity.py index 0f2f572ebb..14ee4de3cf 100644 --- a/test/srt/test_double_sparsity.py +++ b/test/srt/test_double_sparsity.py @@ -43,7 +43,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_embedding_openai_server.py b/test/srt/test_embedding_openai_server.py index 45f7850da9..666297c650 100644 --- a/test/srt/test_embedding_openai_server.py +++ b/test/srt/test_embedding_openai_server.py @@ -28,7 +28,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_embedding(self, use_list_input, token_input): client = openai.Client(api_key=self.api_key, base_url=self.base_url) diff --git a/test/srt/test_eval_accuracy_large.py b/test/srt/test_eval_accuracy_large.py index 0b95f435c2..000910cf23 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/srt/test_eval_accuracy_large.py @@ -30,7 +30,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py index 02df2a7f56..2e9ff59cda 100644 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_chunked_prefill.py @@ -25,7 +25,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py index 8ba71e5c83..0fb08e64f4 100644 --- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py +++ b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py @@ -31,7 +31,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py index ee977a6368..fa15c11814 100644 --- a/test/srt/test_eval_accuracy_mini.py +++ b/test/srt/test_eval_accuracy_mini.py @@ -22,7 +22,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_json_constrained.py b/test/srt/test_json_constrained.py index c054d72346..88368fba80 100644 --- a/test/srt/test_json_constrained.py +++ b/test/srt/test_json_constrained.py @@ -41,7 +41,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_decode(self, json_schema, return_logprob=False, top_logprobs_num=0, n=1): response = requests.post( diff --git a/test/srt/test_large_max_new_tokens.py b/test/srt/test_large_max_new_tokens.py index 24c011c756..ea9c20e5c2 100644 --- a/test/srt/test_large_max_new_tokens.py +++ b/test/srt/test_large_max_new_tokens.py @@ -42,7 +42,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) cls.stdout.close() cls.stderr.close() os.remove("stdout.txt") diff --git a/test/srt/test_matched_stop.py b/test/srt/test_matched_stop.py index a3399687db..df37fa13c7 100644 --- a/test/srt/test_matched_stop.py +++ b/test/srt/test_matched_stop.py @@ -32,7 +32,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_completions_generation( self, diff --git a/test/srt/test_mla.py b/test/srt/test_mla.py index 13b0aa2d89..796655adb5 100644 --- a/test/srt/test_mla.py +++ b/test/srt/test_mla.py @@ -25,7 +25,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_mla_fp8.py b/test/srt/test_mla_fp8.py index 37275d696b..5091759a9f 100644 --- a/test/srt/test_mla_fp8.py +++ b/test/srt/test_mla_fp8.py @@ -31,7 +31,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mgsm_en(self): args = SimpleNamespace( diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/srt/test_moe_eval_accuracy_large.py index 5f25605268..401a47ce2c 100644 --- a/test/srt/test_moe_eval_accuracy_large.py +++ b/test/srt/test_moe_eval_accuracy_large.py @@ -35,7 +35,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_nightly_gsm8k_eval.py b/test/srt/test_nightly_gsm8k_eval.py index 8b8e0e16b3..b035db52b3 100644 --- a/test/srt/test_nightly_gsm8k_eval.py +++ b/test/srt/test_nightly_gsm8k_eval.py @@ -36,7 +36,7 @@ def setUp(self): def tearDown(self): if self.process: - kill_child_process(self.process.pid) + kill_child_process(self.process.pid, include_self=True) def launch_server(self, model, is_fp8, is_tp2): other_args = ["--log-level-http", "warning", "--trust-remote-code"] diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 5afe9b0b17..d3e21d04b0 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -31,7 +31,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_completion( self, echo, logprobs, use_list_input, parallel_sample_num, token_input diff --git a/test/srt/test_pytorch_sampling_backend.py b/test/srt/test_pytorch_sampling_backend.py index 5dbb9ae2bc..f7affa8aca 100644 --- a/test/srt/test_pytorch_sampling_backend.py +++ b/test/srt/test_pytorch_sampling_backend.py @@ -27,7 +27,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_retract_decode.py b/test/srt/test_retract_decode.py index b16fd5163e..20352e7297 100644 --- a/test/srt/test_retract_decode.py +++ b/test/srt/test_retract_decode.py @@ -22,7 +22,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_skip_tokenizer_init.py b/test/srt/test_skip_tokenizer_init.py index 3a8c34c162..a5dcde4a27 100644 --- a/test/srt/test_skip_tokenizer_init.py +++ b/test/srt/test_skip_tokenizer_init.py @@ -26,7 +26,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_decode(self, return_logprob=False, top_logprobs_num=0, n=1): max_new_tokens = 32 diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index c4c8e844d6..e1b5318c06 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -27,7 +27,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_decode( self, diff --git a/test/srt/test_torch_compile.py b/test/srt/test_torch_compile.py index 40f47d6b6b..f5f4b602e1 100644 --- a/test/srt/test_torch_compile.py +++ b/test/srt/test_torch_compile.py @@ -27,7 +27,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_torchao.py b/test/srt/test_torchao.py index 8b5ce58ed0..7655671361 100644 --- a/test/srt/test_torchao.py +++ b/test/srt/test_torchao.py @@ -27,7 +27,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_mmlu(self): args = SimpleNamespace( diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py index 55df1951f9..ffe9d22c42 100644 --- a/test/srt/test_triton_attn_backend.py +++ b/test/srt/test_triton_attn_backend.py @@ -50,7 +50,7 @@ def test_mmlu(self): metrics = run_eval(args) assert metrics["score"] >= 0.65 finally: - kill_child_process(process.pid) + kill_child_process(cls.process.pid, include_self=True) if __name__ == "__main__": diff --git a/test/srt/test_update_weights.py b/test/srt/test_update_weights.py index 73c3cc706a..c3cde0f14b 100644 --- a/test/srt/test_update_weights.py +++ b/test/srt/test_update_weights.py @@ -23,7 +23,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def run_decode(self): response = requests.post( diff --git a/test/srt/test_vision_openai_server.py b/test/srt/test_vision_openai_server.py index bf8f9d2775..f44bc98e25 100644 --- a/test/srt/test_vision_openai_server.py +++ b/test/srt/test_vision_openai_server.py @@ -45,7 +45,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - kill_child_process(cls.process.pid) + kill_child_process(cls.process.pid, include_self=True) def test_chat_completion(self): client = openai.Client(api_key=self.api_key, base_url=self.base_url) From 130dcd4ae5aad5a5730b8deefc0cabd217aa224d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 27 Oct 2024 01:40:37 -0700 Subject: [PATCH 3/4] Fix --- python/sglang/srt/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 56eb80d9ec..7dffcc4a2b 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -496,7 +496,7 @@ def _set_envs_and_config(server_args: ServerArgs): mp.set_start_method("spawn", force=True) -def _wait_and_warmup(server_args, pipe_finish_writer, pid): +def _wait_and_warmup(server_args, pipe_finish_writer): headers = {} url = server_args.url() if server_args.api_key: From 668fefd03735a492dc468a38f7835861e401e63b Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 27 Oct 2024 01:50:06 -0700 Subject: [PATCH 4/4] update --- test/srt/test_triton_attn_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/test_triton_attn_backend.py b/test/srt/test_triton_attn_backend.py index ffe9d22c42..2a6fe17bd8 100644 --- a/test/srt/test_triton_attn_backend.py +++ b/test/srt/test_triton_attn_backend.py @@ -50,7 +50,7 @@ def test_mmlu(self): metrics = run_eval(args) assert metrics["score"] >= 0.65 finally: - kill_child_process(cls.process.pid, include_self=True) + kill_child_process(process.pid, include_self=True) if __name__ == "__main__":