Merge branch 'sgl-project:main' into main

sgl-project · Nov 28, 2024 · 2946943 · 2946943
2 parents cebf16e + 65fdb28
commit 2946943
Show file tree

Hide file tree

Showing 57 changed files with 642 additions and 148 deletions.
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
@@ -50,7 +50,7 @@ jobs:
         timeout-minutes: 25
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 0 --range-end 5
+          python3 run_suite.py --suite minimal --range-begin 0 --range-end 6
 
   unit-test-backend-part-2:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -67,7 +67,7 @@ jobs:
         timeout-minutes: 25
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 5 --range-end 14
+          python3 run_suite.py --suite minimal --range-begin 6 --range-end 15
 
   unit-test-backend-part-3:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -84,7 +84,7 @@ jobs:
         timeout-minutes: 25
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 14 --range-end 23
+          python3 run_suite.py --suite minimal --range-begin 15 --range-end 24
 
   unit-test-backend-part-4:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -101,7 +101,7 @@ jobs:
         timeout-minutes: 25
         run: |
           cd test/srt
-          python3 run_suite.py --suite minimal --range-begin 23
+          python3 run_suite.py --suite minimal --range-begin 24
 
   unit-test-backend-2-gpu-part-1:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
@@ -1,5 +1,5 @@
 # Usage (to build SGLang ROCm docker image):
-#   docker build --build-arg SGL_BRANCH=v0.3.6.post2 -t testImage -f Dockerfile.rocm .
+#   docker build --build-arg SGL_BRANCH=v0.3.6.post2 -t v0.3.6.post2-rocm620 -f Dockerfile.rocm .
 
 # default base image
 ARG BASE_IMAGE="rocm/vllm-dev:20241022"

diff --git a/docs/start/install.md b/docs/start/install.md
@@ -28,6 +28,17 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 
 Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
 
+Note: For AMD ROCm systems with Instinct/MI GPUs, do the following instead:
+
+```
+# Use the last release branch
+git clone -b v0.3.6.post2 https://github.com/sgl-project/sglang.git
+cd sglang
+
+pip install --upgrade pip
+pip install -e "python[all_hip]"
+```
+
 ## Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).

diff --git a/python/sglang/bench_one_batch.py b/python/sglang/bench_one_batch.py
@@ -47,6 +47,7 @@
 import json
 import logging
 import multiprocessing
+import os
 import time
 from typing import Tuple
 
@@ -62,11 +63,7 @@
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    configure_logger,
-    kill_child_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import configure_logger, kill_process_tree, suppress_other_loggers
 
 
 @dataclasses.dataclass
@@ -468,4 +465,4 @@ def main(server_args, bench_args):
         main(server_args, bench_args)
     finally:
         if server_args.tp_size != 1:
-            kill_child_process()
+            kill_process_tree(os.getpid(), include_parent=False)
diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py
@@ -15,6 +15,7 @@
 import itertools
 import json
 import multiprocessing
+import os
 import time
 from typing import Tuple
 
@@ -23,7 +24,7 @@
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 
 
 @dataclasses.dataclass
@@ -69,7 +70,7 @@ def launch_server_internal(server_args):
     except Exception as e:
         raise e
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
 
 
 def launch_server_process(server_args: ServerArgs):
@@ -175,7 +176,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             )
     finally:
         if proc:
-            kill_child_process(proc.pid, include_self=True)
+            kill_process_tree(proc.pid)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 

diff --git a/python/sglang/launch_server.py b/python/sglang/launch_server.py
@@ -1,15 +1,16 @@
 """Launch the inference server."""
 
+import os
 import sys
 
 from sglang.srt.server import launch_server
 from sglang.srt.server_args import prepare_server_args
-from sglang.srt.utils import kill_child_process
+from sglang.srt.utils import kill_process_tree
 
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])
 
     try:
         launch_server(server_args)
     finally:
-        kill_child_process()
+        kill_process_tree(os.getpid(), include_parent=False)
diff --git a/...ame=AMD_Instinct_MI300X,dtype=float8.json → ...e=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/...ame=AMD_Instinct_MI300X,dtype=float8.json → ...e=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
diff --git a/...ame=AMD_Instinct_MI300X,dtype=float8.json → ...e=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/...ame=AMD_Instinct_MI300X,dtype=float8.json → ...e=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
diff --git a/python/sglang/srt/managers/data_parallel_controller.py b/python/sglang/srt/managers/data_parallel_controller.py
@@ -15,9 +15,11 @@
 
 import logging
 import multiprocessing as mp
+import signal
 import threading
 from enum import Enum, auto
 
+import psutil
 import zmq
 
 from sglang.srt.managers.io_struct import (
@@ -26,13 +28,7 @@
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    bind_port,
-    configure_logger,
-    get_zmq_socket,
-    kill_parent_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
 from sglang.utils import get_exception_traceback
 
 logger = logging.getLogger(__name__)
@@ -235,7 +231,7 @@ def run_data_parallel_controller_process(
     pipe_writer,
 ):
     configure_logger(server_args)
-    suppress_other_loggers()
+    parent_process = psutil.Process().parent()
 
     try:
         controller = DataParallelController(server_args, port_args)
@@ -244,6 +240,6 @@ def run_data_parallel_controller_process(
         )
         controller.event_loop()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"DataParallelController hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py
@@ -15,9 +15,11 @@
 
 import dataclasses
 import logging
+import signal
 from collections import OrderedDict
 from typing import List, Union
 
+import psutil
 import zmq
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -28,7 +30,7 @@
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, get_zmq_socket, kill_parent_process
+from sglang.srt.utils import configure_logger, get_zmq_socket
 from sglang.utils import find_printable_text, get_exception_traceback
 
 logger = logging.getLogger(__name__)
@@ -193,11 +195,12 @@ def run_detokenizer_process(
     port_args: PortArgs,
 ):
     configure_logger(server_args)
+    parent_process = psutil.Process().parent()
 
     try:
         manager = DetokenizerManager(server_args, port_args)
         manager.event_loop()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"DetokenizerManager hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py
@@ -231,6 +231,7 @@ def __init__(
         self.tokenizer = None
         self.finished_reason = None
         self.stream = False
+        self.to_abort = False
 
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -368,6 +369,10 @@ def check_finished(self):
         if self.finished():
             return
 
+        if self.to_abort:
+            self.finished_reason = FINISH_ABORT()
+            return
+
         if len(self.output_ids) >= self.sampling_params.max_new_tokens:
             self.finished_reason = FINISH_LENGTH(
                 length=self.sampling_params.max_new_tokens

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
@@ -15,6 +15,7 @@
 
 import logging
 import os
+import signal
 import threading
 import time
 import warnings
@@ -23,6 +24,7 @@
 from types import SimpleNamespace
 from typing import List, Optional
 
+import psutil
 import torch
 import zmq
 
@@ -73,7 +75,6 @@
     crash_on_warnings,
     get_bool_env_var,
     get_zmq_socket,
-    kill_parent_process,
     set_gpu_proc_affinity,
     set_random_seed,
     suppress_other_loggers,
@@ -170,6 +171,10 @@ def __init__(
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")
 
+        if self.model_config.is_multimodal:
+            self.enable_overlap = False
+            logger.info("Overlap scheduler is disabled for multimodal models.")
+
         if self.enable_overlap:
             self.disable_jump_forward = True
 
@@ -312,6 +317,7 @@ def __init__(
         self.watchdog_timeout = server_args.watchdog_timeout
         t = threading.Thread(target=self.watchdog_thread, daemon=True)
         t.start()
+        self.parent_process = psutil.Process().parent()
 
         # Init profiler
         if os.getenv("SGLANG_TORCH_PROFILER_DIR", "") == "":
@@ -355,7 +361,7 @@ def watchdog_thread(self):
                     self.watchdog_last_time = time.time()
             time.sleep(self.watchdog_timeout / 2)
 
-        kill_parent_process()
+        self.parent_process.send_signal(signal.SIGQUIT)
 
     @torch.no_grad()
     def event_loop_normal(self):
@@ -573,6 +579,8 @@ def handle_generate_request(
                     "Image request length is longer than the KV cache pool size or "
                     "the max context length aborting because you cannot truncate the image embeds"
                 )
+                req.image_inputs = None
+                req.origin_input_ids = [0]
                 req.sampling_params.max_new_tokens = 0
                 self.waiting_queue.append(req)
                 return
@@ -1344,13 +1352,15 @@ def abort_request(self, recv_req: AbortReq):
 
         if to_del is not None:
             del self.waiting_queue[to_del]
+            logger.debug(f"Abort queued request. {req.rid=}")
+            return
 
         # Delete requests in the running batch
         if self.running_batch:
             for req in self.running_batch.reqs:
                 if req.rid == recv_req.rid and not req.finished():
-                    req.finished_reason = FINISH_ABORT()
-                    self.tree_cache.cache_finished_req(req)
+                    logger.debug(f"Abort running request. {req.rid=}")
+                    req.to_abort = True
                     break
 
     def update_weights(self, recv_req: UpdateWeightReqInput):
@@ -1419,6 +1429,7 @@ def run_scheduler_process(
         configure_logger(server_args, prefix=f" DP{dp_rank} TP{tp_rank}")
 
     suppress_other_loggers()
+    parent_process = psutil.Process().parent()
 
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, dp_rank)
@@ -1430,6 +1441,6 @@ def run_scheduler_process(
         else:
             scheduler.event_loop_normal()
     except Exception:
-        msg = get_exception_traceback()
-        logger.error(msg)
-        kill_parent_process()
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
@@ -58,7 +58,7 @@
 from sglang.srt.metrics.collector import TokenizerMetricsCollector
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import get_zmq_socket, kill_child_process
+from sglang.srt.utils import get_zmq_socket, kill_process_tree
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
@@ -532,7 +532,7 @@ async def sigterm_watchdog(self):
             else:
                 break
 
-        kill_child_process(include_self=True)
+        kill_process_tree(os.getpid(), include_parent=True)
         sys.exit(0)
 
     async def handle_loop(self):