From cdcbde5fc3155edaa6b98a13ab8764101e657b23 Mon Sep 17 00:00:00 2001 From: Liangsheng Yin Date: Mon, 29 Jul 2024 23:04:48 -0700 Subject: [PATCH 1/6] Code structure refactor (#807) --- docs/en/hyperparameter_tuning.md | 6 +- python/sglang/__init__.py | 61 ++++++++++--------- python/sglang/bench_latency.py | 4 +- python/sglang/srt/layers/logits_processor.py | 2 +- python/sglang/srt/layers/radix_attention.py | 2 +- python/sglang/srt/layers/token_attention.py | 2 +- .../manager_multi.py => controller_multi.py} | 2 +- ...manager_single.py => controller_single.py} | 2 +- .../srt/managers/detokenizer_manager.py | 2 +- python/sglang/srt/managers/io_struct.py | 2 +- ...edule_heuristic.py => policy_scheduler.py} | 24 ++++---- .../infer_batch.py => schedule_batch.py} | 4 +- .../managers/{controller => }/tp_worker.py | 26 ++++---- .../sglang/srt/{ => mem_cache}/flush_cache.py | 2 +- .../sglang/srt/{ => mem_cache}/memory_pool.py | 0 .../controller => mem_cache}/radix_cache.py | 0 .../cuda_graph_runner.py | 2 +- .../model_runner.py | 6 +- python/sglang/srt/models/chatglm.py | 2 +- python/sglang/srt/models/commandr.py | 2 +- python/sglang/srt/models/dbrx.py | 2 +- python/sglang/srt/models/deepseek.py | 2 +- python/sglang/srt/models/deepseek_v2.py | 2 +- python/sglang/srt/models/gemma.py | 2 +- python/sglang/srt/models/gemma2.py | 2 +- python/sglang/srt/models/gpt_bigcode.py | 2 +- python/sglang/srt/models/grok.py | 2 +- python/sglang/srt/models/internlm2.py | 2 +- python/sglang/srt/models/llama2.py | 2 +- .../sglang/srt/models/llama_classification.py | 2 +- python/sglang/srt/models/llava.py | 4 +- python/sglang/srt/models/llavavid.py | 4 +- python/sglang/srt/models/minicpm.py | 2 +- python/sglang/srt/models/mixtral.py | 2 +- python/sglang/srt/models/mixtral_quant.py | 2 +- python/sglang/srt/models/qwen.py | 2 +- python/sglang/srt/models/qwen2.py | 2 +- python/sglang/srt/models/qwen2_moe.py | 2 +- python/sglang/srt/models/stablelm.py | 2 +- python/sglang/srt/server.py | 6 +- python/sglang/srt/server_args.py | 8 +-- 41 files changed, 105 insertions(+), 104 deletions(-) rename python/sglang/srt/managers/{controller/manager_multi.py => controller_multi.py} (99%) rename python/sglang/srt/managers/{controller/manager_single.py => controller_single.py} (98%) rename python/sglang/srt/managers/{controller/schedule_heuristic.py => policy_scheduler.py} (82%) rename python/sglang/srt/managers/{controller/infer_batch.py => schedule_batch.py} (99%) rename python/sglang/srt/managers/{controller => }/tp_worker.py (98%) rename python/sglang/srt/{ => mem_cache}/flush_cache.py (92%) rename python/sglang/srt/{ => mem_cache}/memory_pool.py (100%) rename python/sglang/srt/{managers/controller => mem_cache}/radix_cache.py (100%) rename python/sglang/srt/{managers/controller => model_executor}/cuda_graph_runner.py (99%) rename python/sglang/srt/{managers/controller => model_executor}/model_runner.py (98%) diff --git a/docs/en/hyperparameter_tuning.md b/docs/en/hyperparameter_tuning.md index 85315e74570..2ea43e26a14 100644 --- a/docs/en/hyperparameter_tuning.md +++ b/docs/en/hyperparameter_tuning.md @@ -29,7 +29,7 @@ If OOM happens during prefill, try to decrease `--max-prefill-tokens`. If OOM happens during decoding, try to decrease `--max-running-requests`. You can also try to decrease `--mem-fraction-static`, which reduces the memory usage of the KV cache memory pool and helps both prefill and decoding. -### (Minor) Tune `--schedule-heuristic` -If you have many shared prefixes, use the default `--schedule-heuristic lpm`. `lpm` stands for longest prefix match. +### (Minor) Tune `--schedule-policy` +If you have many shared prefixes, use the default `--schedule-policy lpm`. `lpm` stands for longest prefix match. When you have no shared prefixes at all or you always send the requests with the shared prefixes together, -you can try `--schedule-heuristic fcfs`. `fcfs` stands for first come first serve. +you can try `--schedule-policy fcfs`. `fcfs` stands for first come first serve. diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py index 413ab9e7c06..f4eec131e02 100644 --- a/python/sglang/__init__.py +++ b/python/sglang/__init__.py @@ -1,4 +1,5 @@ # SGL API Components + from sglang.api import ( Runtime, assistant, @@ -22,46 +23,46 @@ video, ) -# Global Configurations -from sglang.global_config import global_config - -# SGL Backends -from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint -from sglang.utils import LazyImport -from sglang.version import __version__ - -Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic") -LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM") -OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI") -VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI") - - -# public APIs management +# SGLang DSL APIs __all__ = [ - "global_config", - "Anthropic", - "LiteLLM", - "OpenAI", - "RuntimeEndpoint", - "VertexAI", - "function", "Runtime", - "set_default_backend", + "assistant", + "assistant_begin", + "assistant_end", "flush_cache", - "get_server_args", + "function", "gen", "gen_int", "gen_string", + "get_server_args", "image", - "video", "select", + "set_default_backend", "system", + "system_begin", + "system_end", "user", - "assistant", "user_begin", "user_end", - "assistant_begin", - "assistant_end", - "system_begin", - "system_end", + "video", ] + +# Global Configurations +from sglang.global_config import global_config + +__all__ += ["global_config"] + +from sglang.version import __version__ + +__all__ += ["__version__"] + +# SGL Backends +from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint +from sglang.utils import LazyImport + +Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic") +LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM") +OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI") +VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI") + +__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"] diff --git a/python/sglang/bench_latency.py b/python/sglang/bench_latency.py index c2eb93a241d..c4ffce634bd 100644 --- a/python/sglang/bench_latency.py +++ b/python/sglang/bench_latency.py @@ -37,9 +37,9 @@ import torch.distributed as dist from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.managers.controller.infer_batch import Batch, ForwardMode, Req -from sglang.srt.managers.controller.model_runner import ModelRunner +from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req from sglang.srt.model_config import ModelConfig +from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.sampling_params import SamplingParams from sglang.srt.server_args import ServerArgs from sglang.srt.utils import suppress_other_loggers diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py index d3aa2469af3..200071c6046 100644 --- a/python/sglang/srt/layers/logits_processor.py +++ b/python/sglang/srt/layers/logits_processor.py @@ -25,7 +25,7 @@ tensor_model_parallel_all_gather, ) -from sglang.srt.managers.controller.model_runner import ForwardMode, InputMetadata +from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata @dataclasses.dataclass diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index fb95106bef3..ab3a650290f 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -22,7 +22,7 @@ from sglang.global_config import global_config from sglang.srt.layers.extend_attention import extend_attention_fwd from sglang.srt.layers.token_attention import token_attention_fwd -from sglang.srt.managers.controller.model_runner import ( +from sglang.srt.model_executor.model_runner import ( ForwardMode, InputMetadata, global_server_args_dict, diff --git a/python/sglang/srt/layers/token_attention.py b/python/sglang/srt/layers/token_attention.py index 565e1359f0c..a792b7f3aee 100644 --- a/python/sglang/srt/layers/token_attention.py +++ b/python/sglang/srt/layers/token_attention.py @@ -20,7 +20,7 @@ import triton import triton.language as tl -from sglang.srt.managers.controller.infer_batch import global_server_args_dict +from sglang.srt.managers.schedule_batch import global_server_args_dict if global_server_args_dict.get("attention_reduce_in_fp32", False): REDUCE_TRITON_TYPE = tl.float32 diff --git a/python/sglang/srt/managers/controller/manager_multi.py b/python/sglang/srt/managers/controller_multi.py similarity index 99% rename from python/sglang/srt/managers/controller/manager_multi.py rename to python/sglang/srt/managers/controller_multi.py index 08c9db82b8f..dcd984e0f2d 100644 --- a/python/sglang/srt/managers/controller/manager_multi.py +++ b/python/sglang/srt/managers/controller_multi.py @@ -27,7 +27,7 @@ import numpy as np import zmq -from sglang.srt.managers.controller.manager_single import ( +from sglang.srt.managers.controller_single import ( start_controller_process as start_controller_process_single, ) from sglang.srt.managers.io_struct import ( diff --git a/python/sglang/srt/managers/controller/manager_single.py b/python/sglang/srt/managers/controller_single.py similarity index 98% rename from python/sglang/srt/managers/controller/manager_single.py rename to python/sglang/srt/managers/controller_single.py index 012d6c1d644..415325b131c 100644 --- a/python/sglang/srt/managers/controller/manager_single.py +++ b/python/sglang/srt/managers/controller_single.py @@ -22,7 +22,7 @@ import zmq -from sglang.srt.managers.controller.tp_worker import ( +from sglang.srt.managers.tp_worker import ( ModelTpServer, broadcast_recv_input, launch_tp_servers, diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py index b8607482e04..0bd03d31481 100644 --- a/python/sglang/srt/managers/detokenizer_manager.py +++ b/python/sglang/srt/managers/detokenizer_manager.py @@ -25,8 +25,8 @@ import zmq.asyncio from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.managers.controller.infer_batch import FINISH_MATCHED_STR from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut +from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR from sglang.srt.server_args import PortArgs, ServerArgs from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py index f0b927a69e8..036837a3726 100644 --- a/python/sglang/srt/managers/io_struct.py +++ b/python/sglang/srt/managers/io_struct.py @@ -22,7 +22,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Union -from sglang.srt.managers.controller.infer_batch import BaseFinishReason +from sglang.srt.managers.schedule_batch import BaseFinishReason from sglang.srt.sampling_params import SamplingParams diff --git a/python/sglang/srt/managers/controller/schedule_heuristic.py b/python/sglang/srt/managers/policy_scheduler.py similarity index 82% rename from python/sglang/srt/managers/controller/schedule_heuristic.py rename to python/sglang/srt/managers/policy_scheduler.py index d1f45836b3c..0eecc41d85b 100644 --- a/python/sglang/srt/managers/controller/schedule_heuristic.py +++ b/python/sglang/srt/managers/policy_scheduler.py @@ -13,47 +13,47 @@ limitations under the License. """ -"""Request scheduler heuristic.""" +"""Request policy scheduler""" import random from collections import defaultdict -class ScheduleHeuristic: +class PolicyScheduler: def __init__( self, - schedule_heuristic, + policy, max_running_seqs, max_prefill_num_tokens, max_total_num_tokens, tree_cache, ): - if tree_cache.disable and schedule_heuristic == "lpm": + if tree_cache.disable and policy == "lpm": # LMP is meaningless when the tree cache is disabled. - schedule_heuristic = "fcfs" + policy = "fcfs" - self.schedule_heuristic = schedule_heuristic + self.policy = policy self.max_running_seqs = max_running_seqs self.max_prefill_num_tokens = max_prefill_num_tokens self.max_total_num_tokens = max_total_num_tokens self.tree_cache = tree_cache def get_priority_queue(self, waiting_queue): - if self.schedule_heuristic == "lpm": + if self.policy == "lpm": # longest prefix match waiting_queue.sort(key=lambda x: -len(x.prefix_indices)) return waiting_queue - elif self.schedule_heuristic == "fcfs": + elif self.policy == "fcfs": # first come first serve return waiting_queue - elif self.schedule_heuristic == "lof": + elif self.policy == "lof": # longest output first waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens) return waiting_queue - elif self.schedule_heuristic == "random": + elif self.policy == "random": random.shuffle(waiting_queue) return waiting_queue - elif self.schedule_heuristic == "dfs-weight": + elif self.policy == "dfs-weight": last_node_to_reqs = defaultdict(list) for req in waiting_queue: last_node_to_reqs[req.last_node].append(req) @@ -70,7 +70,7 @@ def get_priority_queue(self, waiting_queue): assert len(q) == len(waiting_queue) return q else: - raise ValueError(f"Unknown schedule_heuristic: {self.schedule_heuristic}") + raise ValueError(f"Unknown schedule_policy: {self.policy}") def calc_weight(self, cur_node, node_to_weight): for child in cur_node.children.values(): diff --git a/python/sglang/srt/managers/controller/infer_batch.py b/python/sglang/srt/managers/schedule_batch.py similarity index 99% rename from python/sglang/srt/managers/controller/infer_batch.py rename to python/sglang/srt/managers/schedule_batch.py index a80a9d657e2..6cfd2f6509e 100644 --- a/python/sglang/srt/managers/controller/infer_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -28,8 +28,8 @@ from sglang.global_config import global_config from sglang.srt.constrained import RegexGuide from sglang.srt.constrained.jump_forward import JumpForwardMap -from sglang.srt.managers.controller.radix_cache import RadixCache -from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool +from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool +from sglang.srt.mem_cache.radix_cache import RadixCache INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5 diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/tp_worker.py similarity index 98% rename from python/sglang/srt/managers/controller/tp_worker.py rename to python/sglang/srt/managers/tp_worker.py index a688c53e302..d21a0c694af 100644 --- a/python/sglang/srt/managers/controller/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -29,23 +29,23 @@ from sglang.srt.constrained.fsm_cache import FSMCache from sglang.srt.constrained.jump_forward import JumpForwardCache from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer -from sglang.srt.managers.controller.infer_batch import ( - FINISH_ABORT, - BaseFinishReason, - Batch, - ForwardMode, - Req, -) -from sglang.srt.managers.controller.model_runner import ModelRunner -from sglang.srt.managers.controller.radix_cache import RadixCache -from sglang.srt.managers.controller.schedule_heuristic import ScheduleHeuristic from sglang.srt.managers.io_struct import ( AbortReq, BatchTokenIDOut, FlushCacheReq, TokenizedGenerateReqInput, ) +from sglang.srt.managers.policy_scheduler import PolicyScheduler +from sglang.srt.managers.schedule_batch import ( + FINISH_ABORT, + BaseFinishReason, + Batch, + ForwardMode, + Req, +) +from sglang.srt.mem_cache.radix_cache import RadixCache from sglang.srt.model_config import ModelConfig +from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( get_int_token_logit_bias, @@ -74,7 +74,7 @@ def __init__( self.tp_rank = tp_rank self.tp_size = server_args.tp_size self.dp_size = server_args.dp_size - self.schedule_heuristic = server_args.schedule_heuristic + self.schedule_policy = server_args.schedule_policy self.disable_regex_jump_forward = server_args.disable_regex_jump_forward # Chunked prefill @@ -150,8 +150,8 @@ def __init__( disable=server_args.disable_radix_cache, ) self.tree_cache_metrics = {"total": 0, "hit": 0} - self.scheduler = ScheduleHeuristic( - self.schedule_heuristic, + self.scheduler = PolicyScheduler( + self.schedule_policy, self.max_running_requests, self.max_prefill_tokens, self.max_total_num_tokens, diff --git a/python/sglang/srt/flush_cache.py b/python/sglang/srt/mem_cache/flush_cache.py similarity index 92% rename from python/sglang/srt/flush_cache.py rename to python/sglang/srt/mem_cache/flush_cache.py index 4ef3ab1d3b6..3ac425ac8c0 100644 --- a/python/sglang/srt/flush_cache.py +++ b/python/sglang/srt/mem_cache/flush_cache.py @@ -17,7 +17,7 @@ Flush the KV cache. Usage: -python3 -m sglang.srt.flush_cache --url http://localhost:30000 +python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000 """ import argparse diff --git a/python/sglang/srt/memory_pool.py b/python/sglang/srt/mem_cache/memory_pool.py similarity index 100% rename from python/sglang/srt/memory_pool.py rename to python/sglang/srt/mem_cache/memory_pool.py diff --git a/python/sglang/srt/managers/controller/radix_cache.py b/python/sglang/srt/mem_cache/radix_cache.py similarity index 100% rename from python/sglang/srt/managers/controller/radix_cache.py rename to python/sglang/srt/mem_cache/radix_cache.py diff --git a/python/sglang/srt/managers/controller/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py similarity index 99% rename from python/sglang/srt/managers/controller/cuda_graph_runner.py rename to python/sglang/srt/model_executor/cuda_graph_runner.py index 7d59eeef521..458395e7328 100644 --- a/python/sglang/srt/managers/controller/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -29,7 +29,7 @@ LogitsMetadata, LogitsProcessor, ) -from sglang.srt.managers.controller.infer_batch import ( +from sglang.srt.managers.schedule_batch import ( Batch, ForwardMode, InputMetadata, diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/model_executor/model_runner.py similarity index 98% rename from python/sglang/srt/managers/controller/model_runner.py rename to python/sglang/srt/model_executor/model_runner.py index 24c59b6ff85..10b1b40ded8 100644 --- a/python/sglang/srt/managers/controller/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -40,13 +40,13 @@ from vllm.model_executor.models import ModelRegistry from sglang.global_config import global_config -from sglang.srt.managers.controller.infer_batch import ( +from sglang.srt.managers.schedule_batch import ( Batch, ForwardMode, InputMetadata, global_server_args_dict, ) -from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool +from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPool from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( get_available_gpu_memory, @@ -273,7 +273,7 @@ def init_flash_infer(self): ) def init_cuda_graphs(self): - from sglang.srt.managers.controller.cuda_graph_runner import CudaGraphRunner + from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner if self.server_args.disable_cuda_graph or self.server_args.disable_flashinfer: self.cuda_graph_runner = None diff --git a/python/sglang/srt/models/chatglm.py b/python/sglang/srt/models/chatglm.py index 9df6e4fd34d..4589a14aca5 100644 --- a/python/sglang/srt/models/chatglm.py +++ b/python/sglang/srt/models/chatglm.py @@ -45,7 +45,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata LoraConfig = None diff --git a/python/sglang/srt/models/commandr.py b/python/sglang/srt/models/commandr.py index cc4ce9d4a2e..671746bf7bf 100644 --- a/python/sglang/srt/models/commandr.py +++ b/python/sglang/srt/models/commandr.py @@ -64,7 +64,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata @torch.compile diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index 3104ca7c313..1d0f40bd3c9 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -45,7 +45,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class DbrxRouter(nn.Module): diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index c12b8a09c12..09481e71b52 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -46,7 +46,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.infer_batch import InputMetadata +from sglang.srt.managers.schedule_batch import InputMetadata class DeepseekMLP(nn.Module): diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index ae3d06ed022..4cc37c388a9 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -45,7 +45,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class DeepseekV2MLP(nn.Module): diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py index aa42ad5082c..843bc5d28df 100644 --- a/python/sglang/srt/models/gemma.py +++ b/python/sglang/srt/models/gemma.py @@ -37,7 +37,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class GemmaMLP(nn.Module): diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py index 02f20e70516..4c77e0c695d 100644 --- a/python/sglang/srt/models/gemma2.py +++ b/python/sglang/srt/models/gemma2.py @@ -42,7 +42,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class GemmaRMSNorm(CustomOp): diff --git a/python/sglang/srt/models/gpt_bigcode.py b/python/sglang/srt/models/gpt_bigcode.py index 0ac89f64897..eee7f648379 100644 --- a/python/sglang/srt/models/gpt_bigcode.py +++ b/python/sglang/srt/models/gpt_bigcode.py @@ -35,7 +35,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.infer_batch import InputMetadata +from sglang.srt.managers.schedule_batch import InputMetadata class GPTBigCodeAttention(nn.Module): diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 9c4251b0921..b989c4e79ed 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -52,7 +52,7 @@ from sglang.srt.layers.fused_moe import fused_moe from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata use_fused = True diff --git a/python/sglang/srt/models/internlm2.py b/python/sglang/srt/models/internlm2.py index bf6d99e3c86..35f81f8a9a8 100644 --- a/python/sglang/srt/models/internlm2.py +++ b/python/sglang/srt/models/internlm2.py @@ -40,7 +40,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class InternLM2MLP(nn.Module): diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py index 2287cf0a1d9..3e24e7b9cae 100644 --- a/python/sglang/srt/models/llama2.py +++ b/python/sglang/srt/models/llama2.py @@ -36,7 +36,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata MergedColumnParallelLinear = None QKVParallelLinear = None diff --git a/python/sglang/srt/models/llama_classification.py b/python/sglang/srt/models/llama_classification.py index f96eae0932f..3ffb256dd22 100644 --- a/python/sglang/srt/models/llama_classification.py +++ b/python/sglang/srt/models/llama_classification.py @@ -25,7 +25,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from sglang.srt.layers.logits_processor import LogitProcessorOutput -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata from sglang.srt.models.llama2 import LlamaModel diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py index 2fcc4e99837..f89a9b61878 100644 --- a/python/sglang/srt/models/llava.py +++ b/python/sglang/srt/models/llava.py @@ -32,13 +32,13 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.managers.controller.infer_batch import ForwardMode -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.managers.schedule_batch import ForwardMode from sglang.srt.mm_utils import ( get_anyres_image_grid_shape, unpad_image, unpad_image_shape, ) +from sglang.srt.model_executor.model_runner import InputMetadata from sglang.srt.models.llama2 import LlamaForCausalLM from sglang.srt.models.mistral import MistralForCausalLM from sglang.srt.models.qwen2 import Qwen2ForCausalLM diff --git a/python/sglang/srt/models/llavavid.py b/python/sglang/srt/models/llavavid.py index 1f08137c618..3f88d41a192 100644 --- a/python/sglang/srt/models/llavavid.py +++ b/python/sglang/srt/models/llavavid.py @@ -26,13 +26,13 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.managers.controller.infer_batch import ForwardMode -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.managers.schedule_batch import ForwardMode from sglang.srt.mm_utils import ( get_anyres_image_grid_shape, unpad_image, unpad_image_shape, ) +from sglang.srt.model_executor.model_runner import InputMetadata from sglang.srt.models.llama2 import LlamaForCausalLM diff --git a/python/sglang/srt/models/minicpm.py b/python/sglang/srt/models/minicpm.py index 7a07335d1b3..ab2a083255b 100644 --- a/python/sglang/srt/models/minicpm.py +++ b/python/sglang/srt/models/minicpm.py @@ -39,7 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class MiniCPMMLP(nn.Module): diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 0cfbad7194b..a7d45d455a3 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -50,7 +50,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class MixtralMoE(nn.Module): diff --git a/python/sglang/srt/models/mixtral_quant.py b/python/sglang/srt/models/mixtral_quant.py index fce04cc8975..d643db33f35 100644 --- a/python/sglang/srt/models/mixtral_quant.py +++ b/python/sglang/srt/models/mixtral_quant.py @@ -45,7 +45,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class MixtralMLP(nn.Module): diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py index cf6b264f32f..52edd28bccf 100644 --- a/python/sglang/srt/models/qwen.py +++ b/python/sglang/srt/models/qwen.py @@ -39,7 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class QWenMLP(nn.Module): diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py index 80ab61b64bb..2df91814e39 100644 --- a/python/sglang/srt/models/qwen2.py +++ b/python/sglang/srt/models/qwen2.py @@ -39,7 +39,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata Qwen2Config = None diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index 213ba6d3c54..7475d8f6297 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -51,7 +51,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class Qwen2MoeMLP(nn.Module): diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 4589c997ca8..76f40437a2b 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -40,7 +40,7 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.managers.controller.model_runner import InputMetadata +from sglang.srt.model_executor.model_runner import InputMetadata class StablelmMLP(nn.Module): diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index f1b5dae9c93..4c8ace96289 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -44,11 +44,11 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.constrained import disable_cache from sglang.srt.hf_transformers_utils import get_tokenizer -from sglang.srt.managers.controller.manager_multi import ( +from sglang.srt.managers.controller_multi import ( start_controller_process as start_controller_process_multi, ) -from sglang.srt.managers.controller.manager_single import launch_tp_servers -from sglang.srt.managers.controller.manager_single import ( +from sglang.srt.managers.controller_single import launch_tp_servers +from sglang.srt.managers.controller_single import ( start_controller_process as start_controller_process_single, ) from sglang.srt.managers.detokenizer_manager import start_detokenizer_process diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 8b3de98e25f..e62987dd9d8 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -44,7 +44,7 @@ class ServerArgs: max_prefill_tokens: Optional[int] = None max_running_requests: Optional[int] = None max_num_reqs: Optional[int] = None - schedule_heuristic: str = "lpm" + schedule_policy: str = "lpm" schedule_conservativeness: float = 1.0 # Other runtime options @@ -231,11 +231,11 @@ def add_cli_args(parser: argparse.ArgumentParser): help="The maximum number of requests to serve in the memory pool. If the model have a large context length, you may need to decrease this value to avoid out-of-memory errors.", ) parser.add_argument( - "--schedule-heuristic", + "--schedule-policy", type=str, - default=ServerArgs.schedule_heuristic, + default=ServerArgs.schedule_policy, choices=["lpm", "random", "fcfs", "dfs-weight"], - help="The scheduling heuristic.", + help="The scheduling policy of the requests.", ) parser.add_argument( "--schedule-conservativeness", From bece265f5a189b23bac9ad31d140e11072d5efdf Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 30 Jul 2024 16:17:50 +1000 Subject: [PATCH 2/6] docs: update README (#819) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ee93680e67d..9e85f4541bb 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ ### Method 2: From source ``` -git clone https://github.com/sgl-project/sglang.git +# Use the stable rel branch +git clone -b rel https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip From daf593a385db3f50cdaf8a71fb1f37548cd73bf8 Mon Sep 17 00:00:00 2001 From: ObjectNotFound <13832753+objnf-dev@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:32:07 +0800 Subject: [PATCH 3/6] Fix streaming bug (#820) --- python/sglang/lang/interpreter.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py index 61f0a025900..0133b97c749 100644 --- a/python/sglang/lang/interpreter.py +++ b/python/sglang/lang/interpreter.py @@ -553,6 +553,7 @@ def _execute_select(self, expr: SglSelect): "output_token_logprobs": output_token_logprobs, } self.variable_event[name].set() + self.stream_var_event[name].set() self.text_ += decision def _execute_variable(self, expr: SglVariable): @@ -778,7 +779,14 @@ def text_iter(self, var_name: Optional[str] = None): if self.stream_executor.is_finished: break else: - event = self.stream_executor.stream_var_event[var_name] + event = None + while not event: + if var_name in self.stream_executor.stream_var_event: + event = self.stream_executor.stream_var_event[var_name] + if self.stream_executor.is_finished: + yield "" + return + while True: event.wait() event.clear() @@ -813,7 +821,14 @@ async def text_async_iter( if self.stream_executor.is_finished: break else: - event = self.stream_executor.stream_var_event[var_name] + event = None + while not event: + if var_name in self.stream_executor.stream_var_event: + event = self.stream_executor.stream_var_event[var_name] + if self.stream_executor.is_finished: + yield "" + return + while True: await loop.run_in_executor(None, event.wait) event.clear() From 17af39c5dc6a20f39d5a68dd1ac668477eacadce Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 30 Jul 2024 17:32:13 +1000 Subject: [PATCH 4/6] feat: add runner (#821) --- .github/workflows/pr-e2e-test.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/pr-e2e-test.yml diff --git a/.github/workflows/pr-e2e-test.yml b/.github/workflows/pr-e2e-test.yml new file mode 100644 index 00000000000..7f6a58407d4 --- /dev/null +++ b/.github/workflows/pr-e2e-test.yml @@ -0,0 +1,30 @@ +name: PR E2E Test + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +jobs: + gpu-job: + runs-on: self-hosted + env: + CUDA_VISIBLE_DEVICES: 6 + steps: + - uses: actions/checkout@v2 + - name: Check GPU + run: | + if ! command -v nvidia-smi &> /dev/null; then + echo "nvidia-smi not found. Is CUDA installed?" + exit 1 + fi + nvidia-smi || exit 1 + - name: Environment Info + run: | + echo "Working directory: $(pwd)" + echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" + - name: Run Tests + run: | + echo "Running tests..." From a30d5d75bfde72c99fbd5ffc30a309e793520f66 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Tue, 30 Jul 2024 18:31:26 +1000 Subject: [PATCH 5/6] feat: add pr e2e test (#822) --- .github/workflows/pr-e2e-test.yml | 43 +++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/.github/workflows/pr-e2e-test.yml b/.github/workflows/pr-e2e-test.yml index 7f6a58407d4..4942e89f015 100644 --- a/.github/workflows/pr-e2e-test.yml +++ b/.github/workflows/pr-e2e-test.yml @@ -12,19 +12,36 @@ jobs: runs-on: self-hosted env: CUDA_VISIBLE_DEVICES: 6 + steps: - - uses: actions/checkout@v2 - - name: Check GPU - run: | - if ! command -v nvidia-smi &> /dev/null; then - echo "nvidia-smi not found. Is CUDA installed?" - exit 1 - fi - nvidia-smi || exit 1 - - name: Environment Info + - name: Checkout code + uses: actions/checkout@v3 + + - name: Install dependencies run: | - echo "Working directory: $(pwd)" - echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" - - name: Run Tests + pip install --upgrade pip + pip install -e "python[all]" + pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ --force-reinstall + pip install --upgrade transformers + + - name: Launch server and run benchmark run: | - echo "Running tests..." + python3 -m sglang.launch_server --model /home/lmzheng/zhyncs/Meta-Llama-3.1-8B-Instruct --port 8413 & + + echo "Waiting for server to start..." + for i in {1..60}; do + if curl -s http://127.0.0.1:8413/health; then + echo "Server is up!" + break + fi + if [ $i -eq 60 ]; then + echo "Server failed to start within 60 seconds" + exit 1 + fi + sleep 1 + done + + python3 -m sglang.bench_serving --backend sglang --port 8413 + + echo "Stopping server..." + kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}') From ae5c0fc442716e9fdc6fddba33c970ab3fe6f208 Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Tue, 30 Jul 2024 01:42:07 -0700 Subject: [PATCH 6/6] Support disable_ignore_eos in bench_serving.py (#824) --- python/sglang/bench_serving.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 839f947c1b6..3d970d3a9cb 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -84,6 +84,9 @@ async def async_request_trt_llm( "min_length": request_func_input.output_len, "end_id": 1048576, } + if args.disable_ignore_eos: + del payload["min_length"] + del payload["end_id"] output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -149,7 +152,7 @@ async def async_request_openai_completions( "best_of": 1, "max_tokens": request_func_input.output_len, "stream": not args.disable_stream, - "ignore_eos": True, + "ignore_eos": not args.disable_ignore_eos, } headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} @@ -969,6 +972,11 @@ def set_ulimit(target_soft_limit=65535): action="store_true", help="Disable streaming mode.", ) + parser.add_argument( + "--disable-ignore-eos", + action="store_true", + help="Disable ignoring EOS.", + ) set_ulimit()