diff --git a/python/sglang/api.py b/python/sglang/api.py
index 9470b1425d..fc6dce16e3 100644
--- a/python/sglang/api.py
+++ b/python/sglang/api.py
@@ -3,22 +3,10 @@
 import re
 from typing import Callable, List, Optional, Union
 
-from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
-from sglang.backend.openai import OpenAI
-from sglang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
-from sglang.lang.ir import (
-    SglExpr,
-    SglExprList,
-    SglFunction,
-    SglGen,
-    SglImage,
-    SglRoleBegin,
-    SglRoleEnd,
-    SglSelect,
-)
+from sglang.lang.ir import (SglExpr, SglExprList, SglFunction, SglGen,
+                            SglImage, SglRoleBegin, SglRoleEnd, SglSelect)
 
 
 def function(
diff --git a/python/sglang/backend/anthropic.py b/python/sglang/backend/anthropic.py
index aa03cb5b6c..05cf4e77e7 100644
--- a/python/sglang/backend/anthropic.py
+++ b/python/sglang/backend/anthropic.py
@@ -1,6 +1,4 @@
-from typing import List, Optional, Union
 
-import numpy as np
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
diff --git a/python/sglang/backend/base_backend.py b/python/sglang/backend/base_backend.py
index cb504f51b7..606b821a87 100644
--- a/python/sglang/backend/base_backend.py
+++ b/python/sglang/backend/base_backend.py
@@ -1,4 +1,4 @@
-from typing import Callable, List, Optional, Union
+from typing import List, Optional, Union
 
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
diff --git a/python/sglang/backend/openai.py b/python/sglang/backend/openai.py
index f2dd2f0678..540baae5dd 100644
--- a/python/sglang/backend/openai.py
+++ b/python/sglang/backend/openai.py
@@ -1,17 +1,17 @@
 import logging
 import time
-from typing import Callable, List, Optional, Union
+from typing import List, Optional
 
 import numpy as np
 from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import ChatTemplate, get_chat_template_by_model_path
+from sglang.lang.chat_template import (ChatTemplate,
+                                       get_chat_template_by_model_path)
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
 
 try:
-    import tiktoken
-    import openai
+    import tiktoken
 except ImportError as e:
     openai = tiktoken = e
 
 
diff --git a/python/sglang/backend/runtime_endpoint.py b/python/sglang/backend/runtime_endpoint.py
index 3d2ecaa762..bd2c053ffa 100644
--- a/python/sglang/backend/runtime_endpoint.py
+++ b/python/sglang/backend/runtime_endpoint.py
@@ -1,14 +1,13 @@
 import json
-from typing import Callable, List, Optional, Union
+from typing import List, Optional
 
 import numpy as np
-import requests
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import SglArgument, SglSamplingParams
-from sglang.utils import encode_image_base64, find_printable_text, http_request
+from sglang.lang.ir import SglSamplingParams
+from sglang.utils import find_printable_text, http_request
 
 
 class RuntimeEndpoint(BaseBackend):
diff --git a/python/sglang/backend/vertexai.py b/python/sglang/backend/vertexai.py
index 5c3c307e2e..4c130ff78d 100644
--- a/python/sglang/backend/vertexai.py
+++ b/python/sglang/backend/vertexai.py
@@ -1,8 +1,6 @@
 import os
 import warnings
-from typing import List, Optional, Union
 
-import numpy as np
 from sglang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template
 from sglang.lang.interpreter import StreamExecutor
@@ -10,11 +8,8 @@
 
 try:
     import vertexai
-    from vertexai.preview.generative_models import (
-        GenerationConfig,
-        GenerativeModel,
-        Image,
-    )
+    from vertexai.preview.generative_models import (GenerationConfig,
+                                                    GenerativeModel, Image)
 except ImportError as e:
     GenerativeModel = e
 
diff --git a/python/sglang/lang/chat_template.py b/python/sglang/lang/chat_template.py
index 43303bf4c7..aa4bc2f2a3 100644
--- a/python/sglang/lang/chat_template.py
+++ b/python/sglang/lang/chat_template.py
@@ -1,6 +1,6 @@
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Tuple
 
 
 class ChatTemplateStyle(Enum):
diff --git a/python/sglang/lang/compiler.py b/python/sglang/lang/compiler.py
index 2c071e407e..b2a83ea3c2 100644
--- a/python/sglang/lang/compiler.py
+++ b/python/sglang/lang/compiler.py
@@ -5,13 +5,7 @@
 
 from sglang.global_config import global_config
 from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program
-from sglang.lang.ir import (
-    SglArgument,
-    SglConstantText,
-    SglExpr,
-    SglSamplingParams,
-    SglVariable,
-)
+from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable
 
 
 def compile_func(function, backend):
diff --git a/python/sglang/lang/interpreter.py b/python/sglang/lang/interpreter.py
index 08a8d401bc..83c4d915a5 100644
--- a/python/sglang/lang/interpreter.py
+++ b/python/sglang/lang/interpreter.py
@@ -7,26 +7,14 @@
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional
 
 import tqdm
 from sglang.global_config import global_config
-from sglang.lang.ir import (
-    SglCommitLazy,
-    SglConcateAndAppend,
-    SglConstantText,
-    SglExpr,
-    SglExprList,
-    SglFunction,
-    SglGen,
-    SglImage,
-    SglRoleBegin,
-    SglRoleEnd,
-    SglSelect,
-    SglVariable,
-    SglVarScopeBegin,
-    SglVarScopeEnd,
-)
+from sglang.lang.ir import (SglCommitLazy, SglConcateAndAppend,
+                            SglConstantText, SglExpr, SglExprList, SglGen,
+                            SglImage, SglRoleBegin, SglRoleEnd, SglSelect,
+                            SglVariable, SglVarScopeBegin, SglVarScopeEnd)
 from sglang.utils import encode_image_base64
 
 
diff --git a/python/sglang/lang/ir.py b/python/sglang/lang/ir.py
index 9895786dc6..66f515686e 100644
--- a/python/sglang/lang/ir.py
+++ b/python/sglang/lang/ir.py
@@ -472,4 +472,4 @@ def __init__(self):
         super().__init__()
 
     def __repr__(self):
-        return f"CommitLazy()"
+        return "CommitLazy()"
diff --git a/python/sglang/lang/tracer.py b/python/sglang/lang/tracer.py
index 74ac9b9986..fcf618b695 100644
--- a/python/sglang/lang/tracer.py
+++ b/python/sglang/lang/tracer.py
@@ -1,29 +1,14 @@
 """Tracing a program."""
 
 import uuid
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 
 from sglang.backend.base_backend import BaseBackend
-from sglang.global_config import global_config
 from sglang.lang.interpreter import ProgramState, ProgramStateGroup
-from sglang.lang.ir import (
-    SglArgument,
-    SglCommitLazy,
-    SglConcateAndAppend,
-    SglConstantText,
-    SglExpr,
-    SglExprList,
-    SglFork,
-    SglFunction,
-    SglGen,
-    SglGetForkItem,
-    SglRoleBegin,
-    SglRoleEnd,
-    SglSelect,
-    SglVariable,
-    SglVarScopeBegin,
-    SglVarScopeEnd,
-)
+from sglang.lang.ir import (SglArgument, SglConstantText, SglExpr, SglExprList,
+                            SglFork, SglGen, SglGetForkItem, SglRoleBegin,
+                            SglRoleEnd, SglSelect, SglVariable,
+                            SglVarScopeBegin, SglVarScopeEnd)
 
 
 class StopTracing(Exception):
diff --git a/python/sglang/srt/hf_transformers_utils.py b/python/sglang/srt/hf_transformers_utils.py
index fde8457a39..48de79dc9d 100644
--- a/python/sglang/srt/hf_transformers_utils.py
+++ b/python/sglang/srt/hf_transformers_utils.py
@@ -3,17 +3,12 @@
 import json
 import os
 import warnings
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Union
 
 from huggingface_hub import snapshot_download
 from sglang.srt.utils import is_multimodal_model
-from transformers import (
-    AutoConfig,
-    AutoProcessor,
-    AutoTokenizer,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-)
+from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
+                          PreTrainedTokenizer, PreTrainedTokenizerFast)
 
 
 def download_from_hf(model_path: str):
diff --git a/python/sglang/srt/layers/extend_attention.py b/python/sglang/srt/layers/extend_attention.py
index 62167a5825..6edc140669 100644
--- a/python/sglang/srt/layers/extend_attention.py
+++ b/python/sglang/srt/layers/extend_attention.py
@@ -1,7 +1,8 @@
 import torch
 import triton
 import triton.language as tl
-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.context_flashattention_nopad import \
+    context_attention_fwd
 from sglang.srt.utils import wrap_kernel_launcher
 
 CUDA_CAPABILITY = torch.cuda.get_device_capability()
diff --git a/python/sglang/srt/layers/logits_processor.py b/python/sglang/srt/layers/logits_processor.py
index 980a2cd207..bbca95b347 100644
--- a/python/sglang/srt/layers/logits_processor.py
+++ b/python/sglang/srt/layers/logits_processor.py
@@ -1,10 +1,8 @@
 import torch
-from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
+from sglang.srt.managers.router.model_runner import ForwardMode
 from torch import nn
 from vllm.model_executor.parallel_utils.communication_op import (
-    get_tensor_model_parallel_world_size,
-    tensor_model_parallel_all_gather,
-)
+    get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather)
 
 
 class LogitsProcessor(nn.Module):
diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py
index 5adc31d3ef..180edb225e 100644
--- a/python/sglang/srt/layers/radix_attention.py
+++ b/python/sglang/srt/layers/radix_attention.py
@@ -1,5 +1,6 @@
 import torch
-from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.layers.context_flashattention_nopad import \
+    context_attention_fwd
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
 from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
@@ -15,7 +16,8 @@ def __init__(self, num_heads, head_dim, scaling, num_kv_heads, layer_id):
         self.head_dim = head_dim
         self.layer_id = layer_id
 
-        from sglang.srt.managers.router.model_runner import global_server_args_dict
+        from sglang.srt.managers.router.model_runner import \
+            global_server_args_dict
 
         if global_server_args_dict.get("enable_flashinfer", False):
             self.prefill_forward = self.prefill_forward_flashinfer
diff --git a/python/sglang/srt/managers/detokenizer_manager.py b/python/sglang/srt/managers/detokenizer_manager.py
index 566d40d13a..7b507df241 100644
--- a/python/sglang/srt/managers/detokenizer_manager.py
+++ b/python/sglang/srt/managers/detokenizer_manager.py
@@ -83,7 +83,7 @@ def start_detokenizer_process(
 ):
     try:
         manager = DetokenizerManager(server_args, port_args)
-    except Exception as e:
+    except Exception:
         pipe_writer.send(get_exception_traceback())
         raise
     pipe_writer.send("init ok")
diff --git a/python/sglang/srt/managers/router/model_rpc.py b/python/sglang/srt/managers/router/model_rpc.py
index 5c9be20959..0db091d1a2 100644
--- a/python/sglang/srt/managers/router/model_rpc.py
+++ b/python/sglang/srt/managers/router/model_rpc.py
@@ -13,23 +13,17 @@
 from sglang.srt.constrained.fsm_cache import FSMCache
 from sglang.srt.constrained.jump_forward import JumpForwardCache
 from sglang.srt.hf_transformers_utils import get_processor, get_tokenizer
-from sglang.srt.managers.io_struct import (
-    BatchTokenIDOut,
-    FlushCacheReq,
-    TokenizedGenerateReqInput,
-)
+from sglang.srt.managers.io_struct import (BatchTokenIDOut, FlushCacheReq,
+                                           TokenizedGenerateReqInput)
 from sglang.srt.managers.router.infer_batch import Batch, ForwardMode, Req
 from sglang.srt.managers.router.model_runner import ModelRunner
 from sglang.srt.managers.router.radix_cache import RadixCache
 from sglang.srt.managers.router.scheduler import Scheduler
 from sglang.srt.model_config import ModelConfig
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    get_exception_traceback,
-    get_int_token_logit_bias,
-    is_multimodal_model,
-    set_random_seed,
-)
+from sglang.srt.utils import (get_exception_traceback,
+                              get_int_token_logit_bias, is_multimodal_model,
+                              set_random_seed)
 from vllm.logger import _default_handler as vllm_default_handler
 
 logger = logging.getLogger("model_rpc")
diff --git a/python/sglang/srt/managers/router/model_runner.py b/python/sglang/srt/managers/router/model_runner.py
index f349819f30..f564ed4859 100644
--- a/python/sglang/srt/managers/router/model_runner.py
+++ b/python/sglang/srt/managers/router/model_runner.py
@@ -16,7 +16,8 @@
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.model_loader import _set_default_torch_dtype
-from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel
+from vllm.model_executor.parallel_utils.parallel_state import \
+    initialize_model_parallel
 
 QUANTIONCONFIG_MAPPING = {"awq": AWQConfig, "gptq": GPTQConfig, "marlin": MarlinConfig}
 
@@ -92,10 +93,8 @@ class InputMetadata:
     decode_wrapper = None
 
     def init_flashinfer_args(self, tp_size):
-        from flashinfer import (
-            BatchDecodeWithPagedKVCacheWrapper,
-            BatchPrefillWithPagedKVCacheWrapper,
-        )
+        from flashinfer import (BatchDecodeWithPagedKVCacheWrapper,
+                                BatchPrefillWithPagedKVCacheWrapper)
 
         self.kv_indptr = torch.zeros(
             (self.batch_size + 1,), dtype=torch.int32, device="cuda"
diff --git a/python/sglang/srt/managers/router/radix_cache.py b/python/sglang/srt/managers/router/radix_cache.py
index 6ee6703091..b6d70583e0 100644
--- a/python/sglang/srt/managers/router/radix_cache.py
+++ b/python/sglang/srt/managers/router/radix_cache.py
@@ -1,8 +1,6 @@
 import heapq
 import time
 from collections import defaultdict
-from dataclasses import dataclass
-from typing import Tuple
 
 import torch
 
diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
index 7947ca2ff5..8c82dd5aaa 100644
--- a/python/sglang/srt/managers/tokenizer_manager.py
+++ b/python/sglang/srt/managers/tokenizer_manager.py
@@ -10,23 +10,16 @@
 import uvloop
 import zmq
 import zmq.asyncio
-from sglang.srt.hf_transformers_utils import (
-    get_config,
-    get_context_length,
-    get_processor,
-    get_tokenizer,
-)
-from sglang.srt.managers.io_struct import (
-    BatchStrOut,
-    DetokenizeReqInput,
-    FlushCacheReq,
-    GenerateReqInput,
-    TokenizedGenerateReqInput,
-)
+from sglang.srt.hf_transformers_utils import (get_config, get_context_length,
+                                              get_processor, get_tokenizer)
+from sglang.srt.managers.io_struct import (BatchStrOut, DetokenizeReqInput,
+                                           FlushCacheReq, GenerateReqInput,
+                                           TokenizedGenerateReqInput)
 from sglang.srt.mm_utils import expand2square, process_anyres_image
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import get_exception_traceback, is_multimodal_model, load_image
+from sglang.srt.utils import (get_exception_traceback, is_multimodal_model,
+                              load_image)
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
diff --git a/python/sglang/srt/models/gemma.py b/python/sglang/srt/models/gemma.py
index 4030c5cd7f..2e7ada43e7 100644
--- a/python/sglang/srt/models/gemma.py
+++ b/python/sglang/srt/models/gemma.py
@@ -12,21 +12,17 @@
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import GeluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    LinearMethodBase,
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+from vllm.model_executor.layers.vocab_parallel_embedding import \
+    VocabParallelEmbedding
+from vllm.model_executor.parallel_utils.parallel_state import \
+    get_tensor_model_parallel_world_size
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
 
 
 class GemmaMLP(nn.Module):
diff --git a/python/sglang/srt/models/llama2.py b/python/sglang/srt/models/llama2.py
index e5c28fa127..559b598c67 100644
--- a/python/sglang/srt/models/llama2.py
+++ b/python/sglang/srt/models/llama2.py
@@ -1,7 +1,7 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/671af2b1c0b3ed6d856d37c21a561cc429a10701/vllm/model_executor/models/llama.py#L1
 """Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 
 import torch
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -11,24 +11,17 @@
 from transformers import LlamaConfig
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    LinearMethodBase,
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.parallel_utils.parallel_state import \
+    get_tensor_model_parallel_world_size
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
 
 
 class LlamaMLP(nn.Module):
diff --git a/python/sglang/srt/models/llava.py b/python/sglang/srt/models/llava.py
index 8e42d48c79..4bd6991848 100644
--- a/python/sglang/srt/models/llava.py
+++ b/python/sglang/srt/models/llava.py
@@ -6,20 +6,15 @@
 import torch
 from sglang.srt.managers.router.infer_batch import ForwardMode
 from sglang.srt.managers.router.model_runner import InputMetadata
-from sglang.srt.mm_utils import (
-    get_anyres_image_grid_shape,
-    unpad_image,
-    unpad_image_shape,
-)
+from sglang.srt.mm_utils import (get_anyres_image_grid_shape, unpad_image,
+                                 unpad_image_shape)
 from sglang.srt.models.llama2 import LlamaForCausalLM
 from torch import nn
-from transformers import CLIPVisionModel, LlamaConfig, LlavaConfig
+from transformers import CLIPVisionModel, LlavaConfig
 from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
 from vllm.model_executor.layers.linear import LinearMethodBase
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
 
 
 class LlavaLlamaForCausalLM(nn.Module):
diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py
index 01a830807f..82b54193c1 100644
--- a/python/sglang/srt/models/mixtral.py
+++ b/python/sglang/srt/models/mixtral.py
@@ -1,7 +1,7 @@
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/d0215a58e78572d91dadafe9d832a2db89b09a13/vllm/model_executor/models/mixtral.py#L1
 """Inference-only Mixtral model."""
-from typing import List, Optional, Tuple
+from typing import Optional
 
 import numpy as np
 import torch
@@ -12,28 +12,19 @@
 from torch import nn
 from transformers import MixtralConfig
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    LinearMethodBase,
-    QKVParallelLinear,
-    ReplicatedLinear,
-    RowParallelLinear,
-)
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               QKVParallelLinear,
+                                               ReplicatedLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce,
-)
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.parallel_utils.communication_op import \
+    tensor_model_parallel_all_reduce
 from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
 
 
 class MixtralMLP(nn.Module):
diff --git a/python/sglang/srt/models/qwen.py b/python/sglang/srt/models/qwen.py
index 111ad704b2..87cadc3109 100644
--- a/python/sglang/srt/models/qwen.py
+++ b/python/sglang/srt/models/qwen.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional
 
 import torch
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -8,24 +8,17 @@
 from transformers import PretrainedConfig
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (
-    LinearMethodBase,
-    MergedColumnParallelLinear,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead,
-    VocabParallelEmbedding,
-)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.parallel_utils.parallel_state import \
+    get_tensor_model_parallel_world_size
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
 
 
 class QWenMLP(nn.Module):
diff --git a/python/sglang/srt/models/qwen2.py b/python/sglang/srt/models/qwen2.py
index 26f0a5ae19..559f21e248 100644
--- a/python/sglang/srt/models/qwen2.py
+++ b/python/sglang/srt/models/qwen2.py
@@ -1,7 +1,7 @@
 # Adapted from llama2.py
 # Modify details for the adaptation of Qwen2 model.
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import torch from sglang.srt.layers.logits_processor import LogitsProcessor @@ -10,24 +10,17 @@ from torch import nn from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ( - LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, -) -from vllm.model_executor.weight_utils import ( - default_weight_loader, - hf_model_weights_iterator, -) + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import \ + get_tensor_model_parallel_world_size +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) Qwen2Config = None diff --git a/python/sglang/srt/models/stablelm.py b/python/sglang/srt/models/stablelm.py index 5eea538969..678b074d34 100644 --- a/python/sglang/srt/models/stablelm.py +++ b/python/sglang/srt/models/stablelm.py @@ -5,31 +5,23 @@ from typing import Optional, Tuple import torch -from torch import nn -from transformers import PretrainedConfig - from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.managers.router.model_runner import InputMetadata +from torch import nn +from transformers import PretrainedConfig from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.linear import ( - LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear, -) +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, - ParallelLMHead, -) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size, -) -from vllm.model_executor.weight_utils import ( - default_weight_loader, - hf_model_weights_iterator, -) + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.parallel_utils.parallel_state import \ + get_tensor_model_parallel_world_size +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) class StablelmMLP(nn.Module): diff --git a/python/sglang/srt/models/yivl.py b/python/sglang/srt/models/yivl.py index 014e40c6e6..d3c2fe4fe8 100644 --- a/python/sglang/srt/models/yivl.py +++ b/python/sglang/srt/models/yivl.py @@ -1,20 +1,14 @@ """Inference-only Yi-VL model.""" -import os -from typing import List, Optional +from typing import Optional import torch import torch.nn as nn -from sglang.srt.models.llava import ( - LlavaLlamaForCausalLM, - clip_vision_embed_forward, - monkey_path_clip_vision_embed_forward, -) +from sglang.srt.models.llava import (LlavaLlamaForCausalLM, + monkey_path_clip_vision_embed_forward) from 
transformers import CLIPVisionModel, LlavaConfig -from vllm.model_executor.weight_utils import ( - default_weight_loader, - hf_model_weights_iterator, -) +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) class YiVLForCausalLM(LlavaLlamaForCausalLM): diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index e9961305d6..d6d0ac3713 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -24,32 +24,19 @@ from pydantic import BaseModel from sglang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.constrained import disable_cache -from sglang.srt.conversation import ( - Conversation, - SeparatorStyle, - chat_template_exists, - generate_chat_conv, - register_conv_template, -) +from sglang.srt.conversation import (Conversation, SeparatorStyle, + chat_template_exists, generate_chat_conv, + register_conv_template) from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.managers.detokenizer_manager import start_detokenizer_process from sglang.srt.managers.io_struct import DetokenizeReqInput, GenerateReqInput from sglang.srt.managers.openai_protocol import ( - ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, - ChatMessage, - CompletionRequest, - CompletionResponse, - CompletionResponseChoice, - CompletionResponseStreamChoice, - CompletionStreamResponse, - DeltaMessage, - LogProbs, - UsageInfo, -) + ChatCompletionRequest, ChatCompletionResponse, + ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, ChatMessage, CompletionRequest, + CompletionResponse, CompletionResponseChoice, + CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage, + LogProbs, UsageInfo) from sglang.srt.managers.router.manager import start_router_process from sglang.srt.managers.tokenizer_manager import TokenizerManager from sglang.srt.server_args import PortArgs, ServerArgs @@ -527,7 +514,7 @@ def _wait_and_warmup(): try: requests.get(url + "/get_model_info", timeout=5, headers=headers) break - except requests.exceptions.RequestException as e: + except requests.exceptions.RequestException: pass else: if pipe_finish_writer is not None: diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py index 86680c3bbf..ecb5831c88 100644 --- a/python/sglang/srt/utils.py +++ b/python/sglang/srt/utils.py @@ -149,7 +149,6 @@ def get_exception_traceback(): def get_int_token_logit_bias(tokenizer, vocab_size): - from transformers import LlamaTokenizer, LlamaTokenizerFast # a bug when model's vocab size > tokenizer.vocab_size vocab_size = tokenizer.vocab_size diff --git a/python/sglang/test/test_conversation.py b/python/sglang/test/test_conversation.py index 11e837ddbd..d828060521 100644 --- a/python/sglang/test/test_conversation.py +++ b/python/sglang/test/test_conversation.py @@ -1,12 +1,9 @@ from sglang.srt.conversation import generate_chat_conv from sglang.srt.managers.openai_protocol import ( ChatCompletionMessageContentImagePart, - ChatCompletionMessageContentImageURL, - ChatCompletionMessageContentTextPart, - ChatCompletionMessageGenericParam, - ChatCompletionMessageUserParam, - ChatCompletionRequest, -) + ChatCompletionMessageContentImageURL, ChatCompletionMessageContentTextPart, + ChatCompletionMessageGenericParam, ChatCompletionMessageUserParam, + ChatCompletionRequest) def test_chat_completion_to_conv_image(): diff --git 
a/python/sglang/test/test_openai_protocol.py b/python/sglang/test/test_openai_protocol.py index 99e7a8089c..72b2b7c1b1 100644 --- a/python/sglang/test/test_openai_protocol.py +++ b/python/sglang/test/test_openai_protocol.py @@ -1,11 +1,8 @@ from sglang.srt.managers.openai_protocol import ( ChatCompletionMessageContentImagePart, - ChatCompletionMessageContentImageURL, - ChatCompletionMessageContentTextPart, - ChatCompletionMessageGenericParam, - ChatCompletionMessageUserParam, - ChatCompletionRequest, -) + ChatCompletionMessageContentImageURL, ChatCompletionMessageContentTextPart, + ChatCompletionMessageGenericParam, ChatCompletionMessageUserParam, + ChatCompletionRequest) def test_chat_completion_request_image(): diff --git a/scripts/format.sh b/scripts/format.sh index 104db69bf5..20b522e46e 100644 --- a/scripts/format.sh +++ b/scripts/format.sh @@ -1,5 +1,5 @@ isort python -black python +ruff python isort test -black test +ruff test diff --git a/test/lang/run_all.py b/test/lang/run_all.py index cb5da15850..75d5d5c2b3 100644 --- a/test/lang/run_all.py +++ b/test/lang/run_all.py @@ -1,7 +1,6 @@ import argparse import glob import multiprocessing -import os import time import unittest diff --git a/test/lang/test_anthropic_backend.py b/test/lang/test_anthropic_backend.py index b0da888381..a693c72b51 100644 --- a/test/lang/test_anthropic_backend.py +++ b/test/lang/test_anthropic_backend.py @@ -1,4 +1,3 @@ -import json import unittest from sglang.test.test_programs import test_mt_bench, test_stream diff --git a/test/lang/test_openai_backend.py b/test/lang/test_openai_backend.py index 236c548a82..e1276860f7 100644 --- a/test/lang/test_openai_backend.py +++ b/test/lang/test_openai_backend.py @@ -1,19 +1,11 @@ import unittest -from sglang.test.test_programs import ( - test_decode_int, - test_decode_json, - test_expert_answer, - test_few_shot_qa, - test_image_qa, - test_mt_bench, - test_parallel_decoding, - test_parallel_encoding, - test_react, - test_select, - test_stream, - test_tool_use, -) +from sglang.test.test_programs import (test_decode_int, test_decode_json, + test_expert_answer, test_few_shot_qa, + test_image_qa, test_mt_bench, + test_parallel_decoding, + test_parallel_encoding, test_react, + test_select, test_stream, test_tool_use) from sglang import OpenAI, set_default_backend diff --git a/test/lang/test_srt_backend.py b/test/lang/test_srt_backend.py index 82a9f1ad46..90ea028554 100644 --- a/test/lang/test_srt_backend.py +++ b/test/lang/test_srt_backend.py @@ -2,23 +2,13 @@ python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 """ -import json import unittest -from sglang.test.test_programs import ( - test_decode_int, - test_decode_json_regex, - test_expert_answer, - test_few_shot_qa, - test_mt_bench, - test_parallel_decoding, - test_parallel_encoding, - test_react, - test_regex, - test_select, - test_stream, - test_tool_use, -) +from sglang.test.test_programs import (test_decode_int, test_decode_json_regex, + test_expert_answer, test_few_shot_qa, + test_mt_bench, test_parallel_decoding, + test_regex, test_select, test_stream, + test_tool_use) import sglang as sgl diff --git a/test/lang/test_tracing.py b/test/lang/test_tracing.py index cdc9000d89..11a3e61809 100644 --- a/test/lang/test_tracing.py +++ b/test/lang/test_tracing.py @@ -111,7 +111,7 @@ def tip_suggestion(s): forks = s.fork(3) for i in range(3): forks[i] += f"Now, expand tip {i+1} into a paragraph:\n" - forks[i] += sgl.gen(f"detailed_tip") + forks[i] += sgl.gen("detailed_tip") s += "Tip 1:" + 
forks[0]["detailed_tip"] + "\n" s += "Tip 2:" + forks[1]["detailed_tip"] + "\n" diff --git a/test/lang/test_vertexai_backend.py b/test/lang/test_vertexai_backend.py index a17ab4ba74..fa6af1c38a 100644 --- a/test/lang/test_vertexai_backend.py +++ b/test/lang/test_vertexai_backend.py @@ -1,14 +1,9 @@ import unittest -from sglang.test.test_programs import ( - test_expert_answer, - test_few_shot_qa, - test_image_qa, - test_mt_bench, - test_parallel_decoding, - test_parallel_encoding, - test_stream, -) +from sglang.test.test_programs import (test_expert_answer, test_few_shot_qa, + test_image_qa, test_mt_bench, + test_parallel_decoding, + test_parallel_encoding, test_stream) from sglang import VertexAI, set_default_backend diff --git a/test/srt/model/reference_hf.py b/test/srt/model/reference_hf.py index e63866f026..4060f9212c 100644 --- a/test/srt/model/reference_hf.py +++ b/test/srt/model/reference_hf.py @@ -1,5 +1,4 @@ import argparse -import os import torch from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/srt/model/test_llama_extend.py b/test/srt/model/test_llama_extend.py index 2931dfa5dc..8b0a1371a4 100644 --- a/test/srt/model/test_llama_extend.py +++ b/test/srt/model/test_llama_extend.py @@ -1,10 +1,6 @@ import multiprocessing import os -import time -import numpy as np -import torch -import torch.distributed as dist import transformers from sglang.srt.managers.router.infer_batch import Batch, ForwardMode, Req from sglang.srt.managers.router.model_runner import ModelRunner diff --git a/test/srt/model/test_llava_low_api.py b/test/srt/model/test_llava_low_api.py index 322ba4855f..fb6f6abc6b 100644 --- a/test/srt/model/test_llava_low_api.py +++ b/test/srt/model/test_llava_low_api.py @@ -1,12 +1,9 @@ import multiprocessing -import time import numpy as np import torch -import torch.distributed as dist from sglang.srt.hf_transformers_utils import get_processor -from sglang.srt.managers.router.infer_batch import ForwardMode -from sglang.srt.managers.router.model_runner import InputMetadata, ModelRunner +from sglang.srt.managers.router.model_runner import ModelRunner from sglang.srt.model_config import ModelConfig from sglang.srt.utils import load_image diff --git a/test/srt/test_httpserver_concurrent.py b/test/srt/test_httpserver_concurrent.py index 855e51f33d..6cdd5332dd 100644 --- a/test/srt/test_httpserver_concurrent.py +++ b/test/srt/test_httpserver_concurrent.py @@ -9,11 +9,8 @@ import argparse import asyncio -import json -import time import aiohttp -import requests async def send_request(url, data, delay=0): diff --git a/test/srt/test_httpserver_llava.py b/test/srt/test_httpserver_llava.py index 0f6571b450..6db4ab9303 100644 --- a/test/srt/test_httpserver_llava.py +++ b/test/srt/test_httpserver_llava.py @@ -10,7 +10,6 @@ import argparse import asyncio import json -import time import aiohttp import requests diff --git a/test/srt/test_httpserver_reuse.py b/test/srt/test_httpserver_reuse.py index c3f7865899..ef866afc6b 100644 --- a/test/srt/test_httpserver_reuse.py +++ b/test/srt/test_httpserver_reuse.py @@ -6,7 +6,6 @@ """ import argparse -import time import requests diff --git a/test/srt/test_jump_forward.py b/test/srt/test_jump_forward.py index 15ec2caffd..832409be53 100644 --- a/test/srt/test_jump_forward.py +++ b/test/srt/test_jump_forward.py @@ -3,10 +3,8 @@ from pydantic import BaseModel, constr from sglang.srt.constrained import build_regex_from_object -from sglang.test.test_utils import ( - add_common_sglang_args_and_parse, - select_sglang_backend, -) 
+from sglang.test.test_utils import (add_common_sglang_args_and_parse,
+                                    select_sglang_backend)
 
 import sglang as sgl
 
diff --git a/test/srt/test_robust.py b/test/srt/test_robust.py
index 5b479318f5..81c66fbcc8 100644
--- a/test/srt/test_robust.py
+++ b/test/srt/test_robust.py
@@ -2,10 +2,8 @@
 import random
 import string
 
-from sglang.test.test_utils import (
-    add_common_sglang_args_and_parse,
-    select_sglang_backend,
-)
+from sglang.test.test_utils import (add_common_sglang_args_and_parse,
+                                    select_sglang_backend)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 import sglang as sgl