sgl-project · Ying1123 · Jun 30, 2024 · Jun 30, 2024
diff --git a/benchmark/latency_throughput/bench_serving.py b/benchmark/latency_throughput/bench_serving.py
@@ -38,7 +38,6 @@ def sample_requests(
     num_requests: int,
     tokenizer: AutoTokenizer,
 ) -> List[Tuple[str, int, int]]:
-
     def load_dataset():
         with open(dataset_path, encoding="utf-8") as f:
             dataset = json.load(f)

diff --git a/benchmark/line_retrieval/gen_data.py b/benchmark/line_retrieval/gen_data.py
@@ -48,9 +48,9 @@ def generate_lines(random_words, num_lines, redirect_ratio):
         )
         for i in redirect_indices:
             target_idx = np.random.choice(min(i * 2 + 100, num_lines))
-            lines[i] = (
-                f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
-            )
+            lines[
+                i
+            ] = f"Line {indices[i]}: The REGISTER_CONTENT is the same as Line {indices[target_idx]}."
             redirects[i] = target_idx
 
     # Build links and find sources

diff --git a/python/sglang/backend/litellm.py b/python/sglang/backend/litellm.py
@@ -13,7 +13,6 @@
 
 
 class LiteLLM(BaseBackend):
-
     def __init__(
         self,
         model_name,

@@ -4,7 +4,7 @@
 from typing import List, Union
 
 from sglang.global_config import global_config
-from sglang.lang.interpreter import ProgramState, StreamExecutor, pin_program
+from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
 from sglang.lang.ir import (
     SglArgument,
     SglConstantText,
@@ -184,7 +184,7 @@ def run_batch(
 
         # Extract prefix by tracing and cache it
         if len(batch_kwargs) > 1:
-            pin_program(self.function, backend)
+            cache_program(self.function, backend)
 
         # Run all programs
         if num_threads == "auto":

diff --git a/python/sglang/launch_server_llavavid.py b/python/sglang/launch_server_llavavid.py
@@ -6,7 +6,6 @@
 from sglang.srt.server import ServerArgs, launch_server
 
 if __name__ == "__main__":
-
     model_overide_args = {}
 
     model_overide_args["mm_spatial_pool_stride"] = 2

@@ -498,9 +498,10 @@ def check_for_jump_forward(self, model_runner):
                         req.output_ids = cur_output_ids
                         continue
 
-                    jump_forward_str, next_state = (
-                        req.jump_forward_map.jump_forward_symbol(cur_state)
-                    )
+                    (
+                        jump_forward_str,
+                        next_state,
+                    ) = req.jump_forward_map.jump_forward_symbol(cur_state)
 
                     # Make the incrementally decoded text part of jump_forward_str
                     # so that the UTF-8 will not corrupt

@@ -283,13 +283,14 @@ def handle_generate_request(
                 (recv_req.image_hash >> 64) % self.model_config.vocab_size,
             ]
             req.image_size = recv_req.image_size
-            req.origin_input_ids, req.image_offset = (
-                self.model_runner.model.pad_input_ids(
-                    req.origin_input_ids_unpadded,
-                    req.pad_value,
-                    req.pixel_values.shape,
-                    req.image_size,
-                )
+            (
+                req.origin_input_ids,
+                req.image_offset,
+            ) = self.model_runner.model.pad_input_ids(
+                req.origin_input_ids_unpadded,
+                req.pad_value,
+                req.pixel_values.shape,
+                req.image_size,
             )
         req.sampling_params = recv_req.sampling_params
         req.return_logprob = recv_req.return_logprob

@@ -35,7 +35,6 @@ class GenerateReqInput:
     stream: bool = False
 
     def post_init(self):
-
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
         ):

@@ -334,15 +334,15 @@ def convert_logprob_style(
                 ret["meta_info"]["decode_token_logprobs"], return_text_in_logprobs
             )
         if top_logprobs_num > 0:
-            ret["meta_info"]["prefill_top_logprobs"] = (
-                self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
-                )
+            ret["meta_info"][
+                "prefill_top_logprobs"
+            ] = self.detokenize_top_logprobs_tokens(
+                ret["meta_info"]["prefill_top_logprobs"], return_text_in_logprobs
             )
-            ret["meta_info"]["decode_top_logprobs"] = (
-                self.detokenize_top_logprobs_tokens(
-                    ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
-                )
+            ret["meta_info"][
+                "decode_top_logprobs"
+            ] = self.detokenize_top_logprobs_tokens(
+                ret["meta_info"]["decode_top_logprobs"], return_text_in_logprobs
             )
         return ret
 

@@ -36,7 +36,6 @@
 
 
 class GLMAttention(nn.Module):
-
     def __init__(
         self,
         config,
@@ -294,7 +293,6 @@ def forward(
 
 
 class ChatGLMModel(nn.Module):
-
     def __init__(
         self,
         config,

@@ -521,7 +521,6 @@ def forward(
         hidden_states: torch.Tensor,
         input_metadata: InputMetadata,
     ) -> torch.Tensor:
-
         hidden_states = (
             self.post_attn_norm(
                 self.self_attn(

@@ -160,9 +160,9 @@ def __init__(
         if rope_scaling is not None and getattr(
             config, "original_max_position_embeddings", None
         ):
-            rope_scaling["original_max_position_embeddings"] = (
-                config.original_max_position_embeddings
-            )
+            rope_scaling[
+                "original_max_position_embeddings"
+            ] = config.original_max_position_embeddings
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = LlamaAttention(
             hidden_size=self.hidden_size,
(Rendered diff view — columns: "Original file line number" | "Diff line change"; lines above collapsed: "Expand Up")
@@ -13,7 +13,6 @@
     class LiteLLM(BaseBackend):
         def __init__(
             self,
             model_name,
(Rendered diff view — lines below collapsed: "Expand Down")