From 9d5231ea81307800cf603bdcb60c3fca71a0e8c2 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 05:26:45 +0000 Subject: [PATCH 1/9] support more options for usage in stream mode --- python/sglang/srt/managers/schedule_batch.py | 6 +-- python/sglang/srt/openai_api/adapter.py | 46 +++++++++++++++----- python/sglang/srt/openai_api/protocol.py | 9 +++- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index 4e9b9eb2f3..d6d530914f 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -197,16 +197,14 @@ def check_finished(self): return if len(self.output_ids) >= self.sampling_params.max_new_tokens: - self.finished_reason = FINISH_LENGTH(len(self.output_ids)) + self.finished_reason = "length" return if ( self.output_ids[-1] == self.tokenizer.eos_token_id and not self.sampling_params.ignore_eos ): - self.finished_reason = FINISH_MATCHED_TOKEN( - matched=self.tokenizer.eos_token_id - ) + self.finished_reason = "stop" return if len(self.sampling_params.stop_strs) > 0: diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 2b6fd961a7..14a9c0a31f 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -614,13 +614,26 @@ async def generate_stream_resp(): object="text_completion", choices=[choice_data], model=request.model, - usage=UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ), ) yield f"data: {chunk.model_dump_json()}\n\n" + if request.stream_options.include_usage: + + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + final_usage_chunk = CompletionStreamResponse( + id=str(uuid.uuid4().hex), + choices=[], + model=request.model, + usage=usage, + ) + final_usage_data = final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True + ) + yield f"data: {final_usage_data}\n\n" except ValueError as e: error = create_streaming_error_response(str(e)) yield f"data: {error}\n\n" @@ -928,13 +941,26 @@ async def generate_stream_resp(): id=content["meta_info"]["id"], choices=[choice_data], model=request.model, - usage=UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ), ) yield f"data: {chunk.model_dump_json()}\n\n" + if request.stream_options.include_usage: + + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + + final_usage_chunk = ChatCompletionStreamResponse( + id=str(uuid.uuid4().hex), + choices=[], + model=request.model, + usage=usage, + ) + final_usage_data = final_usage_chunk.model_dump_json( + exclude_unset=True, exclude_none=True ) + yield f"data: {final_usage_data}\n\n" except ValueError as e: error = create_streaming_error_response(str(e)) yield f"data: {error}\n\n" diff --git a/python/sglang/srt/openai_api/protocol.py b/python/sglang/srt/openai_api/protocol.py index 0e9b902231..0721beb73b 100644 --- a/python/sglang/srt/openai_api/protocol.py +++ b/python/sglang/srt/openai_api/protocol.py @@ -78,6 +78,10 @@ class UsageInfo(BaseModel): completion_tokens: Optional[int] = 0 +class StreamOptions(BaseModel): + include_usage: Optional[bool] = False + + class 
FileRequest(BaseModel): # https://platform.openai.com/docs/api-reference/files/create file: bytes # The File object (not file name) to be uploaded @@ -149,6 +153,7 @@ class CompletionRequest(BaseModel): seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None temperature: Optional[float] = 1.0 top_p: Optional[float] = 1.0 @@ -188,7 +193,7 @@ class CompletionStreamResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[CompletionResponseStreamChoice] - usage: UsageInfo + usage: Optional[UsageInfo] = None class ChatCompletionMessageGenericParam(BaseModel): @@ -247,6 +252,7 @@ class ChatCompletionRequest(BaseModel): seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + stream_options: Optional[StreamOptions] = None temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 user: Optional[str] = None @@ -294,3 +300,4 @@ class ChatCompletionStreamResponse(BaseModel): created: int = Field(default_factory=lambda: int(time.time())) model: str choices: List[ChatCompletionResponseStreamChoice] + usage: Optional[UsageInfo] = None From 0955918acd97c37ae242f8288e6fc81aeda1fa4c Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 05:53:42 +0000 Subject: [PATCH 2/9] fix bug --- python/sglang/srt/openai_api/adapter.py | 6 ++---- python/sglang/srt/server_args.py | 3 ++- test/srt/test_openai_server.py | 3 --- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 14a9c0a31f..788bf13a3e 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -616,8 +616,7 @@ async def generate_stream_resp(): model=request.model, ) yield f"data: {chunk.model_dump_json()}\n\n" - if request.stream_options.include_usage: - + if request.stream_options and request.stream_options.include_usage: usage = UsageInfo( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, @@ -943,8 +942,7 @@ async def generate_stream_resp(): model=request.model, ) yield f"data: {chunk.model_dump_json()}\n\n" - if request.stream_options.include_usage: - + if request.stream_options and request.stream_options.include_usage: usage = UsageInfo( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index f42afdf8d5..fdad0cbbad 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,7 +223,8 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--mem-fraction-static", type=float, - default=ServerArgs.mem_fraction_static, + # default=ServerArgs.mem_fraction_static, + default=0.8, help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). 
Use a smaller value if you see out-of-memory errors.", ) parser.add_argument( diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index db1a3c0277..2205137fe9 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -125,9 +125,6 @@ def run_completion_stream(self, echo, logprobs, token_input): assert response.id assert response.created - assert response.usage.prompt_tokens > 0 - assert response.usage.completion_tokens > 0 - assert response.usage.total_tokens > 0 def run_chat_completion(self, logprobs, parallel_sample_num): client = openai.Client(api_key=self.api_key, base_url=self.base_url) From dfebd5e4dda39c5a416220b6c97d9b11563fccba Mon Sep 17 00:00:00 2001 From: Ying Sheng Date: Wed, 7 Aug 2024 23:33:53 -0700 Subject: [PATCH 3/9] Update python/sglang/srt/server_args.py --- python/sglang/srt/server_args.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index fdad0cbbad..f42afdf8d5 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,8 +223,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--mem-fraction-static", type=float, - # default=ServerArgs.mem_fraction_static, - default=0.8, + default=ServerArgs.mem_fraction_static, help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.", ) parser.add_argument( From 4e182054483a2299e34a22022b123f3485990c84 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 07:32:45 +0000 Subject: [PATCH 4/9] format finish reason --- python/sglang/srt/managers/schedule_batch.py | 8 +++- python/sglang/srt/openai_api/adapter.py | 39 ++++++++++++++++---- python/sglang/srt/server_args.py | 3 +- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index d6d530914f..faa4201fbc 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -197,14 +197,18 @@ def check_finished(self): return if len(self.output_ids) >= self.sampling_params.max_new_tokens: - self.finished_reason = "length" + self.finished_reason = FINISH_LENGTH( + length=self.sampling_params.max_new_tokens + ) return if ( self.output_ids[-1] == self.tokenizer.eos_token_id and not self.sampling_params.ignore_eos ): - self.finished_reason = "stop" + self.finished_reason = FINISH_MATCHED_TOKEN( + matched=self.tokenizer.eos_token_id + ) return if len(self.sampling_params.stop_strs) > 0: diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 788bf13a3e..9800c34ce3 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -82,6 +82,17 @@ def __init__(self, filename: str, purpose: str): storage_dir = None +def format_finish_reason(finish_reason) -> Optional[str]: + if finish_reason.startswith("None"): + return None + if finish_reason.startswith("FINISH_MATCHED"): + return "stop" + elif finish_reason.startswith("FINISH_LENGTH"): + return "length" + else: + raise ValueError(f"Unknown finish reason: {finish_reason}") + + def create_error_response( message: str, err_type: str = "BadRequestError", @@ -485,14 +496,18 @@ def v1_generate_response(request, ret, tokenizer_manager, to_file=False): "index": 0, "text": text, "logprobs": logprobs, - "finish_reason": 
ret_item["meta_info"]["finish_reason"], + "finish_reason": format_finish_reason( + ret_item["meta_info"]["finish_reason"] + ), } else: choice_data = CompletionResponseChoice( index=idx, text=text, logprobs=logprobs, - finish_reason=ret_item["meta_info"]["finish_reason"], + finish_reason=format_finish_reason( + ret_item["meta_info"]["finish_reason"] + ), ) choices.append(choice_data) @@ -607,7 +622,9 @@ async def generate_stream_resp(): index=0, text=delta, logprobs=logprobs, - finish_reason=content["meta_info"]["finish_reason"], + finish_reason=format_finish_reason( + content["meta_info"]["finish_reason"] + ), ) chunk = CompletionStreamResponse( id=content["meta_info"]["id"], @@ -788,14 +805,18 @@ def v1_chat_generate_response(request, ret, to_file=False): "index": 0, "message": {"role": "assistant", "content": ret_item["text"]}, "logprobs": choice_logprobs, - "finish_reason": ret_item["meta_info"]["finish_reason"], + "finish_reason": format_finish_reason( + ret_item["meta_info"]["finish_reason"] + ), } else: choice_data = ChatCompletionResponseChoice( index=idx, message=ChatMessage(role="assistant", content=ret_item["text"]), logprobs=choice_logprobs, - finish_reason=ret_item["meta_info"]["finish_reason"], + finish_reason=format_finish_reason( + ret_item["meta_info"]["finish_reason"] + ), ) choices.append(choice_data) @@ -912,7 +933,9 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=0, delta=DeltaMessage(role="assistant"), - finish_reason=content["meta_info"]["finish_reason"], + finish_reason=format_finish_reason( + content["meta_info"]["finish_reason"] + ), logprobs=choice_logprobs, ) chunk = ChatCompletionStreamResponse( @@ -933,7 +956,9 @@ async def generate_stream_resp(): choice_data = ChatCompletionResponseStreamChoice( index=0, delta=DeltaMessage(content=delta), - finish_reason=content["meta_info"]["finish_reason"], + finish_reason=format_finish_reason( + content["meta_info"]["finish_reason"] + ), logprobs=choice_logprobs, ) chunk = ChatCompletionStreamResponse( diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index fdad0cbbad..f42afdf8d5 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,8 +223,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--mem-fraction-static", type=float, - # default=ServerArgs.mem_fraction_static, - default=0.8, + default=ServerArgs.mem_fraction_static, help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). 
Use a smaller value if you see out-of-memory errors.", ) parser.add_argument( From c38ac9cc41169dffd37cc3baea9f0778a1c55e47 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 08:03:30 +0000 Subject: [PATCH 5/9] fix --- python/sglang/srt/openai_api/adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 9800c34ce3..0cc6115edc 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -85,7 +85,7 @@ def __init__(self, filename: str, purpose: str): def format_finish_reason(finish_reason) -> Optional[str]: if finish_reason.startswith("None"): return None - if finish_reason.startswith("FINISH_MATCHED"): + elif finish_reason.startswith("FINISH_MATCHED"): return "stop" elif finish_reason.startswith("FINISH_LENGTH"): return "length" From de5fe56c7aa6ce71f892c22200fa0f5ad3d1b98e Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 08:21:34 +0000 Subject: [PATCH 6/9] add tests for usage --- python/sglang/srt/openai_api/adapter.py | 5 ----- python/sglang/srt/server_args.py | 3 ++- test/srt/test_openai_server.py | 17 ++++++++++++++++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 0cc6115edc..00a8735ab7 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -942,11 +942,6 @@ async def generate_stream_resp(): id=content["meta_info"]["id"], choices=[choice_data], model=request.model, - usage=UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ), ) yield f"data: {chunk.model_dump_json()}\n\n" diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index f42afdf8d5..fdad0cbbad 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,7 +223,8 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--mem-fraction-static", type=float, - default=ServerArgs.mem_fraction_static, + # default=ServerArgs.mem_fraction_static, + default=0.8, help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). 
Use a smaller value if you see out-of-memory errors.", ) parser.add_argument( diff --git a/test/srt/test_openai_server.py b/test/srt/test_openai_server.py index 2205137fe9..90c869665f 100644 --- a/test/srt/test_openai_server.py +++ b/test/srt/test_openai_server.py @@ -98,10 +98,17 @@ def run_completion_stream(self, echo, logprobs, token_input): echo=echo, logprobs=logprobs, stream=True, + stream_options={"include_usage": True}, ) first = True for response in generator: + usage = response.usage + if usage is not None: + assert usage.prompt_tokens > 0 + assert usage.completion_tokens > 0 + assert usage.total_tokens > 0 + continue if logprobs: assert response.choices[0].logprobs assert isinstance(response.choices[0].logprobs.tokens[0], str) @@ -122,7 +129,6 @@ def run_completion_stream(self, echo, logprobs, token_input): prompt ), f"{response.choices[0].text} and all args {echo} {logprobs} {token_input} {first}" first = False - assert response.id assert response.created @@ -176,11 +182,20 @@ def run_chat_completion_stream(self, logprobs): logprobs=logprobs is not None and logprobs > 0, top_logprobs=logprobs, stream=True, + stream_options={"include_usage": True}, ) is_first = True for response in generator: + usage = response.usage + if usage is not None: + assert usage.prompt_tokens > 0 + assert usage.completion_tokens > 0 + assert usage.total_tokens > 0 + continue + data = response.choices[0].delta + if is_first: data.role == "assistant" is_first = False From 575f6fda0123428e41bbb11c20f4f3f36726f74f Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 8 Aug 2024 02:03:49 -0700 Subject: [PATCH 7/9] Update python/sglang/srt/server_args.py Co-authored-by: Ying Sheng --- python/sglang/srt/server_args.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index fdad0cbbad..f42afdf8d5 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -223,8 +223,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--mem-fraction-static", type=float, - # default=ServerArgs.mem_fraction_static, - default=0.8, + default=ServerArgs.mem_fraction_static, help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). 
Use a smaller value if you see out-of-memory errors.", ) parser.add_argument( From f8792df00c5bc92c22c4a4dde213b9fad9eb2dc2 Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 09:17:27 +0000 Subject: [PATCH 8/9] handle more cases of finish --- python/sglang/srt/openai_api/adapter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 00a8735ab7..3bcab64096 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -89,9 +89,10 @@ def format_finish_reason(finish_reason) -> Optional[str]: return "stop" elif finish_reason.startswith("FINISH_LENGTH"): return "length" + elif finish_reason.startswith("FINISH_ABORT"): + return "abort" else: - raise ValueError(f"Unknown finish reason: {finish_reason}") - + return "unknown" From ccc7c3ff5557431ce4d9919cc7e60e0051c1c67d Mon Sep 17 00:00:00 2001 From: yichuan520030910320 Date: Thu, 8 Aug 2024 09:25:23 +0000 Subject: [PATCH 9/9] format --- python/sglang/srt/openai_api/adapter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/sglang/srt/openai_api/adapter.py b/python/sglang/srt/openai_api/adapter.py index 3bcab64096..781b5e2d16 100644 --- a/python/sglang/srt/openai_api/adapter.py +++ b/python/sglang/srt/openai_api/adapter.py @@ -94,6 +94,7 @@ def format_finish_reason(finish_reason) -> Optional[str]: def create_error_response( message: str, err_type: str = "BadRequestError",
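
The net effect of this series: a client can ask sglang's OpenAI-compatible streaming endpoints for one extra final chunk that carries only token usage. Below is a minimal client-side sketch mirroring the assertions added to test/srt/test_openai_server.py; the base URL, API key, and model name are placeholder assumptions for a locally launched server, not values fixed by these patches.

import openai

# Placeholder endpoint and model for a local sglang server (assumed, not
# defined by the patches above).
client = openai.Client(api_key="EMPTY", base_url="http://127.0.0.1:30000/v1")

stream = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Say hello."}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    # With include_usage=True, the server appends one final chunk whose
    # `choices` list is empty and whose `usage` field carries the token
    # counts; every earlier chunk has `usage=None`.
    if chunk.usage is not None:
        print("\nprompt:", chunk.usage.prompt_tokens,
              "completion:", chunk.usage.completion_tokens,
              "total:", chunk.usage.total_tokens)
        continue
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")

The same stream_options dict works with client.completions.create, which exercises the completions streaming path patched above.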