diff --git a/llms/mlx_lm/SERVER.md b/llms/mlx_lm/SERVER.md
index 2976a09fc..e544c6fa3 100644
--- a/llms/mlx_lm/SERVER.md
+++ b/llms/mlx_lm/SERVER.md
@@ -92,7 +92,7 @@ curl localhost:8080/v1/chat/completions \
 - `system_fingerprint`: A unique identifier for the system.
 
-- `object`: Any of "chat.completions", "chat.completions.chunk" (for
+- `object`: Any of "chat.completion", "chat.completion.chunk" (for
   streaming), or "text.completion".
 
 - `model`: The model repo or path (e.g.
   `"mlx-community/Llama-3.2-3B-Instruct-4bit"`).
diff --git a/llms/mlx_lm/server.py b/llms/mlx_lm/server.py
index badc6dd37..ce09cf45c 100644
--- a/llms/mlx_lm/server.py
+++ b/llms/mlx_lm/server.py
@@ -589,9 +589,7 @@ def handle_chat_completions(self) -> List[int]:
 
         # Determine response type
         self.request_id = f"chatcmpl-{uuid.uuid4()}"
-        self.object_type = (
-            "chat.completions.chunk" if self.stream else "chat.completions"
-        )
+        self.object_type = "chat.completion.chunk" if self.stream else "chat.completion"
         if (
             hasattr(self.tokenizer, "apply_chat_template")
             and self.tokenizer.chat_template
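The change replaces the plural `"chat.completions"` / `"chat.completions.chunk"` object types with the singular forms used by the OpenAI API, which matters for clients that validate the `object` field of each response. Below is a minimal sketch of how the corrected value flows into a response payload; `make_response_stub` is a hypothetical helper for illustration only, and all fields other than `object` are placeholders, not the server's full response builder.

```python
import uuid


def make_response_stub(stream: bool, model: str) -> dict:
    # Singular "chat.completion" / "chat.completion.chunk" match the
    # OpenAI API spec; the old plural forms could trip up strict
    # clients that check the "object" field.
    object_type = "chat.completion.chunk" if stream else "chat.completion"
    return {
        "id": f"chatcmpl-{uuid.uuid4()}",  # same id scheme as server.py
        "object": object_type,
        "model": model,
    }


print(make_response_stub(stream=False, model="mlx-community/Llama-3.2-3B-Instruct-4bit"))
# {'id': 'chatcmpl-...', 'object': 'chat.completion', 'model': 'mlx-community/Llama-3.2-3B-Instruct-4bit'}
```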