[Frontend] Add template related params to request (vllm-project#5709)
danieljannai21 authored and prashantgupta24 committed Jul 3, 2024
1 parent 549c660 commit 10e603f
Showing 3 changed files with 30 additions and 1 deletion.
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.42.0 # Required for Gemma 2.
+transformers >= 4.42.0 # Required for Gemma 2 and for additional chat template parameters.
 tokenizers >= 0.19.1 # Required for Llama 3.
 fastapi
 aiohttp
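The raised floor on transformers exists because apply_chat_template only gained the tools and documents parameters in the 4.42 series. A minimal sanity check for a deployment environment (a hedged sketch, not part of this commit):

from packaging.version import Version
import transformers

# The extra apply_chat_template kwargs (tools, documents) need 4.42.0+.
assert Version(transformers.__version__) >= Version("4.42.0")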
21 changes: 21 additions & 0 deletions vllm/entrypoints/openai/protocol.py
@@ -190,6 +190,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
         "special tokens so this should be set to False (as is the "
         "default)."),
     )
+    documents: Optional[List[Dict[str, str]]] = Field(
+        default=None,
+        description=
+        ("A list of dicts representing documents that will be accessible to "
+         "the model if it is performing RAG (retrieval-augmented generation)."
+         " If the template does not support RAG, this argument will have no "
+         "effect. We recommend that each document should be a dict containing "
+         "\"title\" and \"text\" keys."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "If this is not passed, the model's default chat template will be "
+            "used instead."),
+    )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
     include_stop_str_in_output: Optional[bool] = Field(
         default=False,
         description=(
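Client-side, these new fields are vLLM extensions to the OpenAI schema, so they travel through the OpenAI Python client's extra_body escape hatch. A hedged sketch — the base URL, model name, and the enable_thinking template variable are illustrative assumptions, not part of this commit:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "What changed in the Q2 report?"}],
    # Non-standard fields go through extra_body so the client passes
    # them along verbatim to the vLLM server.
    extra_body={
        "documents": [
            {"title": "Q2 report", "text": "Revenue grew 12% quarter over quarter."},
        ],
        "chat_template_kwargs": {"enable_thinking": False},
    },
)
print(response.choices[0].message.content)

A request may likewise override the served template entirely by sending a raw Jinja string under "chat_template" in extra_body.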
8 changes: 8 additions & 0 deletions vllm/entrypoints/openai/serving_chat.py
@@ -218,10 +218,18 @@ async def create_chat_completion(
                 conversation.extend(chat_parsed_result.messages)
                 image_futures.extend(chat_parsed_result.image_futures)
 
+            tool_dicts = None if request.tools is None else [
+                tool.model_dump() for tool in request.tools
+            ]
+
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
+                tools=tool_dicts,
+                documents=request.documents,
+                chat_template=request.chat_template,
+                **(request.chat_template_kwargs or {}),
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)
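For a chat template that actually consumes documents, the server-side call above reduces to a plain Hugging Face invocation. A hedged, standalone sketch — the model and its named "rag" template follow Cohere's Command R model card and should be treated as assumptions:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

# documents is forwarded to the Jinja renderer; a template with no
# documents variable simply ignores it, matching the field description.
prompt = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "When is the meeting?"}],
    chat_template="rag",
    documents=[{"title": "Notes", "text": "The meeting moved to Friday."}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)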
