feat: Add new Gemini 2.0 models #172

Merged Jan 16, 2025. 25 commits (feat/gemini-2-0 into development); the diff below shows changes from 10 of the 25 commits.

Commits
0ecf3db  Add new adapter with Gemini 2.0 models (roman-romanov-o, Jan 14, 2025)
4dcb2f4  Add new models into endpoint unit test (roman-romanov-o, Jan 14, 2025)
b9bfbb5  Add new models to integration tests (roman-romanov-o, Jan 14, 2025)
3680c58  forgotten in last commit (roman-romanov-o, Jan 14, 2025)
b6db9c0  correctly convert genai API error (roman-romanov-o, Jan 14, 2025)
874b974  fix code duplication (roman-romanov-o, Jan 15, 2025)
75868f1  remove code duplication once again (roman-romanov-o, Jan 15, 2025)
d0fdfca  remove bound for typevar (roman-romanov-o, Jan 15, 2025)
5c55581  Refactor: gather all parts/content/conversation creation into distinc… (roman-romanov-o, Jan 15, 2025)
417d3f2  Minor refactoring (roman-romanov-o, Jan 15, 2025)
90d2aaf  Merge branch 'development' into feat/gemini-2-0 (roman-romanov-o, Jan 16, 2025)
5eb8fd0  refactor exception type (roman-romanov-o, Jan 16, 2025)
43deba3  review fixes (roman-romanov-o, Jan 16, 2025)
0b81f5f  fix 429 issues on integration tests with retries (roman-romanov-o, Jan 16, 2025)
1634196  refactor: move conversation factory to distinct module (roman-romanov-o, Jan 16, 2025)
dba0d09  fix linter (roman-romanov-o, Jan 16, 2025)
f9a46f5  Fix mistake in integration test check (roman-romanov-o, Jan 16, 2025)
d8b1ee7  Move everything to pydantic v1 to avoid warnings (roman-romanov-o, Jan 16, 2025)
c397ac8  final fix of warnings (roman-romanov-o, Jan 16, 2025)
6719727  Merge branch 'development' into feat/gemini-2-0 (roman-romanov-o, Jan 16, 2025)
29576d2  Review fixes (roman-romanov-o, Jan 16, 2025)
cbf8dce  More refactoring (roman-romanov-o, Jan 16, 2025)
a78258e  Fix pydantic errors (roman-romanov-o, Jan 16, 2025)
cade913  Update README with new models (roman-romanov-o, Jan 16, 2025)
92e2f88  fix README (roman-romanov-o, Jan 16, 2025)
14 changes: 13 additions & 1 deletion aidial_adapter_vertexai/adapters.py
@@ -1,5 +1,7 @@
from typing import assert_never

from google.genai.client import Client as GenAIClient

from aidial_adapter_vertexai.chat.bison.adapter import (
BisonChatAdapter,
BisonCodeChatAdapter,
@@ -9,6 +11,7 @@
)
from aidial_adapter_vertexai.chat.gemini.adapter import (
GeminiChatCompletionAdapter,
GeminiGenAIChatCompletionAdapter,
)
from aidial_adapter_vertexai.chat.imagen.adapter import (
ImagenChatCompletionAdapter,
@@ -28,7 +31,7 @@


async def get_chat_completion_model(
-    api_key: str, deployment: ChatCompletionDeployment
+    api_key: str, deployment: ChatCompletionDeployment, client: GenAIClient
) -> ChatCompletionAdapter:
model_id = deployment.get_model_id()

@@ -58,6 +61,15 @@ async def get_chat_completion_model(
return await GeminiChatCompletionAdapter.create(
storage, model_id, deployment
)
case (
ChatCompletionDeployment.GEMINI_2_0_FLASH_EXP
| ChatCompletionDeployment.GEMINI_2_0_FLASH_THINKING_EXP_1219
| ChatCompletionDeployment.GEMINI_2_0_EXPERIMENTAL_1206
):
storage = create_file_storage(api_key)
return GeminiGenAIChatCompletionAdapter(
client, storage, model_id, deployment
)
case ChatCompletionDeployment.IMAGEN_005:
storage = create_file_storage(api_key)
return await ImagenChatCompletionAdapter.create(storage, model_id)
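For reference (not part of the diff): a minimal sketch of calling the updated factory with its new client argument. The project, location, and API key are placeholders; the GenAIClient construction mirrors the one added to app.py below.

import asyncio

from google.genai.client import Client as GenAIClient

from aidial_adapter_vertexai.adapters import get_chat_completion_model
from aidial_adapter_vertexai.deployments import ChatCompletionDeployment


async def main() -> None:
    # One client, configured the same way as in app.py (placeholder values).
    client = GenAIClient(
        vertexai=True, project="my-gcp-project", location="us-central1"
    )
    adapter = await get_chat_completion_model(
        "dial-api-key",  # placeholder DIAL API key
        ChatCompletionDeployment.GEMINI_2_0_FLASH_EXP,
        client,
    )
    # Gemini 2.0 deployments resolve to the new GenAI-based adapter.
    print(type(adapter).__name__)


asyncio.run(main())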
11 changes: 9 additions & 2 deletions aidial_adapter_vertexai/app.py
@@ -3,6 +3,7 @@
import vertexai
from aidial_sdk import DIALApp
from aidial_sdk.telemetry.types import TelemetryConfig
from google.genai.client import Client as GenAIClient

from aidial_adapter_vertexai.chat_completion import VertexAIChatCompletion
from aidial_adapter_vertexai.deployments import (
@@ -52,7 +53,13 @@ async def models():


 for deployment in ChatCompletionDeployment:
-    app.add_chat_completion(deployment.get_model_id(), VertexAIChatCompletion())
+    app.add_chat_completion(
+        deployment.get_model_id(),
+        VertexAIChatCompletion(
+            GenAIClient(
+                vertexai=True, project=GCP_PROJECT_ID, location=DEFAULT_REGION
+            )
+        ),
+    )
for deployment in EmbeddingsDeployment:
app.add_embeddings(deployment.get_model_id(), VertexAIEmbeddings())
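Note that the loop now builds one identically configured GenAIClient per chat-completion deployment. For reference (not part of the diff), a hedged smoke test of a Vertex-backed google-genai client configured the same way; the project, location, and model id are assumptions.

from google.genai.client import Client as GenAIClient

client = GenAIClient(
    vertexai=True, project="my-gcp-project", location="us-central1"
)
# Synchronous counterpart of the client.aio calls used by the adapter.
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",  # model id assumed from the deployment name
    contents="Say hello in one word.",
)
print(response.text)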
8 changes: 8 additions & 0 deletions aidial_adapter_vertexai/chat/consumer.py
@@ -7,6 +7,7 @@
Choice,
FinishReason,
Response,
Stage,
)

from aidial_adapter_vertexai.dial_api.token_usage import TokenUsage
@@ -41,6 +42,10 @@ async def set_finish_reason(self, finish_reason: FinishReason):
def is_empty(self) -> bool:
pass

@abstractmethod
async def create_stage(self, name: str) -> Stage:
pass


class ChoiceConsumer(Consumer):
response: Response
@@ -132,3 +137,6 @@ async def set_finish_reason(self, finish_reason: FinishReason):
)

self.finish_reason = finish_reason

async def create_stage(self, name: str) -> Stage:
return self.choice.create_stage(name)
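The new create_stage hook is what lets the GenAI adapter below surface Gemini's reasoning as a "Thought Process" stage. A minimal sketch of the lifecycle it enables, mirroring the calls in genai_lib.py:

from aidial_adapter_vertexai.chat.consumer import Consumer


async def show_thinking(consumer: Consumer, text: str) -> None:
    # Create the stage lazily, stream reasoning into it, close it when done,
    # as process_chunks in genai_lib.py does.
    stage = await consumer.create_stage("Thought Process")
    stage.open()
    stage.append_content(text)
    stage.close()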
5 changes: 2 additions & 3 deletions aidial_adapter_vertexai/chat/gemini/adapter/__init__.py
@@ -1,5 +1,4 @@
+from .genai_lib import GeminiGenAIChatCompletionAdapter
 from .vertex_lib import GeminiChatCompletionAdapter

-__all__ = [
-    "GeminiChatCompletionAdapter",
-]
+__all__ = ["GeminiChatCompletionAdapter", "GeminiGenAIChatCompletionAdapter"]
241 changes: 241 additions & 0 deletions aidial_adapter_vertexai/chat/gemini/adapter/genai_lib.py
@@ -0,0 +1,241 @@
from logging import DEBUG
from typing import AsyncIterator, Callable, List, Optional, assert_never

from aidial_sdk.chat_completion import FinishReason, Message, Stage
from google.genai.client import Client as GenAIClient
from google.genai.types import (
GenerateContentResponse as GenAIGenerateContentResponse,
)
from typing_extensions import override

from aidial_adapter_vertexai.chat.chat_completion_adapter import (
ChatCompletionAdapter,
)
from aidial_adapter_vertexai.chat.consumer import Consumer
from aidial_adapter_vertexai.chat.errors import UserError
from aidial_adapter_vertexai.chat.gemini.error import generate_with_retries
from aidial_adapter_vertexai.chat.gemini.finish_reason import (
genai_to_openai_finish_reason,
)
from aidial_adapter_vertexai.chat.gemini.generation_config import (
create_genai_generation_config,
)
from aidial_adapter_vertexai.chat.gemini.grounding import create_grounding
from aidial_adapter_vertexai.chat.gemini.output import (
create_attachments_from_citations,
create_function_calls_from_genai,
set_usage,
)
from aidial_adapter_vertexai.chat.gemini.prompt.base import GeminiGenAIPrompt
from aidial_adapter_vertexai.chat.gemini.prompt.gemini_2 import Gemini_2_Prompt
from aidial_adapter_vertexai.chat.static_tools import StaticToolsConfig
from aidial_adapter_vertexai.chat.tools import ToolsConfig
from aidial_adapter_vertexai.chat.truncate_prompt import TruncatedPrompt
from aidial_adapter_vertexai.deployments import (
ChatCompletionDeployment,
Gemini2Deployment,
)
from aidial_adapter_vertexai.dial_api.request import ModelParameters
from aidial_adapter_vertexai.dial_api.storage import FileStorage
from aidial_adapter_vertexai.utils.json import json_dumps, json_dumps_short
from aidial_adapter_vertexai.utils.log_config import vertex_ai_logger as log
from aidial_adapter_vertexai.utils.timer import Timer


class GeminiGenAIChatCompletionAdapter(
ChatCompletionAdapter[GeminiGenAIPrompt]
):
deployment: Gemini2Deployment

def __init__(
self,
client: GenAIClient,
file_storage: Optional[FileStorage],
model_id: str,
deployment: Gemini2Deployment,
):
self.file_storage = file_storage
self.model_id = model_id
self.deployment = deployment
self.client = client

@override
async def parse_prompt(
self,
tools: ToolsConfig,
static_tools: StaticToolsConfig,
messages: List[Message],
) -> GeminiGenAIPrompt | UserError:
match self.deployment:
case (
ChatCompletionDeployment.GEMINI_2_0_EXPERIMENTAL_1206
| ChatCompletionDeployment.GEMINI_2_0_FLASH_EXP
| ChatCompletionDeployment.GEMINI_2_0_FLASH_THINKING_EXP_1219
):
return await Gemini_2_Prompt.parse(
self.file_storage, tools, static_tools, messages
)
case _:
assert_never(self.deployment)

async def send_message_async(
self, params: ModelParameters, prompt: GeminiGenAIPrompt
) -> AsyncIterator[GenAIGenerateContentResponse]:

generation_config = create_genai_generation_config(
params,
prompt.tools,
prompt.static_tools,
prompt.system_instruction,
)
if params.stream:
async for chunk in self.client.aio.models.generate_content_stream(
model=self.model_id,
contents=list(prompt.contents),
config=generation_config,
):
yield chunk
else:
yield await self.client.aio.models.generate_content(
model=self.model_id,
contents=list(prompt.contents),
config=generation_config,
)

async def process_chunks(
self,
consumer: Consumer,
tools: ToolsConfig,
generator: Callable[[], AsyncIterator[GenAIGenerateContentResponse]],
):
thinking_stage: Stage | None = None

usage_metadata = None
is_grounding_added = False
try:
async for chunk in generator():
if log.isEnabledFor(DEBUG):
chunk_str = json_dumps(chunk)
log.debug(f"response chunk: {chunk_str}")

if chunk.prompt_feedback:
await consumer.set_finish_reason(
FinishReason.CONTENT_FILTER
)

if chunk.usage_metadata:
usage_metadata = chunk.usage_metadata

if not chunk.candidates:
continue

candidate = chunk.candidates[0]
if candidate.content and candidate.content.parts:
for part in candidate.content.parts:
await create_function_calls_from_genai(
part, consumer, tools
)
if part.thought and part.text:
if thinking_stage is None:
thinking_stage = await consumer.create_stage(
"Thought Process"
)
thinking_stage.open()
thinking_stage.append_content(part.text)
yield part.text
elif part.text:
await consumer.append_content(part.text)
yield part.text

is_grounding_added |= await create_grounding(
candidate, consumer
)

await create_attachments_from_citations(candidate, consumer)
if openai_reason := genai_to_openai_finish_reason(
candidate.finish_reason,
consumer.is_empty(),
):
await consumer.set_finish_reason(openai_reason)
finally:
if thinking_stage:
thinking_stage.close()
# For the thinking model it is possible that the max-tokens limit is reached
# during the thinking stage, leaving no content in the response.
# set_usage would then fail with the error 'Trying to set "usage" before
# generating all choices', so we append empty content to make sure at least
# one choice is generated.
await consumer.append_content("")

if usage_metadata:
await set_usage(
usage_metadata,
consumer,
self.deployment,
is_grounding_added,
)

@override
async def truncate_prompt(
self, prompt: GeminiGenAIPrompt, max_prompt_tokens: int
) -> TruncatedPrompt[GeminiGenAIPrompt]:
return await prompt.truncate(
tokenizer=self.count_prompt_tokens, user_limit=max_prompt_tokens
)

@override
async def count_prompt_tokens(self, prompt: GeminiGenAIPrompt) -> int:
with Timer("count_tokens[prompt] timing: {time}", log.debug):
resp = await self.client.aio.models.count_tokens(
model=self.model_id,
contents=[c for c in prompt.contents],
)
log.debug(f"count_tokens[prompt] response: {json_dumps(resp)}")
if not resp.total_tokens:
raise RuntimeError(
"Failed to count tokens for prompt",
"count_tokens_failed",
)
return resp.total_tokens

@override
async def count_completion_tokens(self, string: str) -> int:
with Timer("count_tokens[completion] timing: {time}", log.debug):
resp = await self.client.aio.models.count_tokens(
model=self.model_id,
contents=string,
)
log.debug(f"count_tokens[completion] response: {json_dumps(resp)}")
if not resp.total_tokens:
raise RuntimeError(
"Failed to count tokens for prompt",
"count_tokens_failed",
)
return resp.total_tokens

@override
async def chat(
self,
params: ModelParameters,
consumer: Consumer,
prompt: GeminiGenAIPrompt,
) -> None:

with Timer("predict timing: {time}", log.debug):
if log.isEnabledFor(DEBUG):
log.debug(
"predict request: "
+ json_dumps_short({"parameters": params, "prompt": prompt})
)

completion = ""
async for content in generate_with_retries(
lambda: self.process_chunks(
consumer,
prompt.tools,
lambda: self.send_message_async(params, prompt),
),
2,
):
completion += content

log.debug(f"predict response: {completion!r}")
35 changes: 35 additions & 0 deletions aidial_adapter_vertexai/chat/gemini/finish_reason.py
@@ -2,6 +2,7 @@

import vertexai.preview.generative_models as generative_models
from aidial_sdk.chat_completion import FinishReason
from google.genai.types import FinishReason as GenAIFinishReason

from aidial_adapter_vertexai.chat.gemini.error import FinishReasonOtherError

@@ -40,3 +41,37 @@ def to_openai_finish_reason(
)
case _:
assert_never(finish_reason)


def genai_to_openai_finish_reason(
finish_reason: GenAIFinishReason | None, retriable: bool
) -> FinishReason | None:
if not finish_reason:
return None
match finish_reason:
case "FINISH_REASON_UNSPECIFIED":
return None
case "MAX_TOKENS":
return FinishReason.LENGTH
case "STOP":
return FinishReason.STOP
case (
"SAFETY"
| "RECITATION"
| "BLOCKLIST"
| "PROHIBITED_CONTENT"
| "SPII"
):
return FinishReason.CONTENT_FILTER
case "OTHER":
raise FinishReasonOtherError(
msg="The model terminated generation unexpectedly",
retriable=retriable,
)
case "MALFORMED_FUNCTION_CALL":
raise FinishReasonOtherError(
msg="The function call generated by the model is invalid",
retriable=retriable,
)
case _:
assert_never(finish_reason)
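A few hedged spot checks of the new mapping (not part of the diff); they assume the GenAI FinishReason enum is string-valued, which the string match arms above already rely on:

from aidial_sdk.chat_completion import FinishReason
from google.genai.types import FinishReason as GenAIFinishReason

from aidial_adapter_vertexai.chat.gemini.finish_reason import (
    genai_to_openai_finish_reason,
)

assert genai_to_openai_finish_reason(GenAIFinishReason.STOP, False) == FinishReason.STOP
assert genai_to_openai_finish_reason(GenAIFinishReason.MAX_TOKENS, False) == FinishReason.LENGTH
assert genai_to_openai_finish_reason(GenAIFinishReason.SAFETY, False) == FinishReason.CONTENT_FILTER
assert genai_to_openai_finish_reason(None, False) is None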