Add support for OpenAI image detail parameter, and add support for Anthropic image data URLs #1213

Merged on Dec 13, 2024
Changes from 16 commits
Commits (37)
b1a1cfb - add support for specifying openai detail on images (bcherry, Dec 11, 2024)
11b06dc - Merge remote-tracking branch 'origin/main' into bcherry/oai-detail (bcherry, Dec 12, 2024)
9ec010e - anthropic (bcherry, Dec 12, 2024)
ff478ed - cleanup (bcherry, Dec 12, 2024)
3b06850 - updates (bcherry, Dec 12, 2024)
f2e05f0 - Create great-lizards-pump.md (bcherry, Dec 12, 2024)
fa4a251 - fmt (bcherry, Dec 12, 2024)
91bb7c6 - Merge remote-tracking branch 'origin/bcherry/oai-detail' into bcherry… (bcherry, Dec 12, 2024)
c647779 - basic url test (bcherry, Dec 12, 2024)
9d99036 - image test (bcherry, Dec 12, 2024)
52e31d5 - fmt (bcherry, Dec 12, 2024)
e9ba70c - rmbmp (bcherry, Dec 12, 2024)
f1f2f4b - Move hearts.rgba to Git LFS (bcherry, Dec 12, 2024)
b2b3f28 - attr (bcherry, Dec 12, 2024)
005de7d - cast (bcherry, Dec 12, 2024)
3838388 - imp (bcherry, Dec 12, 2024)
80cffdf - lfs (bcherry, Dec 12, 2024)
23124d6 - typ (bcherry, Dec 12, 2024)
ca89d4c - improvements to endpointing latency (#1212) (davidzhao, Dec 12, 2024)
b45cf88 - Handle optional func args in tool calls when set to `None` (#1211) (jayeshp19, Dec 12, 2024)
f169a0b - add `google/gemini-2.0-flash-exp` as default model for vertex (#1214) (jayeshp19, Dec 12, 2024)
7b7e3a9 - Version Packages (#1188) (github-actions[bot], Dec 12, 2024)
01f9964 - Support Deepgram TTS (#1201) (jayeshp19, Dec 12, 2024)
cb38402 - fix release of new plugins (#1216) (theomonnom, Dec 12, 2024)
a77aed3 - anthropic (bcherry, Dec 12, 2024)
8a376af - cleanup (bcherry, Dec 12, 2024)
8450c9c - updates (bcherry, Dec 12, 2024)
02784db - fmt (bcherry, Dec 12, 2024)
d24a4e9 - Create great-lizards-pump.md (bcherry, Dec 12, 2024)
b9e30ab - basic url test (bcherry, Dec 12, 2024)
8a0bfbe - image test (bcherry, Dec 12, 2024)
979bb17 - cast (bcherry, Dec 12, 2024)
6cc6fc5 - imp (bcherry, Dec 12, 2024)
9b96433 - typ (bcherry, Dec 12, 2024)
10e0e09 - Merge remote-tracking branch 'origin/main' into bcherry/oai-detail (bcherry, Dec 12, 2024)
039e55e - Merge remote-tracking branch 'origin/bcherry/oai-detail' into bcherry… (bcherry, Dec 12, 2024)
483eae1 - Merge remote-tracking branch 'origin/main' into bcherry/oai-detail (bcherry, Dec 13, 2024)
9 changes: 9 additions & 0 deletions .changeset/great-lizards-pump.md
@@ -0,0 +1,9 @@
---
"livekit-agents": patch
"livekit-plugins-anthropic": patch
"livekit-plugins-openai": patch
---

Add support for OpenAI's "detail" parameter to ChatImage

Add support for data URLs on ChatImage in the Anthropic plugin.
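
For reference, a minimal usage sketch combining both additions (not part of the diff; it follows the ChatContext/ChatImage API used in the tests below, and the openai.LLM() constructor name is assumed):

    from livekit.agents import llm
    from livekit.plugins import openai

    chat_ctx = llm.ChatContext().append(
        text="Describe this image",
        images=[
            llm.ChatImage(
                # data URLs are accepted by the OpenAI plugin and, with this PR, the Anthropic plugin
                image="data:image/jpeg;base64,<BASE64_DATA>",
                # forwarded to OpenAI's "detail" parameter; other providers currently ignore it
                inference_detail="low",
            )
        ],
        role="user",
    )
    stream = openai.LLM().chat(chat_ctx=chat_ctx)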
6 changes: 6 additions & 0 deletions livekit-agents/livekit/agents/llm/chat_context.py
@@ -57,6 +57,12 @@ class ChatImage:
"""
Resizing parameter for rtc.VideoFrame inputs (ignored for URL images)
"""
inference_detail: Literal["auto", "high", "low"] = "auto"
"""
Detail parameter for LLM provider, if supported.

Currently only supported by OpenAI (see https://platform.openai.com/docs/guides/vision?lang=node#low-or-high-fidelity-image-understanding)
"""
_cache: dict[Any, Any] = field(default_factory=dict, repr=False, init=False)
"""
_cache is used internally by LLM implementations to store a processed version of the image
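
For the rtc.VideoFrame path, a short sketch of the new field in use (again not part of the diff; frame size and pixel data are placeholders, and the construction mirrors tests/test_llm.py below):

    from livekit import rtc
    from livekit.agents import llm

    # 16x16 opaque black RGBA frame used as placeholder pixel data
    frame = rtc.VideoFrame(
        width=16,
        height=16,
        type=rtc.VideoBufferType.RGBA,
        data=b"\x00\x00\x00\xff" * 16 * 16,
    )

    # "high" asks OpenAI for full-resolution image understanding; the default is "auto"
    image = llm.ChatImage(image=frame, inference_detail="high")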
@@ -26,6 +26,7 @@
Literal,
Tuple,
Union,
cast,
get_args,
get_origin,
)
@@ -427,12 +428,36 @@ def _build_anthropic_message(
def _build_anthropic_image_content(
image: llm.ChatImage, cache_key: Any
) -> anthropic.types.ImageBlockParam:
if isinstance(image.image, str): # image url
logger.warning(
"ChatImage with url is not yet supported by the LiveKit Anthropic plugin, skipping image '%s'",
image.image,
)
elif isinstance(image.image, rtc.VideoFrame): # VideoFrame
if isinstance(image.image, str): # image is a URL
if not image.image.startswith("data:"):
raise ValueError("LiveKit Anthropic Plugin: Image URLs must be data URLs")

try:
header, b64_data = image.image.split(",", 1)
media_type = header.split(";")[0].split(":")[1]

supported_types = {"image/jpeg", "image/png", "image/webp", "image/gif"}
if media_type not in supported_types:
raise ValueError(
f"LiveKit Anthropic Plugin: Unsupported media type {media_type}. Must be jpeg, png, webp, or gif"
)

return {
"type": "image",
"source": {
"type": "base64",
"data": b64_data,
"media_type": cast(
Literal["image/jpeg", "image/png", "image/gif", "image/webp"],
media_type,
),
},
}
except (ValueError, IndexError) as e:
raise ValueError(
f"LiveKit Anthropic Plugin: Invalid image data URL {str(e)}"
)
elif isinstance(image.image, rtc.VideoFrame): # image is a VideoFrame
if cache_key not in image._cache:
# inside our internal implementation, we allow extra metadata to be attached to
# each ChatImage (to avoid re-encoding on every chat completion request)
@@ -456,7 +481,9 @@ def _build_anthropic_image_content(
},
}

raise ValueError(f"unknown image type {type(image.image)}")
raise ValueError(
"LiveKit OpenAI Plugin: ChatImage must be an rtc.VideoFrame or a data URL"
)


def _create_ai_function_info(
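
Because the Anthropic plugin now accepts only data URLs for string images, a small helper along these lines (a sketch, not part of this PR) can produce the expected format from raw image bytes:

    import base64

    def to_data_url(image_bytes: bytes, media_type: str = "image/jpeg") -> str:
        # media_type must be image/jpeg, image/png, image/webp, or image/gif
        # to pass the validation added in _build_anthropic_image_content above
        encoded = base64.b64encode(image_bytes).decode()
        return f"data:{media_type};base64,{encoded}"

    # usage:
    # with open("photo.jpg", "rb") as f:
    #     image = llm.ChatImage(image=to_data_url(f.read()), inference_detail="auto")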
@@ -729,7 +729,6 @@ async def _run(self) -> None:

user = self._user or openai.NOT_GIVEN
messages = _build_oai_context(self._chat_ctx, id(self))

stream = await self._client.chat.completions.create(
messages=messages,
model=self._model,
@@ -67,7 +67,7 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):
if isinstance(image.image, str): # image url
return {
"type": "image_url",
"image_url": {"url": image.image, "detail": "auto"},
"image_url": {"url": image.image, "detail": image.inference_detail},
}
elif isinstance(image.image, rtc.VideoFrame): # VideoFrame
if cache_key not in image._cache:
@@ -86,7 +86,12 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):

return {
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image._cache[cache_key]}"},
"image_url": {
"url": f"data:image/jpeg;base64,{image._cache[cache_key]}",
"detail": image.inference_detail,
},
}

raise ValueError(f"unknown image type {type(image.image)}")
raise ValueError(
"LiveKit OpenAI Plugin: ChatImage must be an rtc.VideoFrame or a URL"
)
1 change: 1 addition & 0 deletions tests/.gitattributes
@@ -1,2 +1,3 @@
long.mp3 filter=lfs diff=lfs merge=lfs -text
change-sophie.wav filter=lfs diff=lfs merge=lfs -text
hearts.rgba filter=lfs diff=lfs merge=lfs -text
Binary file added tests/hearts.jpg
3 changes: 3 additions & 0 deletions tests/hearts.rgba
Git LFS file not shown
82 changes: 82 additions & 0 deletions tests/test_llm.py
@@ -1,13 +1,16 @@
from __future__ import annotations

import asyncio
import base64
from enum import Enum
from pathlib import Path
from typing import Annotated, Callable, Literal, Optional, Union

import pytest
from livekit.agents import APIConnectionError, llm
from livekit.agents.llm import ChatContext, FunctionContext, TypeInfo, ai_callable
from livekit.plugins import anthropic, openai
from livekit.rtc import VideoBufferType, VideoFrame


class Unit(Enum):
@@ -369,3 +372,82 @@ async def _request_fnc_call(
pass

return stream


_HEARTS_RGBA_PATH = Path(__file__).parent / "hearts.rgba"
with open(_HEARTS_RGBA_PATH, "rb") as f:
image_data = f.read()

_HEARTS_IMAGE_VIDEO_FRAME = VideoFrame(
width=512, height=512, type=VideoBufferType.RGBA, data=image_data
)

_HEARTS_JPEG_PATH = Path(__file__).parent / "hearts.jpg"
with open(_HEARTS_JPEG_PATH, "rb") as f:
_HEARTS_IMAGE_DATA_URL = (
f"data:image/jpeg;base64,{base64.b64encode(f.read()).decode()}"
)


@pytest.mark.parametrize("llm_factory", LLMS)
async def test_chat_with_image_data_url(llm_factory: Callable[[], llm.LLM]):
input_llm = llm_factory()

chat_ctx = (
ChatContext()
.append(
text="You are an AI assistant that describes images in detail upon request.",
role="system",
)
.append(
text="Describe this image",
images=[
llm.ChatImage(image=_HEARTS_IMAGE_DATA_URL, inference_detail="low")
],
role="user",
)
)

stream = input_llm.chat(chat_ctx=chat_ctx)
text = ""
async for chunk in stream:
if not chunk.choices:
continue

content = chunk.choices[0].delta.content
if content:
text += content

assert "heart" in text.lower()


@pytest.mark.parametrize("llm_factory", LLMS)
async def test_chat_with_image_frame(llm_factory: Callable[[], llm.LLM]):
input_llm = llm_factory()

chat_ctx = (
ChatContext()
.append(
text="You are an AI assistant that describes images in detail upon request.",
role="system",
)
.append(
text="Describe this image",
images=[
llm.ChatImage(image=_HEARTS_IMAGE_VIDEO_FRAME, inference_detail="low")
],
role="user",
)
)

stream = input_llm.chat(chat_ctx=chat_ctx)
text = ""
async for chunk in stream:
if not chunk.choices:
continue

content = chunk.choices[0].delta.content
if content:
text += content

assert "heart" in text.lower()