diff --git a/.changeset/great-lizards-pump.md b/.changeset/great-lizards-pump.md
new file mode 100644
index 000000000..a9542b8be
--- /dev/null
+++ b/.changeset/great-lizards-pump.md
@@ -0,0 +1,9 @@
+---
+"livekit-agents": patch
+"livekit-plugins-anthropic": patch
+"livekit-plugins-openai": patch
+---
+
+Add support for OpenAI's "detail" parameter to ChatImage
+
+Add support for data URLs on ChatImage in the Anthropic plugin.
diff --git a/livekit-agents/livekit/agents/llm/chat_context.py b/livekit-agents/livekit/agents/llm/chat_context.py
index 26192bc99..07e36d6c0 100644
--- a/livekit-agents/livekit/agents/llm/chat_context.py
+++ b/livekit-agents/livekit/agents/llm/chat_context.py
@@ -57,6 +57,12 @@ class ChatImage:
     """
     Resizing parameter for rtc.VideoFrame inputs (ignored for URL images)
     """
+    inference_detail: Literal["auto", "high", "low"] = "auto"
+    """
+    Detail parameter for LLM provider, if supported.
+
+    Currently only supported by OpenAI (see https://platform.openai.com/docs/guides/vision?lang=node#low-or-high-fidelity-image-understanding)
+    """
     _cache: dict[Any, Any] = field(default_factory=dict, repr=False, init=False)
     """
     _cache is used internally by LLM implementations to store a processed version of the image
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
index 644d8432e..b48d6ec58 100644
--- a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
+++ b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
@@ -26,6 +26,7 @@
     Literal,
     Tuple,
     Union,
+    cast,
     get_args,
     get_origin,
 )
@@ -427,12 +428,36 @@ def _build_anthropic_message(
 def _build_anthropic_image_content(
     image: llm.ChatImage, cache_key: Any
 ) -> anthropic.types.ImageBlockParam:
-    if isinstance(image.image, str):  # image url
-        logger.warning(
-            "ChatImage with url is not yet supported by the LiveKit Anthropic plugin, skipping image '%s'",
-            image.image,
-        )
-    elif isinstance(image.image, rtc.VideoFrame):  # VideoFrame
+    if isinstance(image.image, str):  # image is a URL
+        if not image.image.startswith("data:"):
+            raise ValueError("LiveKit Anthropic Plugin: Image URLs must be data URLs")
+
+        try:
+            header, b64_data = image.image.split(",", 1)
+            media_type = header.split(";")[0].split(":")[1]
+
+            supported_types = {"image/jpeg", "image/png", "image/webp", "image/gif"}
+            if media_type not in supported_types:
+                raise ValueError(
+                    f"LiveKit Anthropic Plugin: Unsupported media type {media_type}. Must be jpeg, png, webp, or gif"
+                )
+
+            return {
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "data": b64_data,
+                    "media_type": cast(
+                        Literal["image/jpeg", "image/png", "image/gif", "image/webp"],
+                        media_type,
+                    ),
+                },
+            }
+        except (ValueError, IndexError) as e:
+            raise ValueError(
+                f"LiveKit Anthropic Plugin: Invalid image data URL {str(e)}"
+            )
+    elif isinstance(image.image, rtc.VideoFrame):  # image is a VideoFrame
         if cache_key not in image._cache:
             # inside our internal implementation, we allow to put extra metadata to
             # each ChatImage (avoid to reencode each time we do a chatcompletion request)
@@ -456,7 +481,9 @@ def _build_anthropic_image_content(
             },
         }
 
-    raise ValueError(f"unknown image type {type(image.image)}")
+    raise ValueError(
+        "LiveKit Anthropic Plugin: ChatImage must be an rtc.VideoFrame or a data URL"
+    )
 
 
 def _create_ai_function_info(
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
index 883f8b58f..e13116b7e 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/llm.py
@@ -735,7 +735,6 @@ async def _run(self) -> None:
         user = self._user or openai.NOT_GIVEN
 
         messages = _build_oai_context(self._chat_ctx, id(self))
-
         stream = await self._client.chat.completions.create(
             messages=messages,
             model=self._model,
diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
index ed830e59e..278c499a1 100644
--- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
+++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
@@ -67,7 +67,7 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):
     if isinstance(image.image, str):  # image url
         return {
             "type": "image_url",
-            "image_url": {"url": image.image, "detail": "auto"},
+            "image_url": {"url": image.image, "detail": image.inference_detail},
         }
     elif isinstance(image.image, rtc.VideoFrame):  # VideoFrame
         if cache_key not in image._cache:
@@ -86,7 +86,12 @@
 
         return {
             "type": "image_url",
-            "image_url": {"url": f"data:image/jpeg;base64,{image._cache[cache_key]}"},
+            "image_url": {
+                "url": f"data:image/jpeg;base64,{image._cache[cache_key]}",
+                "detail": image.inference_detail,
+            },
         }
 
-    raise ValueError(f"unknown image type {type(image.image)}")
+    raise ValueError(
+        "LiveKit OpenAI Plugin: ChatImage must be an rtc.VideoFrame or a URL"
+    )
diff --git a/tests/.gitattributes b/tests/.gitattributes
index 0fd91ce6d..9a8911093 100644
--- a/tests/.gitattributes
+++ b/tests/.gitattributes
@@ -1,2 +1,4 @@
 long.mp3 filter=lfs diff=lfs merge=lfs -text
 change-sophie.wav filter=lfs diff=lfs merge=lfs -text
+hearts.rgba filter=lfs diff=lfs merge=lfs -text
+hearts.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/tests/hearts.jpg b/tests/hearts.jpg
new file mode 100644
index 000000000..23ecdb8d1
--- /dev/null
+++ b/tests/hearts.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d133e5535163b77b4ea65d4ca7c9dbe81f4a24fad530f24b9a31b3bde1e1c38
+size 151017
diff --git a/tests/hearts.rgba b/tests/hearts.rgba
new file mode 100644
index 000000000..d40a5334b
--- /dev/null
+++ b/tests/hearts.rgba
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:06087a10c1864e6644d16a6e508852e678ad1a96e4d99bd8056bb7f60ab765cc
+size 1048576
diff --git a/tests/test_llm.py b/tests/test_llm.py
index 906e27462..4b71c0324 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -1,13 +1,16 @@
 from __future__ import annotations
 
 import asyncio
+import base64
 from enum import Enum
+from pathlib import Path
 from typing import Annotated, Callable, Literal, Optional, Union
 
 import pytest
 from livekit.agents import APIConnectionError, llm
 from livekit.agents.llm import ChatContext, FunctionContext, TypeInfo, ai_callable
 from livekit.plugins import anthropic, openai
+from livekit.rtc import VideoBufferType, VideoFrame
 
 
 class Unit(Enum):
@@ -369,3 +372,82 @@ async def _request_fnc_call(
         pass
 
     return stream
+
+
+_HEARTS_RGBA_PATH = Path(__file__).parent / "hearts.rgba"
+with open(_HEARTS_RGBA_PATH, "rb") as f:
+    image_data = f.read()
+
+    _HEARTS_IMAGE_VIDEO_FRAME = VideoFrame(
+        width=512, height=512, type=VideoBufferType.RGBA, data=image_data
+    )
+
+_HEARTS_JPEG_PATH = Path(__file__).parent / "hearts.jpg"
+with open(_HEARTS_JPEG_PATH, "rb") as f:
+    _HEARTS_IMAGE_DATA_URL = (
+        f"data:image/jpeg;base64,{base64.b64encode(f.read()).decode()}"
+    )
+
+
+@pytest.mark.parametrize("llm_factory", LLMS)
+async def test_chat_with_image_data_url(llm_factory: Callable[[], llm.LLM]):
+    input_llm = llm_factory()
+
+    chat_ctx = (
+        ChatContext()
+        .append(
+            text="You are an AI assistant that describes images in detail upon request.",
+            role="system",
+        )
+        .append(
+            text="Describe this image",
+            images=[
+                llm.ChatImage(image=_HEARTS_IMAGE_DATA_URL, inference_detail="low")
+            ],
+            role="user",
+        )
+    )
+
+    stream = input_llm.chat(chat_ctx=chat_ctx)
+    text = ""
+    async for chunk in stream:
+        if not chunk.choices:
+            continue
+
+        content = chunk.choices[0].delta.content
+        if content:
+            text += content
+
+    assert "heart" in text.lower()
+
+
+@pytest.mark.parametrize("llm_factory", LLMS)
+async def test_chat_with_image_frame(llm_factory: Callable[[], llm.LLM]):
+    input_llm = llm_factory()
+
+    chat_ctx = (
+        ChatContext()
+        .append(
+            text="You are an AI assistant that describes images in detail upon request.",
+            role="system",
+        )
+        .append(
+            text="Describe this image",
+            images=[
+                llm.ChatImage(image=_HEARTS_IMAGE_VIDEO_FRAME, inference_detail="low")
+            ],
+            role="user",
+        )
+    )
+
+    stream = input_llm.chat(chat_ctx=chat_ctx)
+    text = ""
+    async for chunk in stream:
+        if not chunk.choices:
+            continue
+
+        content = chunk.choices[0].delta.content
+        if content:
+            text += content
+
+    assert "heart" in text.lower()
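For quick reference, a minimal usage sketch of the `inference_detail` parameter and the data-URL support introduced above. Only `ChatContext.append`, `llm.ChatImage`, `inference_detail`, and the streaming `chat()` interface come from this diff; the `openai.LLM()` constructor with its default model and the local `hearts.jpg` path are assumptions for illustration.

```python
import asyncio
import base64

from livekit.agents import llm
from livekit.agents.llm import ChatContext
from livekit.plugins import openai  # the Anthropic plugin consumes the same ChatContext


async def main() -> None:
    # Encode the image as a data URL: the Anthropic plugin only accepts data URLs,
    # while the OpenAI plugin accepts both data URLs and regular http(s) URLs.
    with open("hearts.jpg", "rb") as f:  # hypothetical local file
        data_url = f"data:image/jpeg;base64,{base64.b64encode(f.read()).decode()}"

    chat_ctx = ChatContext().append(
        text="Describe this image",
        # "low" trades image fidelity for fewer tokens; "auto" (the default) and
        # "high" are also accepted. The value is forwarded as OpenAI's "detail".
        images=[llm.ChatImage(image=data_url, inference_detail="low")],
        role="user",
    )

    # openai.LLM() with its default model is an assumption; any plugin LLM that
    # supports image input can consume the same chat context.
    stream = openai.LLM().chat(chat_ctx=chat_ctx)
    async for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="")


asyncio.run(main())
```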