Add support for OpenAI image detail parameter, and add support for Anthropic image data URLs #1213

Merged on Dec 13, 2024
Changes from 16 commits
Commits (37)
b1a1cfb - add support for specifying openai detail on images (bcherry, Dec 11, 2024)
11b06dc - Merge remote-tracking branch 'origin/main' into bcherry/oai-detail (bcherry, Dec 12, 2024)
9ec010e - anthropic (bcherry, Dec 12, 2024)
ff478ed - cleanup (bcherry, Dec 12, 2024)
3b06850 - updates (bcherry, Dec 12, 2024)
f2e05f0 - Create great-lizards-pump.md (bcherry, Dec 12, 2024)
fa4a251 - fmt (bcherry, Dec 12, 2024)
91bb7c6 - Merge remote-tracking branch 'origin/bcherry/oai-detail' into bcherry… (bcherry, Dec 12, 2024)
c647779 - basic url test (bcherry, Dec 12, 2024)
9d99036 - image test (bcherry, Dec 12, 2024)
52e31d5 - fmt (bcherry, Dec 12, 2024)
e9ba70c - rmbmp (bcherry, Dec 12, 2024)
f1f2f4b - Move hearts.rgba to Git LFS (bcherry, Dec 12, 2024)
b2b3f28 - attr (bcherry, Dec 12, 2024)
005de7d - cast (bcherry, Dec 12, 2024)
3838388 - imp (bcherry, Dec 12, 2024)
80cffdf - lfs (bcherry, Dec 12, 2024)
23124d6 - typ (bcherry, Dec 12, 2024)
ca89d4c - improvements to endpointing latency (#1212) (davidzhao, Dec 12, 2024)
b45cf88 - Handle optional func args in tool calls when set to `None` (#1211) (jayeshp19, Dec 12, 2024)
f169a0b - add `google/gemini-2.0-flash-exp` as default model for vertex (#1214) (jayeshp19, Dec 12, 2024)
7b7e3a9 - Version Packages (#1188) (github-actions[bot], Dec 12, 2024)
01f9964 - Support Deepgram TTS (#1201) (jayeshp19, Dec 12, 2024)
cb38402 - fix release of new plugins (#1216) (theomonnom, Dec 12, 2024)
a77aed3 - anthropic (bcherry, Dec 12, 2024)
8a376af - cleanup (bcherry, Dec 12, 2024)
8450c9c - updates (bcherry, Dec 12, 2024)
02784db - fmt (bcherry, Dec 12, 2024)
d24a4e9 - Create great-lizards-pump.md (bcherry, Dec 12, 2024)
b9e30ab - basic url test (bcherry, Dec 12, 2024)
8a0bfbe - image test (bcherry, Dec 12, 2024)
979bb17 - cast (bcherry, Dec 12, 2024)
6cc6fc5 - imp (bcherry, Dec 12, 2024)
9b96433 - typ (bcherry, Dec 12, 2024)
10e0e09 - Merge remote-tracking branch 'origin/main' into bcherry/oai-detail (bcherry, Dec 12, 2024)
039e55e - Merge remote-tracking branch 'origin/bcherry/oai-detail' into bcherry… (bcherry, Dec 12, 2024)
483eae1 - Merge remote-tracking branch 'origin/main' into bcherry/oai-detail (bcherry, Dec 13, 2024)
9 changes: 9 additions & 0 deletions .changeset/great-lizards-pump.md
@@ -0,0 +1,9 @@
---
"livekit-agents": patch
"livekit-plugins-anthropic": patch
"livekit-plugins-openai": patch
---

Add support for OpenAI's "detail" parameter to ChatImage

Add support for data URLs on ChatImage in the Anthropic plugin.
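
For reference, a minimal usage sketch combining both additions (not part of the diff; it follows the ChatContext/ChatImage API used in the tests below, and the openai.LLM() constructor name is assumed):

    from livekit.agents import llm
    from livekit.plugins import openai

    chat_ctx = llm.ChatContext().append(
        text="Describe this image",
        images=[
            llm.ChatImage(
                # data URLs are accepted by the OpenAI plugin and, with this PR, the Anthropic plugin
                image="data:image/jpeg;base64,<BASE64_DATA>",
                # forwarded to OpenAI's "detail" parameter; other providers currently ignore it
                inference_detail="low",
            )
        ],
        role="user",
    )
    stream = openai.LLM().chat(chat_ctx=chat_ctx)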
6 changes: 6 additions & 0 deletions livekit-agents/livekit/agents/llm/chat_context.py
@@ -57,6 +57,12 @@ class ChatImage:
"""
Resizing parameter for rtc.VideoFrame inputs (ignored for URL images)
"""
inference_detail: Literal["auto", "high", "low"] = "auto"
"""
Detail parameter for LLM provider, if supported.

Currently only supported by OpenAI (see https://platform.openai.com/docs/guides/vision?lang=node#low-or-high-fidelity-image-understanding)
"""
_cache: dict[Any, Any] = field(default_factory=dict, repr=False, init=False)
"""
_cache is used internally by LLM implementations to store a processed version of the image
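
For the rtc.VideoFrame path, a short sketch of the new field in use (again not part of the diff; frame size and pixel data are placeholders, and the construction mirrors tests/test_llm.py below):

    from livekit import rtc
    from livekit.agents import llm

    # 16x16 opaque black RGBA frame used as placeholder pixel data
    frame = rtc.VideoFrame(
        width=16,
        height=16,
        type=rtc.VideoBufferType.RGBA,
        data=b"\x00\x00\x00\xff" * 16 * 16,
    )

    # "high" asks OpenAI for full-resolution image understanding; the default is "auto"
    image = llm.ChatImage(image=frame, inference_detail="high")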
@@ -26,6 +26,7 @@
Literal,
Tuple,
Union,
cast,
get_args,
get_origin,
)
@@ -427,12 +428,36 @@ def _build_anthropic_message(
def _build_anthropic_image_content(
image: llm.ChatImage, cache_key: Any
) -> anthropic.types.ImageBlockParam:
if isinstance(image.image, str): # image url
logger.warning(
"ChatImage with url is not yet supported by the LiveKit Anthropic plugin, skipping image '%s'",
image.image,
)
elif isinstance(image.image, rtc.VideoFrame): # VideoFrame
if isinstance(image.image, str): # image is a URL
if not image.image.startswith("data:"):
raise ValueError("LiveKit Anthropic Plugin: Image URLs must be data URLs")

try:
header, b64_data = image.image.split(",", 1)
media_type = header.split(";")[0].split(":")[1]

supported_types = {"image/jpeg", "image/png", "image/webp", "image/gif"}
if media_type not in supported_types:
raise ValueError(
f"LiveKit Anthropic Plugin: Unsupported media type {media_type}. Must be jpeg, png, webp, or gif"
)

return {
"type": "image",
"source": {
"type": "base64",
"data": b64_data,
"media_type": cast(
Literal["image/jpeg", "image/png", "image/gif", "image/webp"],
media_type,
),
},
}
except (ValueError, IndexError) as e:
raise ValueError(
f"LiveKit Anthropic Plugin: Invalid image data URL {str(e)}"
)
elif isinstance(image.image, rtc.VideoFrame): # image is a VideoFrame
if cache_key not in image._cache:
# inside our internal implementation, we allow extra metadata to be attached to
# each ChatImage (to avoid re-encoding on every chat completion request)
@@ -456,7 +481,9 @@ def _build_anthropic_image_content(
},
}

raise ValueError(f"unknown image type {type(image.image)}")
raise ValueError(
"LiveKit OpenAI Plugin: ChatImage must be an rtc.VideoFrame or a data URL"
)


def _create_ai_function_info(
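
Because the Anthropic plugin now accepts only data URLs for string images, a small helper along these lines (a sketch, not part of this PR) can produce the expected format from raw image bytes:

    import base64

    def to_data_url(image_bytes: bytes, media_type: str = "image/jpeg") -> str:
        # media_type must be image/jpeg, image/png, image/webp, or image/gif
        # to pass the validation added in _build_anthropic_image_content above
        encoded = base64.b64encode(image_bytes).decode()
        return f"data:{media_type};base64,{encoded}"

    # usage:
    # with open("photo.jpg", "rb") as f:
    #     image = llm.ChatImage(image=to_data_url(f.read()), inference_detail="auto")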
@@ -729,7 +729,6 @@ async def _run(self) -> None:

user = self._user or openai.NOT_GIVEN
messages = _build_oai_context(self._chat_ctx, id(self))

stream = await self._client.chat.completions.create(
messages=messages,
model=self._model,
@@ -67,7 +67,7 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):
if isinstance(image.image, str): # image url
return {
"type": "image_url",
"image_url": {"url": image.image, "detail": "auto"},
"image_url": {"url": image.image, "detail": image.inference_detail},
}
elif isinstance(image.image, rtc.VideoFrame): # VideoFrame
if cache_key not in image._cache:
@@ -86,7 +86,12 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):

return {
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{image._cache[cache_key]}"},
"image_url": {
"url": f"data:image/jpeg;base64,{image._cache[cache_key]}",
"detail": image.inference_detail,
},
}

raise ValueError(f"unknown image type {type(image.image)}")
raise ValueError(
"LiveKit OpenAI Plugin: ChatImage must be an rtc.VideoFrame or a URL"
)
1 change: 1 addition & 0 deletions tests/.gitattributes
@@ -1,2 +1,3 @@
long.mp3 filter=lfs diff=lfs merge=lfs -text
change-sophie.wav filter=lfs diff=lfs merge=lfs -text
hearts.rgba filter=lfs diff=lfs merge=lfs -text
Binary file added tests/hearts.jpg
3 changes: 3 additions & 0 deletions tests/hearts.rgba
Git LFS file not shown
82 changes: 82 additions & 0 deletions tests/test_llm.py
@@ -1,13 +1,16 @@
from __future__ import annotations

import asyncio
import base64
from enum import Enum
from pathlib import Path
from typing import Annotated, Callable, Literal, Optional, Union

import pytest
from livekit.agents import APIConnectionError, llm
from livekit.agents.llm import ChatContext, FunctionContext, TypeInfo, ai_callable
from livekit.plugins import anthropic, openai
from livekit.rtc import VideoBufferType, VideoFrame


class Unit(Enum):
@@ -369,3 +372,82 @@ async def _request_fnc_call(
pass

return stream


_HEARTS_RGBA_PATH = Path(__file__).parent / "hearts.rgba"
with open(_HEARTS_RGBA_PATH, "rb") as f:
image_data = f.read()

_HEARTS_IMAGE_VIDEO_FRAME = VideoFrame(
width=512, height=512, type=VideoBufferType.RGBA, data=image_data
)

_HEARTS_JPEG_PATH = Path(__file__).parent / "hearts.jpg"
with open(_HEARTS_JPEG_PATH, "rb") as f:
_HEARTS_IMAGE_DATA_URL = (
f"data:image/jpeg;base64,{base64.b64encode(f.read()).decode()}"
)


@pytest.mark.parametrize("llm_factory", LLMS)
async def test_chat_with_image_data_url(llm_factory: Callable[[], llm.LLM]):
input_llm = llm_factory()

chat_ctx = (
ChatContext()
.append(
text="You are an AI assistant that describes images in detail upon request.",
role="system",
)
.append(
text="Describe this image",
images=[
llm.ChatImage(image=_HEARTS_IMAGE_DATA_URL, inference_detail="low")
],
role="user",
)
)

stream = input_llm.chat(chat_ctx=chat_ctx)
text = ""
async for chunk in stream:
if not chunk.choices:
continue

content = chunk.choices[0].delta.content
if content:
text += content

assert "heart" in text.lower()


@pytest.mark.parametrize("llm_factory", LLMS)
async def test_chat_with_image_frame(llm_factory: Callable[[], llm.LLM]):
input_llm = llm_factory()

chat_ctx = (
ChatContext()
.append(
text="You are an AI assistant that describes images in detail upon request.",
role="system",
)
.append(
text="Describe this image",
images=[
llm.ChatImage(image=_HEARTS_IMAGE_VIDEO_FRAME, inference_detail="low")
],
role="user",
)
)

stream = input_llm.chat(chat_ctx=chat_ctx)
text = ""
async for chunk in stream:
if not chunk.choices:
continue

content = chunk.choices[0].delta.content
if content:
text += content

assert "heart" in text.lower()