Merge remote-tracking branch 'origin/main' into bcherry/oai-detail

livekit · Dec 13, 2024 · 483eae1 · 483eae1
2 parents 039e55e + d589a1e
commit 483eae1
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 6 deletions.
diff --git a/.changeset/tiny-papayas-film.md b/.changeset/tiny-papayas-film.md
@@ -0,0 +1,9 @@
+---
+"livekit-agents": patch
+"livekit-plugins-anthropic": patch
+"livekit-plugins-openai": patch
+---
+
+Fix center_aspect_fit bug, add scale_aspect_fit and scale_aspect_fill resizing options.
+
+Make scale_aspect_fit the new default resizing option for video frames.
diff --git a/livekit-agents/livekit/agents/utils/images/image.py b/livekit-agents/livekit/agents/utils/images/image.py
@@ -33,7 +33,18 @@ class EncodeOptions:
 class ResizeOptions:
     width: int
     height: int
-    strategy: Literal["center_aspect_fit", "center_aspect_cover", "skew"]
+    strategy: Literal[
+        # Fit the image into the provided dimensions, with letterboxing
+        "center_aspect_fit",
+        # Fill the provided dimensions, with cropping
+        "center_aspect_cover",
+        # Fit the image into the provided dimensions, preserving its original aspect ratio
+        "scale_aspect_fit",
+        # Fill the provided dimensions, preserving its original aspect ratio (one dimension matches; the other may extend beyond the provided dimensions)
+        "scale_aspect_cover",
+        # Precisely resize the image to the provided dimensions
+        "skew",
+    ]
 
 
 def import_pil():
@@ -83,10 +94,11 @@ def _resize_image(image: Any, options: EncodeOptions):
 
         # If the new image is wider than the original
         if resize_opts.width / resize_opts.height > image.width / image.height:
-            new_width = resize_opts.width
-            new_height = int(image.height * (resize_opts.width / image.width))
+            new_height = resize_opts.height
+            new_width = int(image.width * (resize_opts.height / image.height))
 
         resized = image.resize((new_width, new_height))
+
         Image.Image.paste(
             result,
             resized,
@@ -118,5 +130,27 @@ def _resize_image(image: Any, options: EncodeOptions):
             ),
         )
         return result
+    elif resize_opts.strategy == "scale_aspect_fill":
+        # Start by assuming width is the limiting dimension
+        new_width = resize_opts.width
+        new_height = int(image.height * (resize_opts.width / image.width))
+
+        # If height is under the limit, scale based on height instead
+        if new_height < resize_opts.height:
+            new_height = resize_opts.height
+            new_width = int(image.width * (resize_opts.height / image.height))
+
+        return image.resize((new_width, new_height))
+    elif resize_opts.strategy == "scale_aspect_fit":
+        # Start by assuming width is the limiting dimension
+        new_width = resize_opts.width
+        new_height = int(image.height * (resize_opts.width / image.width))
+
+        # If height would exceed the limit, scale based on height instead
+        if new_height > resize_opts.height:
+            new_height = resize_opts.height
+            new_width = int(image.width * (resize_opts.height / image.height))
+
+        return image.resize((new_width, new_height))
 
     raise ValueError(f"Unknown resize strategy: {resize_opts.strategy}")
diff --git a/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py b/livekit-plugins/livekit-plugins-anthropic/livekit/plugins/anthropic/llm.py
@@ -466,7 +466,7 @@ def _build_anthropic_image_content(
                 opts.resize_options = utils.images.ResizeOptions(
                     width=image.inference_width,
                     height=image.inference_height,
-                    strategy="center_aspect_fit",
+                    strategy="scale_aspect_fit",
                 )
 
             encoded_data = utils.images.encode(image.image, opts)

diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/beta/assistant_llm.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/beta/assistant_llm.py
@@ -532,7 +532,7 @@ async def _upload_frame(
             opts.resize_options = utils.images.ResizeOptions(
                 width=inference_width,
                 height=inference_height,
-                strategy="center_aspect_fit",
+                strategy="scale_aspect_fit",
             )
 
         encoded_data = utils.images.encode(frame, opts)

diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/utils.py
@@ -78,7 +78,7 @@ def _build_oai_image_content(image: llm.ChatImage, cache_key: Any):
                 opts.resize_options = utils.images.ResizeOptions(
                     width=image.inference_width,
                     height=image.inference_height,
-                    strategy="center_aspect_fit",
+                    strategy="scale_aspect_fit",
                 )
 
             encoded_data = utils.images.encode(image.image, opts)