feat: allow uploading multiple images in a single generation

HanaokaYuzu · May 23, 2024 · e978a5f · e978a5f
1 parent 17523c2
commit e978a5f
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -137,11 +137,14 @@ asyncio.run(main())
 
 ### Generate contents from image
 
-Gemini supports image recognition and generate contents from image (currently only supports one image at a time). Optionally, you can pass image data in `bytes` or its path in `str` to `GeminiClient.generate_content` together with text prompt.
+Gemini supports image recognition and can generate content from images. Optionally, you can pass a list of images, each given either as file data in `bytes` or as a file path in `str`, to `GeminiClient.generate_content` together with a text prompt.
 
 ```python
 async def main():
-    response = await client.generate_content("Describe the image", image="assets/banner.png")
+    response = await client.generate_content(
+        "Describe each of these images",
+        images=["assets/banner.png", "assets/favicon.png"],
+    )
     print(response.text)
 
 asyncio.run(main())

diff --git a/assets/favicon.png b/assets/favicon.png
diff --git a/src/gemini_webapi/client.py b/src/gemini_webapi/client.py
@@ -108,12 +108,12 @@ def __init__(
         self.cookies = {}
         self.proxies = proxies
         self.running: bool = False
-        self.client: AsyncClient = None
-        self.access_token: str = None
+        self.client: AsyncClient | None = None
+        self.access_token: str | None = None
         self.timeout: float = 30
         self.auto_close: bool = False
         self.close_delay: float = 300
-        self.close_task: Task = None
+        self.close_task: Task | None = None
         self.auto_refresh: bool = True
         self.refresh_interval: float = 540
 
@@ -252,7 +252,7 @@ async def start_auto_refresh(self) -> None:
     async def generate_content(
         self,
         prompt: str,
-        image: bytes | str | None = None,
+        images: list[bytes | str] | None = None,
         chat: Optional["ChatSession"] = None,
     ) -> ModelOutput:
         """
@@ -262,8 +262,8 @@ async def generate_content(
         ----------
         prompt: `str`
             Prompt provided by user.
-        image: `bytes` | `str`, optional
-            File data in bytes, or path to the image file to be sent together with the prompt.
+        images: `list[bytes | str]`, optional
+            List of images to send together with the prompt, each given either as file data in bytes or as a file path string.
         chat: `ChatSession`, optional
             Chat data to retrieve conversation history. If None, will automatically generate a new chat id when sending post request.
 
@@ -300,12 +300,22 @@ async def generate_content(
                             None,
                             json.dumps(
                                 [
-                                    image
+                                    images
                                     and [
                                         prompt,
                                         0,
                                         None,
-                                        [[[await upload_file(image, self.proxies), 1]]],
+                                        [
+                                            [
+                                                [
+                                                    await upload_file(
+                                                        image, self.proxies
+                                                    ),
+                                                    1,
+                                                ]
+                                            ]
+                                            for image in images
+                                        ],
                                     ]
                                     or [prompt],
                                     None,
@@ -475,7 +485,7 @@ def __setattr__(self, name: str, value: Any) -> None:
             self.rcid = value.rcid
 
     async def send_message(
-        self, prompt: str, image: bytes | str | None = None
+        self, prompt: str, images: list[bytes | str] | None = None,
     ) -> ModelOutput:
         """
         Generates contents with prompt.
@@ -485,8 +495,8 @@ async def send_message(
         ----------
         prompt: `str`
             Prompt provided by user.
-        image: `bytes` | `str`, optional
-            File data in bytes, or path to the image file to be sent together with the prompt.
+        images: `list[bytes | str]`, optional
+            List of images to send together with the prompt, each given either as file data in bytes or as a file path string.
 
         Returns
         -------
@@ -507,7 +517,7 @@ async def send_message(
             - If response structure is invalid and failed to parse.
         """
         return await self.geminiclient.generate_content(
-            prompt=prompt, image=image, chat=self
+            prompt=prompt, images=images, chat=self
         )
 
     def choose_candidate(self, index: int) -> ModelOutput:

diff --git a/tests/test_client_features.py b/tests/test_client_features.py
@@ -29,7 +29,7 @@ async def test_successful_request(self):
     @logger.catch(reraise=True)
     async def test_upload_image(self):
         response = await self.geminiclient.generate_content(
-            "Describe the image", image="assets/banner.png"
+            "Describe the image", images=["assets/banner.png"]
         )
         self.assertTrue(response.text)
         logger.debug(response.text)
@@ -59,11 +59,12 @@ async def test_retrieve_previous_conversation(self):
     async def test_chatsession_with_image(self):
         chat = self.geminiclient.start_chat()
         response1 = await chat.send_message(
-            "Describe the image", image="assets/banner.png"
+            "What's the difference between these two images?",
+            images=["assets/pic1.png", "assets/pic2.png"],
         )
         self.assertTrue(response1.text)
         logger.debug(response1.text)
-        response2 = await chat.send_message("Tell me more about it.")
+        response2 = await chat.send_message("Tell me more.")
         self.assertTrue(response2.text)
         logger.debug(response2.text)