
Commit

DninoAdnane committed Nov 29, 2023
2 parents e9efff4 + c5f241b commit a579f23
Showing 6 changed files with 118 additions and 9 deletions.
2 changes: 2 additions & 0 deletions AVAILABLES_FEATURES_AND_PROVIDERS.md
@@ -170,6 +170,7 @@
|----------|-------------|
| **embeddings** | alephalpha |
| **question_answer** | alephalpha |
| | openai |
| **explicit_content** | amazon |
| | api4ai |
| | clarifai |
@@ -711,6 +712,7 @@
| **audio** | speech_to_text_async |
| | text_to_speech |
| **image** | generation |
| | question_answer |
| **text** | anonymization |
| | chat |
| | code_generation |
9 changes: 9 additions & 0 deletions edenai_apis/apis/openai/info.json
@@ -380,6 +380,15 @@
"default_model": "dall-e-2"
},
"version": "v1Beta"
},
"question_answer": {
"constraints": {
"models": [
"gpt-4-vision-preview"
],
"default_model": "gpt-4-vision-preview"
},
"version": "v1"
}
},
"audio": {
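As a rough illustration (not the actual edenai_apis loader, whose logic lives elsewhere), a constraints block like the one added above could be used to fall back on the declared default model when the caller passes none; the path and helper name below are hypothetical:

import json
from typing import Optional

def resolve_model(info_path: str, feature: str, subfeature: str, model: Optional[str]) -> str:
    # Hypothetical helper: read the provider's info.json and pick the requested
    # model, or the declared default_model when none is given.
    with open(info_path, encoding="utf-8") as f:
        info = json.load(f)
    constraints = info[feature][subfeature]["constraints"]
    chosen = model or constraints["default_model"]
    if chosen not in constraints["models"]:
        raise ValueError(f"{chosen} is not available for {feature}/{subfeature}")
    return chosen

# resolve_model("edenai_apis/apis/openai/info.json", "image", "question_answer", None)
# -> "gpt-4-vision-preview"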
74 changes: 70 additions & 4 deletions edenai_apis/apis/openai/openai_image_api.py
@@ -1,5 +1,6 @@
import base64
from io import BytesIO
from json import JSONDecodeError
from typing import Sequence, Literal, Optional

import requests
@@ -14,6 +15,8 @@
from .helpers import (
get_openapi_response,
)
from ...features.image.question_answer import QuestionAnswerDataClass
from ...utils.exception import ProviderException


class OpenaiImageApi(ImageInterface):
@@ -22,7 +25,7 @@ def image__generation(
text: str,
resolution: Literal["256x256", "512x512", "1024x1024"],
num_images: int = 1,
model: Optional[str] = None
model: Optional[str] = None,
) -> ResponseType[ImageGenerationDataClass]:
url = f"{self.url}/images/generations"
payload = {
@@ -32,9 +35,7 @@ def image__generation(
"size": resolution,
"response_format": "b64_json",
}
response = requests.post(
url, json=payload, headers=self.headers
)
response = requests.post(url, json=payload, headers=self.headers)
original_response = get_openapi_response(response)

generations: Sequence[GeneratedImageDataClass] = []
@@ -54,3 +55,68 @@ def image__generation(
original_response=original_response,
standardized_response=ImageGenerationDataClass(items=generations),
)

def image__question_answer(
self,
file: str,
temperature: float,
max_tokens: int,
file_url: str = "",
model: Optional[str] = None,
question: Optional[str] = None,
) -> ResponseType[QuestionAnswerDataClass]:
with open(file, "rb") as fstream:
file_content = fstream.read()
file_b64 = base64.b64encode(file_content).decode("utf-8")

url = f"{self.url}/chat/completions"
payload = {
"model": "gpt-4-vision-preview" or model,
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": question or "Describe the following image",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{file_b64}"
},
},
],
},
],
"max_tokens": max_tokens,
"temperature": temperature,
}

response = requests.post(url, json=payload, headers=self.headers)

if response.status_code >= 500:
raise ProviderException(
f"OpenAI API is not available. Status code: {response.status_code}"
)

if response.status_code != 200:
raise ProviderException(
message=response.text, code=response.status_code
)

try:
original_response = response.json()
except JSONDecodeError as exc:
raise ProviderException(
message="Invalid JSON response", code=response.status_code
) from exc

standardized_response = QuestionAnswerDataClass(
answers=[original_response["choices"][0]["message"]["content"]]
)

return ResponseType[QuestionAnswerDataClass](
original_response=original_response,
standardized_response=standardized_response,
)
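For reference, a minimal standalone sketch of the same request outside the provider class, assuming an OPENAI_API_KEY environment variable and a local image.png; the endpoint and payload shape mirror the method above:

import base64
import os

import requests

api_key = os.environ["OPENAI_API_KEY"]  # assumed credential source
with open("image.png", "rb") as fstream:
    file_b64 = base64.b64encode(fstream.read()).decode("utf-8")

payload = {
    "model": "gpt-4-vision-preview",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What are the logos on the image ?"},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{file_b64}"},
                },
            ],
        }
    ],
    "max_tokens": 64,
    "temperature": 0.2,
}
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    json=payload,
    headers={"Authorization": f"Bearer {api_key}"},
)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])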
30 changes: 30 additions & 0 deletions edenai_apis/apis/openai/outputs/image/question_answer_output.json
@@ -0,0 +1,30 @@
{
"original_response": {
"id": "chatcmpl-8QBuI35InuUHU0ZWIxncpk6auVn8v",
"object": "chat.completion",
"created": 1701252990,
"model": "gpt-4-1106-vision-preview",
"usage": {
"prompt_tokens": 1120,
"completion_tokens": 64,
"total_tokens": 1184
},
"choices": [
{
"message": {
"role": "assistant",
"content": "The image contains a combination of logos from four major technology companies, often referred to collectively as \"FAANG\" (Facebook, Apple, Amazon, Netflix, Google), but without the Netflix logo. From left to right, the logos are:\n\n1. Facebook: The blue square with a lowercase 'f' is the logo"
},
"finish_details": {
"type": "max_tokens"
},
"index": 0
}
]
},
"standardized_response": {
"answers": [
"The image contains a combination of logos from four major technology companies, often referred to collectively as \"FAANG\" (Facebook, Apple, Amazon, Netflix, Google), but without the Netflix logo. From left to right, the logos are:\n\n1. Facebook: The blue square with a lowercase 'f' is the logo"
]
}
}
8 changes: 5 additions & 3 deletions edenai_apis/features/image/image_interface.py
@@ -172,9 +172,11 @@ def image__question_answer(
Args:
file (BufferedReader): image to analyze
question (str): your query
maximum_tokens (int): maximum number of tokens to be generated
model (str): which ai model to use, default to 'None'
file_url (str, optional): url of the image to analyze
temperature (float): sampling temperature used when generating the answer
max_tokens (int): maximum number of tokens to be generated
question (str, optional): question to ask; if `None`, a description of the image is requested instead
model (str, optional): which AI model to use, defaults to `None`
"""
raise NotImplementedError

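A hypothetical call matching the documented signature, assuming openai_image_api is an already-configured OpenAI provider instance (credential setup is outside this diff):

# Hypothetical usage of the new image/question_answer subfeature; the provider
# instance is assumed to already hold valid OpenAI credentials.
result = openai_image_api.image__question_answer(
    file="logos.png",                      # local image path (assumed)
    temperature=0.0,
    max_tokens=64,
    question="What are the logos on the image ?",
    model="gpt-4-vision-preview",
)
print(result.standardized_response.answers[0])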
@@ -1,7 +1,7 @@
# pylint: disable=locally-disabled, line-too-long
import mimetypes
import os
from typing import Dict
from typing import Dict, Any

from pydub.utils import mediainfo

@@ -24,7 +24,7 @@
file_wrapper = FileWrapper(image_path, "", file_info)


def question_answer_arguments(provider_name: str) -> Dict:
def question_answer_arguments(provider_name: str) -> Dict[str, Any]:
return {
"file": file_wrapper,
"question": "What are the logos on the image ?",
