From ccf74176f5f0b228792a727b6f8ef49c9440a251 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Thu, 15 Jun 2023 17:58:44 -0700 Subject: [PATCH] update openai model support (#1082) * update openai model support * new gpt3.5 * docstr * function_call and content may co-exist * test function call --------- Co-authored-by: Qingyun Wu --- flaml/autogen/agent/assistant_agent.py | 1 + flaml/autogen/oai/completion.py | 44 ++++++++++++---- setup.py | 4 +- test/autogen/test_assistant_agent.py | 4 +- test/autogen/test_function_call.py | 63 +++++++++++++++++++++++ website/docs/Use-Cases/Auto-Generation.md | 4 +- 6 files changed, 105 insertions(+), 15 deletions(-) create mode 100644 test/autogen/test_function_call.py diff --git a/flaml/autogen/agent/assistant_agent.py b/flaml/autogen/agent/assistant_agent.py index 09e0ae0761c..0381cfdb97e 100644 --- a/flaml/autogen/agent/assistant_agent.py +++ b/flaml/autogen/agent/assistant_agent.py @@ -39,6 +39,7 @@ def receive(self, message, sender): self._conversations[sender.name] = [{"content": self._system_message, "role": "system"}] super().receive(message, sender) responses = oai.ChatCompletion.create(messages=self._conversations[sender.name], **self._config) + # TODO: handle function_call response = oai.ChatCompletion.extract_text(responses)[0] self._send(response, sender) diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py index e7b4a503198..d4375598239 100644 --- a/flaml/autogen/oai/completion.py +++ b/flaml/autogen/oai/completion.py @@ -45,12 +45,16 @@ class Completion(openai_Completion): # set of models that support chat completion chat_models = { "gpt-3.5-turbo", - "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0301", # deprecate in Sep + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-16k", "gpt-35-turbo", "gpt-4", "gpt-4-32k", - "gpt-4-32k-0314", - "gpt-4-0314", + "gpt-4-32k-0314", # deprecate in Sep + "gpt-4-0314", # deprecate in Sep + "gpt-4-0613", + "gpt-4-32k-0613", } # price per 1k tokens @@ -62,13 +66,17 @@ class Completion(openai_Completion): "code-davinci-002": 0.1, "text-davinci-002": 0.02, "text-davinci-003": 0.02, - "gpt-3.5-turbo": 0.002, - "gpt-3.5-turbo-0301": 0.002, + "gpt-3.5-turbo": (0.0015, 0.002), + "gpt-3.5-turbo-0301": (0.0015, 0.002), # deprecate in Sep + "gpt-3.5-turbo-0613": (0.0015, 0.002), + "gpt-3.5-turbo-16k": (0.003, 0.004), "gpt-35-turbo": 0.002, "gpt-4": (0.03, 0.06), - "gpt-4-0314": (0.03, 0.06), "gpt-4-32k": (0.06, 0.12), - "gpt-4-32k-0314": (0.06, 0.12), + "gpt-4-0314": (0.03, 0.06), # deprecate in Sep + "gpt-4-32k-0314": (0.06, 0.12), # deprecate in Sep + "gpt-4-0613": (0.03, 0.06), + "gpt-4-32k-0613": (0.06, 0.12), } default_search_space = { @@ -386,7 +394,7 @@ def _eval(cls, config: dict, prune=True, eval_only=False): result["cost"] = cost return result # evaluate the quality of the responses - responses = cls.extract_text(response) + responses = cls.extract_text_or_function_call(response) usage = response["usage"] n_input_tokens = usage["prompt_tokens"] n_output_tokens = usage.get("completion_tokens", 0) @@ -898,7 +906,7 @@ def eval_func(responses, **data): response = cls.create(data_i, use_cache, **config) cost += response["cost"] # evaluate the quality of the responses - responses = cls.extract_text(response) + responses = cls.extract_text_or_function_call(response) if eval_func is not None: metrics = eval_func(responses, **data_i) elif hasattr(cls, "_eval_func"): @@ -991,6 +999,24 @@ def extract_text(cls, response: dict) -> List[str]: return [choice["text"] for choice in choices] return 
[choice["message"].get("content", "") for choice in choices] + @classmethod + def extract_text_or_function_call(cls, response: dict) -> List[str]: + """Extract the text or function calls from a completion or chat response. + + Args: + response (dict): The response from OpenAI API. + + Returns: + A list of text or function calls in the responses. + """ + choices = response["choices"] + if "text" in choices[0]: + return [choice["text"] for choice in choices] + return [ + choice["message"] if "function_call" in choice["message"] else choice["message"].get("content", "") + for choice in choices + ] + @classmethod @property def logged_history(cls) -> Dict: diff --git a/setup.py b/setup.py index 33c86047754..c29272b3859 100644 --- a/setup.py +++ b/setup.py @@ -127,8 +127,8 @@ "pytorch-forecasting>=0.9.0", ], "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"], - "openai": ["openai==0.27.4", "diskcache"], - "autogen": ["openai==0.27.4", "diskcache", "docker"], + "openai": ["openai==0.27.8", "diskcache"], + "autogen": ["openai==0.27.8", "diskcache", "docker"], "synapse": [ "joblibspark>=0.5.0", "optuna==2.8.0", diff --git a/test/autogen/test_assistant_agent.py b/test/autogen/test_assistant_agent.py index c5230930b30..46bacb88498 100644 --- a/test/autogen/test_assistant_agent.py +++ b/test/autogen/test_assistant_agent.py @@ -11,10 +11,10 @@ def test_gpt35(human_input_mode="NEVER", max_consecutive_auto_reply=5): import openai except ImportError: return - config_list = oai.config_list_from_models(key_file_path=KEY_LOC, model_list=["gpt-3.5-turbo"]) + config_list = oai.config_list_from_models(key_file_path=KEY_LOC, model_list=["gpt-3.5-turbo-0613"]) assistant = AssistantAgent( "coding_agent", - request_timeout=600, + # request_timeout=600, seed=40, max_tokens=1024, config_list=config_list, diff --git a/test/autogen/test_function_call.py b/test/autogen/test_function_call.py new file mode 100644 index 00000000000..9a86ae9414e --- /dev/null +++ b/test/autogen/test_function_call.py @@ -0,0 +1,63 @@ +try: + import openai +except ImportError: + openai = None +import pytest +import json +from flaml import oai +from flaml.autogen.math_utils import eval_math_responses + +KEY_LOC = "test/autogen" + + +@pytest.mark.skipif(openai is None, reason="openai not installed") +def test_eval_math_responses(): + config_list = oai.config_list_openai_aoai(KEY_LOC, exclude="aoai") + functions = [ + { + "name": "eval_math_responses", + "description": "Select a response for a math problem using voting, and check if the response is correct if the solution is provided", + "parameters": { + "type": "object", + "properties": { + "responses": { + "type": "string", + "description": "The responses in a list", + }, + "solution": { + "type": "string", + "description": "The canonical solution", + }, + }, + "required": ["responses"], + }, + }, + ] + response = oai.ChatCompletion.create( + model="gpt-3.5-turbo-0613", + config_list=config_list, + messages=[ + { + "role": "user", + "content": 'evaluate the math responses ["1", "5/2", "5/2"] against the true answer \\frac{5}{2}', + }, + ], + functions=functions, + ) + print(response) + responses = oai.ChatCompletion.extract_text_or_function_call(response) + print(responses[0]) + function_call = responses[0]["function_call"] + name, arguments = function_call["name"], json.loads(function_call["arguments"]) + assert name == "eval_math_responses" + print(arguments["responses"]) + if isinstance(arguments["responses"], str): + arguments["responses"] = 
+    arguments["responses"] = [f"\\boxed{{{x}}}" for x in arguments["responses"]]
+    print(arguments["responses"])
+    arguments["solution"] = f"\\boxed{{{arguments.get('solution', '')}}}"  # "solution" is optional, so avoid a KeyError if the model omits it
+    print(eval_math_responses(**arguments))
+
+
+if __name__ == "__main__":
+    test_eval_math_responses()
diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md
index 4838668def4..9f00f1e13d1 100644
--- a/website/docs/Use-Cases/Auto-Generation.md
+++ b/website/docs/Use-Cases/Auto-Generation.md
@@ -371,14 +371,14 @@ Set `compact=False` in `start_logging()` to switch.
 },
 }
 ```
-It can be seen that the individual API call history contain redundant information of the conversation. For a long conversation the degree of redundancy is high.
+It can be seen that the individual API call history contains redundant information of the conversation. For a long conversation the degree of redundancy is high.
 The compact history is more efficient and the individual API call history contains more details.
 
 ### Other Utilities
 
 - a [`cost`](../reference/autogen/oai/completion#cost) function to calculate the cost of an API call.
 - a [`test`](../reference/autogen/oai/completion#test) function to conveniently evaluate the configuration over test data.
-- a [`extract_text`](../reference/autogen/oai/completion#extract_text) function to extract the text from a completion or chat response.
+- an [`extract_text_or_function_call`](../reference/autogen/oai/completion#extract_text_or_function_call) function to extract the text or function call from a completion or chat response.
 
 ## Agents (Experimental)
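
Note (usage sketch, not part of the patch): the new `extract_text_or_function_call` classmethod returns a heterogeneous list. For each choice it yields the plain text content for an ordinary reply, but the whole message dict when the model issues a `function_call`. Below is a minimal, self-contained sketch of how a caller might branch on the two cases; the `dispatch` helper and the mock `response` payload are hypothetical illustrations, not code from this PR:

```python
import json


def dispatch(extracted):
    """Route each item returned by extract_text_or_function_call.

    Plain strings are ordinary text replies; dicts carry a "function_call"
    whose "arguments" field is a JSON-encoded string.
    """
    results = []
    for item in extracted:
        if isinstance(item, dict) and "function_call" in item:
            # The model asked to call a function; decode its arguments.
            call = item["function_call"]
            results.append((call["name"], json.loads(call["arguments"])))
        else:
            # Ordinary text reply: pass it through unchanged.
            results.append(item)
    return results


# A mock chat response in the OpenAI response format (illustrative payload only).
response = {
    "choices": [
        {
            "message": {
                "role": "assistant",
                "content": None,
                "function_call": {
                    "name": "eval_math_responses",
                    "arguments": '{"responses": ["1", "5/2", "5/2"]}',
                },
            }
        }
    ]
}

# The same selection logic the new classmethod applies to a chat response:
extracted = [
    choice["message"] if "function_call" in choice["message"] else choice["message"].get("content", "")
    for choice in response["choices"]
]
print(dispatch(extracted))
# [('eval_math_responses', {'responses': ['1', '5/2', '5/2']})]
```

Returning the full message rather than just the `function_call` field keeps the role and any text content available to the caller, which matches the commit note that `function_call` and `content` may co-exist.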