diff --git a/flaml/autogen/agent/assistant_agent.py b/flaml/autogen/agent/assistant_agent.py
index 09e0ae0761c..0381cfdb97e 100644
--- a/flaml/autogen/agent/assistant_agent.py
+++ b/flaml/autogen/agent/assistant_agent.py
@@ -39,6 +39,7 @@ def receive(self, message, sender):
             self._conversations[sender.name] = [{"content": self._system_message, "role": "system"}]
         super().receive(message, sender)
         responses = oai.ChatCompletion.create(messages=self._conversations[sender.name], **self._config)
+        # TODO: handle function_call
         response = oai.ChatCompletion.extract_text(responses)[0]
         self._send(response, sender)
 
diff --git a/flaml/autogen/oai/completion.py b/flaml/autogen/oai/completion.py
index e7b4a503198..d4375598239 100644
--- a/flaml/autogen/oai/completion.py
+++ b/flaml/autogen/oai/completion.py
@@ -45,12 +45,16 @@ class Completion(openai_Completion):
     # set of models that support chat completion
     chat_models = {
         "gpt-3.5-turbo",
-        "gpt-3.5-turbo-0301",
+        "gpt-3.5-turbo-0301",  # deprecate in Sep
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-16k",
         "gpt-35-turbo",
         "gpt-4",
         "gpt-4-32k",
-        "gpt-4-32k-0314",
-        "gpt-4-0314",
+        "gpt-4-32k-0314",  # deprecate in Sep
+        "gpt-4-0314",  # deprecate in Sep
+        "gpt-4-0613",
+        "gpt-4-32k-0613",
     }
 
     # price per 1k tokens
@@ -62,13 +66,17 @@ class Completion(openai_Completion):
         "code-davinci-002": 0.1,
         "text-davinci-002": 0.02,
         "text-davinci-003": 0.02,
-        "gpt-3.5-turbo": 0.002,
-        "gpt-3.5-turbo-0301": 0.002,
+        "gpt-3.5-turbo": (0.0015, 0.002),
+        "gpt-3.5-turbo-0301": (0.0015, 0.002),  # deprecate in Sep
+        "gpt-3.5-turbo-0613": (0.0015, 0.002),
+        "gpt-3.5-turbo-16k": (0.003, 0.004),
         "gpt-35-turbo": 0.002,
         "gpt-4": (0.03, 0.06),
-        "gpt-4-0314": (0.03, 0.06),
         "gpt-4-32k": (0.06, 0.12),
-        "gpt-4-32k-0314": (0.06, 0.12),
+        "gpt-4-0314": (0.03, 0.06),  # deprecate in Sep
+        "gpt-4-32k-0314": (0.06, 0.12),  # deprecate in Sep
+        "gpt-4-0613": (0.03, 0.06),
+        "gpt-4-32k-0613": (0.06, 0.12),
     }
 
     default_search_space = {
@@ -386,7 +394,7 @@ def _eval(cls, config: dict, prune=True, eval_only=False):
                 result["cost"] = cost
                 return result
             # evaluate the quality of the responses
-            responses = cls.extract_text(response)
+            responses = cls.extract_text_or_function_call(response)
             usage = response["usage"]
             n_input_tokens = usage["prompt_tokens"]
             n_output_tokens = usage.get("completion_tokens", 0)
@@ -898,7 +906,7 @@ def eval_func(responses, **data):
             response = cls.create(data_i, use_cache, **config)
             cost += response["cost"]
             # evaluate the quality of the responses
-            responses = cls.extract_text(response)
+            responses = cls.extract_text_or_function_call(response)
             if eval_func is not None:
                 metrics = eval_func(responses, **data_i)
             elif hasattr(cls, "_eval_func"):
@@ -991,6 +999,24 @@ def extract_text(cls, response: dict) -> List[str]:
             return [choice["text"] for choice in choices]
         return [choice["message"].get("content", "") for choice in choices]
 
+    @classmethod
+    def extract_text_or_function_call(cls, response: dict) -> List[str]:
+        """Extract the text or function calls from a completion or chat response.
+
+        Args:
+            response (dict): The response from OpenAI API.
+
+        Returns:
+            A list of text or function calls in the responses.
+        """
+        choices = response["choices"]
+        if "text" in choices[0]:
+            return [choice["text"] for choice in choices]
+        return [
+            choice["message"] if "function_call" in choice["message"] else choice["message"].get("content", "")
+            for choice in choices
+        ]
+
     @classmethod
     @property
     def logged_history(cls) -> Dict:
diff --git a/setup.py b/setup.py
index 33c86047754..c29272b3859 100644
--- a/setup.py
+++ b/setup.py
@@ -127,8 +127,8 @@
             "pytorch-forecasting>=0.9.0",
         ],
         "benchmark": ["catboost>=0.26", "psutil==5.8.0", "xgboost==1.3.3", "pandas==1.1.4"],
-        "openai": ["openai==0.27.4", "diskcache"],
-        "autogen": ["openai==0.27.4", "diskcache", "docker"],
+        "openai": ["openai==0.27.8", "diskcache"],
+        "autogen": ["openai==0.27.8", "diskcache", "docker"],
         "synapse": [
             "joblibspark>=0.5.0",
             "optuna==2.8.0",
diff --git a/test/autogen/test_assistant_agent.py b/test/autogen/test_assistant_agent.py
index c5230930b30..46bacb88498 100644
--- a/test/autogen/test_assistant_agent.py
+++ b/test/autogen/test_assistant_agent.py
@@ -11,10 +11,10 @@ def test_gpt35(human_input_mode="NEVER", max_consecutive_auto_reply=5):
         import openai
     except ImportError:
         return
-    config_list = oai.config_list_from_models(key_file_path=KEY_LOC, model_list=["gpt-3.5-turbo"])
+    config_list = oai.config_list_from_models(key_file_path=KEY_LOC, model_list=["gpt-3.5-turbo-0613"])
     assistant = AssistantAgent(
         "coding_agent",
-        request_timeout=600,
+        # request_timeout=600,
         seed=40,
         max_tokens=1024,
         config_list=config_list,
diff --git a/test/autogen/test_function_call.py b/test/autogen/test_function_call.py
new file mode 100644
index 00000000000..9a86ae9414e
--- /dev/null
+++ b/test/autogen/test_function_call.py
@@ -0,0 +1,63 @@
+try:
+    import openai
+except ImportError:
+    openai = None
+import pytest
+import json
+from flaml import oai
+from flaml.autogen.math_utils import eval_math_responses
+
+KEY_LOC = "test/autogen"
+
+
+@pytest.mark.skipif(openai is None, reason="openai not installed")
+def test_eval_math_responses():
+    config_list = oai.config_list_openai_aoai(KEY_LOC, exclude="aoai")
+    functions = [
+        {
+            "name": "eval_math_responses",
+            "description": "Select a response for a math problem using voting, and check if the response is correct if the solution is provided",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "responses": {
+                        "type": "string",
+                        "description": "The responses in a list",
+                    },
+                    "solution": {
+                        "type": "string",
+                        "description": "The canonical solution",
+                    },
+                },
+                "required": ["responses"],
+            },
+        },
+    ]
+    response = oai.ChatCompletion.create(
+        model="gpt-3.5-turbo-0613",
+        config_list=config_list,
+        messages=[
+            {
+                "role": "user",
+                "content": 'evaluate the math responses ["1", "5/2", "5/2"] against the true answer \\frac{5}{2}',
+            },
+        ],
+        functions=functions,
+    )
+    print(response)
+    responses = oai.ChatCompletion.extract_text_or_function_call(response)
+    print(responses[0])
+    function_call = responses[0]["function_call"]
+    name, arguments = function_call["name"], json.loads(function_call["arguments"])
+    assert name == "eval_math_responses"
+    print(arguments["responses"])
+    if isinstance(arguments["responses"], str):
+        arguments["responses"] = json.loads(arguments["responses"])
+    arguments["responses"] = [f"\\boxed{{{x}}}" for x in arguments["responses"]]
+    print(arguments["responses"])
+    arguments["solution"] = f"\\boxed{{{arguments['solution']}}}"
+    print(eval_math_responses(**arguments))
+
+
+if __name__ == "__main__":
+    test_eval_math_responses()
diff --git a/website/docs/Use-Cases/Auto-Generation.md b/website/docs/Use-Cases/Auto-Generation.md
index 4838668def4..9f00f1e13d1 100644
--- a/website/docs/Use-Cases/Auto-Generation.md
+++ b/website/docs/Use-Cases/Auto-Generation.md
@@ -371,14 +371,14 @@ Set `compact=False` in `start_logging()` to switch.
   },
 }
 ```
-It can be seen that the individual API call history contain redundant information of the conversation. For a long conversation the degree of redundancy is high.
+It can be seen that the individual API call history contains redundant information of the conversation. For a long conversation the degree of redundancy is high.
 The compact history is more efficient and the individual API call history contains more details.
 
 ### Other Utilities
 
 - a [`cost`](../reference/autogen/oai/completion#cost) function to calculate the cost of an API call.
 - a [`test`](../reference/autogen/oai/completion#test) function to conveniently evaluate the configuration over test data.
-- a [`extract_text`](../reference/autogen/oai/completion#extract_text) function to extract the text from a completion or chat response.
+- an [`extract_text_or_function_call`](../reference/autogen/oai/completion#extract_text_or_function_call) function to extract the text or function call from a completion or chat response.
 
 ## Agents (Experimental)
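For context, the `extract_text_or_function_call` helper added above returns, per choice, either a plain string (when the model replied with text) or the whole `message` dict (when it replied with a `function_call`), as exercised in `test_function_call.py`. Below is a minimal sketch of how a caller might branch on the two cases; `handle_responses` is a hypothetical helper for illustration, not part of this diff.

```python
import json

from flaml import oai


def handle_responses(response):
    """Hypothetical consumer of extract_text_or_function_call output.

    `response` is assumed to come from oai.ChatCompletion.create(
    messages=..., functions=..., config_list=...).
    """
    for item in oai.ChatCompletion.extract_text_or_function_call(response):
        if isinstance(item, str):
            # The model answered with ordinary text content.
            print("text reply:", item)
        else:
            # The model chose a function call; the whole message dict is returned.
            call = item["function_call"]
            name = call["name"]
            arguments = json.loads(call["arguments"])
            print(f"function call: {name}({arguments})")
```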