diff --git a/autogen/agentchat/contrib/agent_eval/agent_eval.py b/autogen/agentchat/contrib/agent_eval/agent_eval.py
index d2a9384744b..060a1d1c972 100644
--- a/autogen/agentchat/contrib/agent_eval/agent_eval.py
+++ b/autogen/agentchat/contrib/agent_eval/agent_eval.py
@@ -53,7 +53,10 @@ def generate_criteria(
 
     critic_user.initiate_chat(critic_manager, message=task.sys_msg)
     criteria = critic_user.last_message()
-    criteria = Criterion.parse_json_str(criteria["content"])
+    content = criteria["content"]
+    # need to strip out any extra code around the returned json
+    content = content[content.find("{") : content.rfind("}") + 1]
+    criteria = Criterion.parse_json_str(content)
     return criteria
 
 
diff --git a/test/agentchat/contrib/agent_eval/test_agent_eval.py b/test/agentchat/contrib/agent_eval/test_agent_eval.py
new file mode 100644
index 00000000000..1c3e5672dae
--- /dev/null
+++ b/test/agentchat/contrib/agent_eval/test_agent_eval.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3 -m pytest
+
+import json
+
+import pytest
+from conftest import reason, skip_openai  # noqa: E402
+
+import autogen
+from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
+from autogen.agentchat.contrib.agent_eval.criterion import Criterion
+from autogen.agentchat.contrib.agent_eval.task import Task
+
+KEY_LOC = "notebook"
+OAI_CONFIG_LIST = "OAI_CONFIG_LIST"
+
+
+def remove_ground_truth(test_case):
+    test_details = json.loads(test_case)
+    # need to remove the ground truth from the test details
+    correctness = test_details.pop("is_correct", None)
+    test_details.pop("correct_ans", None)
+    test_details.pop("check_result", None)
+    return test_details, correctness
+
+
+if not skip_openai:
+    openai_config_list = autogen.config_list_from_json(
+        OAI_CONFIG_LIST,
+        file_location=KEY_LOC,
+        # These tests require at least gpt-3.5-turbo-1106 (newer versions are supported) or gpt-4-turbo-preview models.
+        # https://platform.openai.com/docs/models/overview
+        filter_dict={
+            "api_type": ["openai"],
+            "model": [
+                "gpt-4-turbo",
+                "gpt-4-turbo-preview",
+                "gpt-4-0125-preview",
+                "gpt-4-1106-preview",
+                "gpt-3.5-turbo",
+                "gpt-3.5-turbo-0125",
+                "gpt-3.5-turbo-1106",
+            ],
+        },
+    )
+
+    aoai_config_list = autogen.config_list_from_json(
+        OAI_CONFIG_LIST,
+        file_location=KEY_LOC,
+        filter_dict={"api_type": ["azure"]},
+    )
+
+    success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
+    response_successful = remove_ground_truth(success_str)[0]
+    failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
+    response_failed = remove_ground_truth(failed_str)[0]
+    task = Task.parse_json_str(
+        json.dumps(
+            {
+                "name": "Math problem solving",
+                "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
+                "successful_response": response_successful,
+                "failed_response": response_failed,
+            }
+        )
+    )
+
+
+@pytest.mark.skipif(
+    skip_openai,
+    reason=reason,
+)
+def test_generate_criteria():
+    criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
+    assert criteria
+    assert len(criteria) > 0
+    assert criteria[0].description
+    assert criteria[0].name
+    assert criteria[0].accepted_values
+
+
+@pytest.mark.skipif(
+    skip_openai,
+    reason=reason,
+)
+def test_quantify_criteria():
+    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
+    criteria = open(criteria_file, "r").read()
+    criteria = Criterion.parse_json_str(criteria)
+
+    test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
+    test_case, ground_truth = remove_ground_truth(test_case)
+
+    quantified = quantify_criteria(
+        llm_config={"config_list": aoai_config_list},
+        criteria=criteria,
+        task=task,
+        test_case=test_case,
+        ground_truth=ground_truth,
+    )
+    assert quantified
+    assert quantified["actual_success"]
+    assert quantified["estimated_performance"]
diff --git a/test/agentchat/contrib/agent_eval/test_criterion.py b/test/agentchat/contrib/agent_eval/test_criterion.py
new file mode 100644
index 00000000000..4bf3020fb55
--- /dev/null
+++ b/test/agentchat/contrib/agent_eval/test_criterion.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3 -m pytest
+
+from autogen.agentchat.contrib.agent_eval.criterion import Criterion
+
+
+def test_parse_json_str():
+    criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
+    criteria = open(criteria_file, "r").read()
+    criteria = Criterion.parse_json_str(criteria)
+    assert criteria
+    assert len(criteria) == 6
+    assert criteria[0].name == "Problem Interpretation"
+    assert criteria[0].description == "Ability to correctly interpret the problem."
+    assert len(criteria[0].accepted_values) == 5
+
+
+def test_write_json():
+    criteria1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
+    criteria2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
+    output = Criterion.write_json([criteria1, criteria2])
+    assert (
+        output
+        == """{
+    "test1": {
+        "description": "test1 description",
+        "accepted_values": [
+            "test1",
+            "test2"
+        ],
+        "sub_criteria": []
+    },
+    "test2": {
+        "description": "test2 description",
+        "accepted_values": [
+            "test1",
+            "test2"
+        ],
+        "sub_criteria": []
+    }
+}"""
+    )
+
+
+def test_write_parse_compatibility():
+    criterion1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
+    criterion2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
+    output = Criterion.write_json([criterion1, criterion2])
+    criteria = Criterion.parse_json_str(output)
+    assert criteria
+    assert len(criteria) == 2
+    assert criteria[0].name == "test1"
+    assert criteria[0].description == "test1 description"
+    assert len(criteria[0].accepted_values) == 2
+    assert criteria[1].name == "test2"
+    assert criteria[1].description == "test2 description"
+    assert len(criteria[1].accepted_values) == 2
diff --git a/test/agentchat/contrib/agent_eval/test_task.py b/test/agentchat/contrib/agent_eval/test_task.py
new file mode 100644
index 00000000000..e13831bed70
--- /dev/null
+++ b/test/agentchat/contrib/agent_eval/test_task.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3 -m pytest
+
+from autogen.agentchat.contrib.agent_eval.task import Task
+
+
+def test_parse_json_str():
+    task = Task.parse_json_str(
+        """{
+        "name": "Math problem solving",
+        "description": "Given any question, the system needs to solve the problem as concisely and accurately as possible",
+        "successful_response": {
+            "message": "The answer is 5",
+            "is_correct": true
+        },
+        "failed_response": {
+            "message": "I don't know the answer",
+            "is_correct": false
+        }
+    }"""
+    )
+    assert task
+    assert task.name == "Math problem solving"
+    assert (
+        task.description
+        == "Given any question, the system needs to solve the problem as concisely and accurately as possible"
+    )
+    assert task.successful_response == {"message": "The answer is 5", "is_correct": True}
+    assert task.failed_response == {"message": "I don't know the answer", "is_correct": False}
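For reviewers, a minimal standalone sketch (not part of the patch) of the JSON-stripping step that the `generate_criteria` change introduces: the model's reply is sliced down to the span between the first `{` and the last `}` before it is handed to `Criterion.parse_json_str`. The helper name `extract_json_block` and the sample reply below are hypothetical.

```python
# Illustrative sketch of the slicing logic added to generate_criteria:
# keep only the text between the first "{" and the last "}" so that any
# commentary or code fences around the model's JSON reply are dropped.
# The helper name and sample reply are hypothetical.


def extract_json_block(content: str) -> str:
    start = content.find("{")
    end = content.rfind("}")
    if start == -1 or end < start:
        # illustrative guard; the patch itself just slices without checking
        raise ValueError("no JSON object found in the reply")
    return content[start : end + 1]


reply = 'Sure, here are the criteria: {"accuracy": {"description": "...", "accepted_values": ["1", "2"]}} Hope this helps!'
print(extract_json_block(reply))
# -> {"accuracy": {"description": "...", "accepted_values": ["1", "2"]}}
```

The guard clause is an illustrative addition; the patch itself only slices, so a brace-free reply would reach `Criterion.parse_json_str` as an empty string.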