adding basic agenteval tests and fixing criteria parsing bug
jluey1 committed Apr 30, 2024
1 parent be35fbc commit e8d1f59
Showing 4 changed files with 190 additions and 1 deletion.
5 changes: 4 additions & 1 deletion autogen/agentchat/contrib/agent_eval/agent_eval.py
@@ -53,7 +53,10 @@ def generate_criteria(

critic_user.initiate_chat(critic_manager, message=task.sys_msg)
criteria = critic_user.last_message()
criteria = Criterion.parse_json_str(criteria["content"])
content = criteria["content"]
# strip out any extra text (e.g., markdown code fences) that the model wrapped around the returned JSON
content = content[content.find("{") : content.rfind("}") + 1]
criteria = Criterion.parse_json_str(content)
return criteria
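
The slicing above makes the parser tolerant of critic replies that wrap the criteria JSON in extra prose or code fences. A minimal standalone sketch of the same idea, using a hypothetical helper name and sample reply for illustration (neither is part of this commit):

import json

def extract_json_block(content: str) -> str:
    # Keep only the substring from the first "{" to the last "}",
    # dropping any surrounding prose or code fences the model added.
    return content[content.find("{") : content.rfind("}") + 1]

reply = 'Here are the criteria:\n{"Accuracy": {"description": "Correctness of the answer", "accepted_values": ["good", "bad"]}}\nLet me know if you need changes.'
parsed = json.loads(extract_json_block(reply))
print(parsed["Accuracy"]["accepted_values"])  # -> ['good', 'bad']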


102 changes: 102 additions & 0 deletions test/agentchat/contrib/agent_eval/test_agent_eval.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python3 -m pytest

import json

import pytest
from conftest import reason, skip_openai # noqa: E402

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task

KEY_LOC = "notebook"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"


def remove_ground_truth(test_case):
test_details = json.loads(test_case)
# remove the ground truth fields from the test details before they are passed to the quantifier
correctness = test_details.pop("is_correct", None)
test_details.pop("correct_ans", None)
test_details.pop("check_result", None)
return test_details, correctness
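# Illustrative call (the field values below are assumptions, not taken from the sample files):
#   remove_ground_truth('{"problem": "2 + 3", "response": "5", "is_correct": true}')
#   -> ({"problem": "2 + 3", "response": "5"}, True)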


if not skip_openai:
openai_config_list = autogen.config_list_from_json(
OAI_CONFIG_LIST,
file_location=KEY_LOC,
# The Retrieval tool requires at least gpt-3.5-turbo-1106 (newer versions are supported) or gpt-4-turbo-preview models.
# https://platform.openai.com/docs/models/overview
filter_dict={
"api_type": ["openai"],
"model": [
"gpt-4-turbo",
"gpt-4-turbo-preview",
"gpt-4-0125-preview",
"gpt-4-1106-preview",
"gpt-3.5-turbo",
"gpt-3.5-turbo-0125",
"gpt-3.5-turbo-1106",
],
},
)

aoai_config_list = autogen.config_list_from_json(
OAI_CONFIG_LIST,
file_location=KEY_LOC,
filter_dict={"api_type": ["azure"]},
)

success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
response_successful = remove_ground_truth(success_str)[0]
failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
response_failed = remove_ground_truth(failed_str)[0]
task = Task.parse_json_str(
json.dumps(
{
"name": "Math problem solving",
"description": "Given any question, the system needs to solve the problem as consisely and accurately as possible",
"successful_response": response_successful,
"failed_response": response_failed,
}
)
)


@pytest.mark.skipif(
skip_openai,
reason=reason,
)
def test_generate_criteria():
criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
assert criteria
assert len(criteria) > 0
assert criteria[0].description
assert criteria[0].name
assert criteria[0].accepted_values


@pytest.mark.skipif(
skip_openai,
reason=reason,
)
def test_quantify_criteria():
criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
criteria = open(criteria_file, "r").read()
criteria = Criterion.parse_json_str(criteria)

test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
test_case, ground_truth = remove_ground_truth(test_case)

quantified = quantify_criteria(
llm_config={"config_list": aoai_config_list},
criteria=criteria,
task=task,
test_case=test_case,
ground_truth=ground_truth,
)
assert quantified
assert quantified["actual_success"]
assert quantified["estimated_performance"]
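
The assertions above only check that the quantifier returned both keys. As a rough illustration of how a caller might consume that result, assuming estimated_performance holds a JSON string mapping each criterion name to its assessed value (an assumption this commit does not verify), a hedged sketch:

import json

# Hypothetical quantifier output; the structure of estimated_performance is assumed.
quantified = {
    "actual_success": True,
    "estimated_performance": '{"Problem Interpretation": "completely off", "Efficiency": "moderately efficient"}',
}

performance = json.loads(quantified["estimated_performance"])
for criterion_name, assessment in performance.items():
    print(f"{criterion_name}: {assessment}")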
56 changes: 56 additions & 0 deletions test/agentchat/contrib/agent_eval/test_criterion.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3 -m pytest

from autogen.agentchat.contrib.agent_eval.criterion import Criterion


def test_parse_json_str():
criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
criteria = open(criteria_file, "r").read()
criteria = Criterion.parse_json_str(criteria)
assert criteria
assert len(criteria) == 6
assert criteria[0].name == "Problem Interpretation"
assert criteria[0].description == "Ability to correctly interpret the problem."
assert len(criteria[0].accepted_values) == 5


def test_write_json():
criteria1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
criteria2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
output = Criterion.write_json([criteria1, criteria2])
assert (
output
== """{
"test1": {
"description": "test1 description",
"accepted_values": [
"test1",
"test2"
],
"sub_criteria": []
},
"test2": {
"description": "test2 description",
"accepted_values": [
"test1",
"test2"
],
"sub_criteria": []
}
}"""
)


def test_write_parse_compatibility():
criterion1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
criterion2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
output = Criterion.write_json([criterion1, criterion2])
criteria = Criterion.parse_json_str(output)
assert criteria
assert len(criteria) == 2
assert criteria[0].name == "test1"
assert criteria[0].description == "test1 description"
assert len(criteria[0].accepted_values) == 2
assert criteria[1].name == "test2"
assert criteria[1].description == "test2 description"
assert len(criteria[1].accepted_values) == 2
28 changes: 28 additions & 0 deletions test/agentchat/contrib/agent_eval/test_task.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python3 -m pytest

from autogen.agentchat.contrib.agent_eval.task import Task


def test_parse_json_str():
task = Task.parse_json_str(
"""{
"name": "Math problem solving",
"description": "Given any question, the system needs to solve the problem as consisely and accurately as possible",
"successful_response": {
"message": "The answer is 5",
"is_correct": true
},
"failed_response": {
"message": "I don't know the answer",
"is_correct": false
}
}"""
)
assert task
assert task.name == "Math problem solving"
assert (
task.description
== "Given any question, the system needs to solve the problem as consisely and accurately as possible"
)
assert task.successful_response == {"message": "The answer is 5", "is_correct": True}
assert task.failed_response == {"message": "I don't know the answer", "is_correct": False}
