adding basic agenteval tests and fixing criteria parsing bug
jluey1 committed Apr 30, 2024
1 parent be35fbc commit e8d1f59
Showing 4 changed files with 190 additions and 1 deletion.
5 changes: 4 additions & 1 deletion autogen/agentchat/contrib/agent_eval/agent_eval.py
@@ -53,7 +53,10 @@ def generate_criteria(

critic_user.initiate_chat(critic_manager, message=task.sys_msg)
criteria = critic_user.last_message()
criteria = Criterion.parse_json_str(criteria["content"])
content = criteria["content"]
# strip out any extra text (e.g., markdown code fences) that the model wrapped around the returned JSON
content = content[content.find("{") : content.rfind("}") + 1]
criteria = Criterion.parse_json_str(content)
return criteria
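
The slicing above makes the parser tolerant of critic replies that wrap the criteria JSON in extra prose or code fences. A minimal standalone sketch of the same idea, using a hypothetical helper name and sample reply for illustration (neither is part of this commit):

import json

def extract_json_block(content: str) -> str:
    # Keep only the substring from the first "{" to the last "}",
    # dropping any surrounding prose or code fences the model added.
    return content[content.find("{") : content.rfind("}") + 1]

reply = 'Here are the criteria:\n{"Accuracy": {"description": "Correctness of the answer", "accepted_values": ["good", "bad"]}}\nLet me know if you need changes.'
parsed = json.loads(extract_json_block(reply))
print(parsed["Accuracy"]["accepted_values"])  # -> ['good', 'bad']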


102 changes: 102 additions & 0 deletions test/agentchat/contrib/agent_eval/test_agent_eval.py
@@ -0,0 +1,102 @@
#!/usr/bin/env python3 -m pytest

import json

import pytest
from conftest import reason, skip_openai # noqa: E402

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task

KEY_LOC = "notebook"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"


def remove_ground_truth(test_case):
test_details = json.loads(test_case)
# remove the ground truth fields from the test details before they are passed to the quantifier
correctness = test_details.pop("is_correct", None)
test_details.pop("correct_ans", None)
test_details.pop("check_result", None)
return test_details, correctness
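# Illustrative call (the field values below are assumptions, not taken from the sample files):
#   remove_ground_truth('{"problem": "2 + 3", "response": "5", "is_correct": true}')
#   -> ({"problem": "2 + 3", "response": "5"}, True)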


if not skip_openai:
openai_config_list = autogen.config_list_from_json(
OAI_CONFIG_LIST,
file_location=KEY_LOC,
# The Retrieval tool requires at least gpt-3.5-turbo-1106 (newer versions are supported) or gpt-4-turbo-preview models.
# https://platform.openai.com/docs/models/overview
filter_dict={
"api_type": ["openai"],
"model": [
"gpt-4-turbo",
"gpt-4-turbo-preview",
"gpt-4-0125-preview",
"gpt-4-1106-preview",
"gpt-3.5-turbo",
"gpt-3.5-turbo-0125",
"gpt-3.5-turbo-1106",
],
},
)

aoai_config_list = autogen.config_list_from_json(
OAI_CONFIG_LIST,
file_location=KEY_LOC,
filter_dict={"api_type": ["azure"]},
)

success_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_successful.txt", "r").read()
response_successful = remove_ground_truth(success_str)[0]
failed_str = open("test/test_files/agenteval-in-out/samples/sample_math_response_failed.txt", "r").read()
response_failed = remove_ground_truth(failed_str)[0]
task = Task.parse_json_str(
json.dumps(
{
"name": "Math problem solving",
"description": "Given any question, the system needs to solve the problem as consisely and accurately as possible",
"successful_response": response_successful,
"failed_response": response_failed,
}
)
)


@pytest.mark.skipif(
skip_openai,
reason=reason,
)
def test_generate_criteria():
criteria = generate_criteria(task=task, llm_config={"config_list": aoai_config_list})
assert criteria
assert len(criteria) > 0
assert criteria[0].description
assert criteria[0].name
assert criteria[0].accepted_values


@pytest.mark.skipif(
skip_openai,
reason=reason,
)
def test_quantify_criteria():
criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
criteria = open(criteria_file, "r").read()
criteria = Criterion.parse_json_str(criteria)

test_case = open("test/test_files/agenteval-in-out/samples/sample_test_case.json", "r").read()
test_case, ground_truth = remove_ground_truth(test_case)

quantified = quantify_criteria(
llm_config={"config_list": aoai_config_list},
criteria=criteria,
task=task,
test_case=test_case,
ground_truth=ground_truth,
)
assert quantified
assert quantified["actual_success"]
assert quantified["estimated_performance"]
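
The assertions above only check that the quantifier returned both keys. As a rough illustration of how a caller might consume that result, assuming estimated_performance holds a JSON string mapping each criterion name to its assessed value (an assumption this commit does not verify), a hedged sketch:

import json

# Hypothetical quantifier output; the structure of estimated_performance is assumed.
quantified = {
    "actual_success": True,
    "estimated_performance": '{"Problem Interpretation": "completely off", "Efficiency": "moderately efficient"}',
}

performance = json.loads(quantified["estimated_performance"])
for criterion_name, assessment in performance.items():
    print(f"{criterion_name}: {assessment}")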
56 changes: 56 additions & 0 deletions test/agentchat/contrib/agent_eval/test_criterion.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3 -m pytest

from autogen.agentchat.contrib.agent_eval.criterion import Criterion


def test_parse_json_str():
criteria_file = "test/test_files/agenteval-in-out/samples/sample_math_criteria.json"
criteria = open(criteria_file, "r").read()
criteria = Criterion.parse_json_str(criteria)
assert criteria
assert len(criteria) == 6
assert criteria[0].name == "Problem Interpretation"
assert criteria[0].description == "Ability to correctly interpret the problem."
assert len(criteria[0].accepted_values) == 5


def test_write_json():
criteria1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
criteria2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
output = Criterion.write_json([criteria1, criteria2])
assert (
output
== """{
"test1": {
"description": "test1 description",
"accepted_values": [
"test1",
"test2"
],
"sub_criteria": []
},
"test2": {
"description": "test2 description",
"accepted_values": [
"test1",
"test2"
],
"sub_criteria": []
}
}"""
)


def test_write_parse_compatibility():
criterion1 = Criterion(name="test1", description="test1 description", accepted_values=["test1", "test2"])
criterion2 = Criterion(name="test2", description="test2 description", accepted_values=["test1", "test2"])
output = Criterion.write_json([criterion1, criterion2])
criteria = Criterion.parse_json_str(output)
assert criteria
assert len(criteria) == 2
assert criteria[0].name == "test1"
assert criteria[0].description == "test1 description"
assert len(criteria[0].accepted_values) == 2
assert criteria[1].name == "test2"
assert criteria[1].description == "test2 description"
assert len(criteria[1].accepted_values) == 2
28 changes: 28 additions & 0 deletions test/agentchat/contrib/agent_eval/test_task.py
@@ -0,0 +1,28 @@
#!/usr/bin/env python3 -m pytest

from autogen.agentchat.contrib.agent_eval.task import Task


def test_parse_json_str():
task = Task.parse_json_str(
"""{
"name": "Math problem solving",
"description": "Given any question, the system needs to solve the problem as consisely and accurately as possible",
"successful_response": {
"message": "The answer is 5",
"is_correct": true
},
"failed_response": {
"message": "I don't know the answer",
"is_correct": false
}
}"""
)
assert task
assert task.name == "Math problem solving"
assert (
task.description
== "Given any question, the system needs to solve the problem as consisely and accurately as possible"
)
assert task.successful_response == {"message": "The answer is 5", "is_correct": True}
assert task.failed_response == {"message": "I don't know the answer", "is_correct": False}
