AgentEval Offline Integration #2345

Closed
wants to merge 28 commits
Changes from 3 commits
Commits
28 commits
23adac1
first pass at offline agent eval integration
jluey1 Mar 26, 2024
ece7a24
Integrating AgentEval for offline scenarios
jluey1 Apr 10, 2024
3d58536
removing old changes
jluey1 Apr 10, 2024
d604f46
fixing notebook, updating docs
jluey1 Apr 17, 2024
e3cee1f
fixing subcriteria bug
jluey1 Apr 17, 2024
04f0938
updating class comment
jluey1 Apr 17, 2024
c98054e
cleaning up agent constructors
jluey1 Apr 17, 2024
2c1dd84
moving AgentEval agents to separate folder and adding a brief README
jluey1 Apr 17, 2024
a2a5d0a
fixing build breaks
jluey1 Apr 18, 2024
2d6658a
fixing formatting break
jluey1 Apr 18, 2024
5a3969a
Merge branch 'main' into offlineAgentEval
BeibinLi Apr 20, 2024
de2ae18
fixing comments
jluey1 Apr 29, 2024
489ccb2
consolidating files in the agenteval folder under contrib and cleanin…
jluey1 Apr 29, 2024
9a6ecfa
:Merge branch 'offlineAgentEval' of github.com:jluey1/autogen into of…
jluey1 Apr 29, 2024
be35fbc
fixing import ordering
jluey1 Apr 29, 2024
e8d1f59
adding basic agenteval tests and fixing criteria parsing bug
jluey1 Apr 30, 2024
7e64a96
Merge branch 'main' into offlineAgentEval
sonichi May 2, 2024
2300ea6
first try at adding openai agenteval tests to build process
jluey1 May 3, 2024
0e26a9e
merging upstream and first try at adding openai agenteval tests to bu…
jluey1 May 3, 2024
50cf0c7
adding non-openai agenteval tests to build process
jluey1 May 3, 2024
72bd361
updating test settings
jluey1 May 6, 2024
e8e9eb6
updating openai test
jluey1 May 6, 2024
289d9ed
Update test/agentchat/contrib/agent_eval/test_agent_eval.py
jluey1 May 7, 2024
6b4de8c
Update .github/workflows/contrib-openai.yml
jluey1 May 7, 2024
efe9351
Merge branch 'main' of github.com:jluey1/autogen
jluey1 May 7, 2024
71284e4
Merge branch 'main' into offlineAgentEval
jluey1 May 7, 2024
fdc1811
updating typing and converting to pydantic objects
jluey1 May 9, 2024
1aa8cee
fixing test file
jluey1 May 9, 2024
55 changes: 55 additions & 0 deletions autogen/agentchat/contrib/critic_agent.py
@@ -0,0 +1,55 @@
from typing import Callable, Dict, Literal, Optional, Union

from autogen.agentchat.conversable_agent import ConversableAgent
from autogen.runtime_logging import log_new_agent, logging_enabled


class CriticAgent(ConversableAgent):
"""
An agent for creating a list of criteria for evaluating the utility of a given task.
"""

DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant.
Convert the evaluation criteria into a dictionary where the keys are the criteria.
The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key}
Make sure the keys are criteria for assessing the given task. "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description.
Return the dictionary."""

DEFAULT_DESCRIPTION = "An AI agent for creating list criteria for evaluating the utility of a given task."

def __init__(
self,
name="critic",
system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
llm_config: Optional[Union[Dict, bool]] = None,
max_consecutive_auto_reply: Optional[int] = None,
human_input_mode: Optional[str] = "NEVER",
description: Optional[str] = DEFAULT_DESCRIPTION,
**kwargs,
):
"""
Args:
- name (str): agent name.
- system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
- llm_config (dict or False or None): llm inference configuration.
Please refer to [OpenAIWrapper.create](/docs/reference/oai/client#create)
for available options.
- max_consecutive_auto_reply (int): the maximum number of consecutive auto replies.
default to None (no limit provided, class attribute MAX_CONSECUTIVE_AUTO_REPLY will be used as the limit in this case).
The limit only plays a role when human_input_mode is not "ALWAYS".
- human_input_mode (str): The human input mode for the agent.
- "ALWAYS": The agent will always require human input.
- "NEVER": The agent will never require human input.
- "SOMETIMES": The agent will sometimes require human input.
- description (str): The description of the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](../conversable_agent#__init__).
"""
super().__init__(
name=name,
system_message=system_message,
human_input_mode=human_input_mode,
llm_config=llm_config,
max_consecutive_auto_reply=max_consecutive_auto_reply,
description=description,
**kwargs,
)
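
For reviewers, a minimal usage sketch of CriticAgent on its own; the OAI_CONFIG_LIST file name and the task text are assumptions for illustration rather than anything defined in this PR:

import autogen
from autogen.agentchat.contrib.critic_agent import CriticAgent

# Assumes the conventional OAI_CONFIG_LIST file; adjust to your environment.
llm_config = {"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")}

critic = CriticAgent(llm_config=llm_config)
critic_user = autogen.UserProxyAgent(
    name="critic_user",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=0,  # collect a single reply from the critic
    code_execution_config={"use_docker": False},
)
critic_user.initiate_chat(critic, message="Task: solve grade-school math word problems.")
print(critic_user.last_message()["content"])  # dictionary of suggested criteria
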
52 changes: 52 additions & 0 deletions autogen/agentchat/contrib/quantifier_agent.py
@@ -0,0 +1,52 @@
from typing import Callable, Dict, Literal, Optional, Union

from autogen.agentchat.conversable_agent import ConversableAgent
from autogen.runtime_logging import log_new_agent, logging_enabled


class QuantifierAgent(ConversableAgent):
"""
An agent for quantifying the performance of a system using the provided criteria.
"""

DEFAULT_SYSTEM_MESSAGE = """"You are a helpful assistant. You quantify the output of different tasks based on the given criteria.
The criterion is given in a dictionary format where each key is a dintinct criteria.
The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key}
You are going to quantify each of the crieria for a given task based on the task description.
Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria.
Return only the dictionary."""

DEFAULT_DESCRIPTION = "An AI agent for quantifing the performance of a system using the provided criteria."

def __init__(
self,
name="quantifier",
system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
llm_config: Optional[Union[Dict, bool]] = None,
max_consecutive_auto_reply: Optional[int] = None,
human_input_mode: Optional[str] = "NEVER",
description: Optional[str] = DEFAULT_DESCRIPTION,
**kwargs,
):
"""
Args:
- name (str): agent name.
- system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
- llm_config (dict or False or None): llm inference configuration.
Please refer to [OpenAIWrapper.create](/docs/reference/oai/client#create)
for available options. To disable llm-based auto reply, set to False.
- max_consecutive_auto_reply (int): the maximum number of consecutive auto replies.
default to None (no limit provided, class attribute MAX_CONSECUTIVE_AUTO_REPLY will be used as the limit in this case).
The limit only plays a role when human_input_mode is not "ALWAYS".
- human_input_mode (str): The human input mode for the agent.
- "ALWAYS": The agent will always require human input.
- "NEVER": The agent will never require human input.
- "SOMETIMES": The agent will sometimes require human input.
- description (str): The description of the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](../conversable_agent#__init__).
"""
super().__init__(
name=name, system_message=system_message, human_input_mode=human_input_mode, llm_config=llm_config,
max_consecutive_auto_reply=max_consecutive_auto_reply, description=description, **kwargs
)
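
As a rough sketch, the quantifier can also be driven directly with a hand-written criteria dictionary; the criterion, task text, and test case below are invented for illustration:

import json

import autogen
from autogen.agentchat.contrib.quantifier_agent import QuantifierAgent

criteria = {
    "accuracy": {
        "description": "How correct the final answer is.",
        "accepted_values": ["poor", "fair", "good", "excellent"],
    }
}

quantifier = QuantifierAgent(llm_config={"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")})
quantifier_user = autogen.UserProxyAgent(
    name="quantifier_user",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=0,
    code_execution_config={"use_docker": False},
)
quantifier_user.initiate_chat(
    quantifier,
    message="Task: solve grade-school math word problems.\n"
    + "Evaluation dictionary: " + json.dumps(criteria) + "\n"
    + "actual test case to evaluate: The answer is 42.",
)
print(quantifier_user.last_message()["content"])  # e.g. {"accuracy": "good"}
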
57 changes: 57 additions & 0 deletions autogen/agentchat/contrib/subcritic_agent.py
@@ -0,0 +1,57 @@
from typing import Callable, Dict, Literal, Optional, Union

from autogen.agentchat.conversable_agent import ConversableAgent
from autogen.runtime_logging import log_new_agent, logging_enabled


class SubCriticAgent(ConversableAgent):
"""
An agent for creating subcriteria from a given list of criteria for evaluating the utility of a given task.
"""

DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant to the critic agent. You suggest sub criteria for evaluating different tasks based on the criteria provided by the critic agent (if you feel it is needed).
They should be distinguishable, quantifiable, and related to the overall theme of the critic's provided criteria.
You operate by taking in the description of the criteria. You then create a new key called sub_criteria where you provide the sub criteria for the given criteria.
The value of the sub_criteria is a dictionary where the keys are the subcriteria and each value is as follows {"description": sub criteria description , "accepted_values": possible accepted inputs for this key}
Do this for each criteria provided by the critic (removing the criteria's accepted values). "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description.
Once you have created the sub criteria for the given criteria, you return the json (make sure to include the contents of the critic's dictionary in the final dictionary as well).
Make sure to return a valid json and not a python dictionary."""

DEFAULT_DESCRIPTION = "An AI agent for creating subcriteria from a given list of criteria."

def __init__(
self,
name="subcritic",
system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
llm_config: Optional[Union[Dict, bool]] = None,
max_consecutive_auto_reply: Optional[int] = None,
human_input_mode: Optional[str] = "NEVER",
description: Optional[str] = DEFAULT_DESCRIPTION,
**kwargs,
):
"""
Args:
- name (str): agent name.
- system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
- llm_config (dict or False or None): llm inference configuration.
Please refer to [OpenAIWrapper.create](/docs/reference/oai/client#create)
for available options.
- max_consecutive_auto_reply (int): the maximum number of consecutive auto replies.
default to None (no limit provided, class attribute MAX_CONSECUTIVE_AUTO_REPLY will be used as the limit in this case).
The limit only plays a role when human_input_mode is not "ALWAYS".
- human_input_mode (str): The human input mode for the agent.
- "ALWAYS": The agent will always require human input.
- "NEVER": The agent will never require human input.
- "SOMETIMES": The agent will sometimes require human input.
- description (str): The description of the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](../conversable_agent#__init__).
"""
super().__init__(
name=name,
system_message=system_message,
human_input_mode=human_input_mode,
llm_config=llm_config,
max_consecutive_auto_reply=max_consecutive_auto_reply,
description=description,
**kwargs,
)
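
To make the prompt concrete, this is roughly the nested shape the subcritic is asked to return; the criterion names are illustrative only, and the "sub_criteria" key matches what Criterion.parse_json_str later in this PR looks for:

# Illustrative output shape only; real content comes from the LLM.
example_subcritic_output = {
    "accuracy": {
        "description": "How correct the final answer is.",
        "sub_criteria": {
            "numerical_correctness": {
                "description": "Whether the numeric result matches the expected answer.",
                "accepted_values": ["incorrect", "partially correct", "correct"],
            },
            "reasoning_quality": {
                "description": "Whether the intermediate steps are sound.",
                "accepted_values": ["poor", "fair", "good", "excellent"],
            },
        },
    }
}
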
102 changes: 102 additions & 0 deletions autogen/agenteval/agent_eval.py
@@ -0,0 +1,102 @@
from typing import Dict, List, Optional, Union

import autogen
from autogen.agentchat.contrib.critic_agent import CriticAgent
from autogen.agentchat.contrib.quantifier_agent import QuantifierAgent
from autogen.agentchat.contrib.subcritic_agent import SubCriticAgent
from autogen.agenteval.criterion import Criterion
from autogen.agenteval.task import Task


def generate_criteria(
llm_config: Optional[Union[Dict, bool]] = None,
task: Task = None,
additional_instructions: str = "",
max_round=2,
use_subcritic: bool = False,
):
"""
Creates a list of criteria for evaluating the utility of a given task.
args:
- llm_config (dict or bool): llm inference configuration.
- task (Task): The task to evaluate.
- additional_instructions (str): Additional instructions for the criteria agent.
- max_round (int): The maximum number of rounds to run the conversation.
- use_subcritic (bool): Whether to use the subcritic agent to generate subcriteria.
returns:
- list: A list of Criterion objects for evaluating the utility of the given task.
"""
critic = CriticAgent(
system_message=CriticAgent.DEFAULT_SYSTEM_MESSAGE + "\n" + additional_instructions,
llm_config=llm_config,
)

critic_user = autogen.UserProxyAgent(
name="critic_user",
max_consecutive_auto_reply=0, # terminate without auto-reply
human_input_mode="NEVER",
code_execution_config={"use_docker": False},
)

agents = [critic_user, critic]

if use_subcritic:
subcritic = SubCriticAgent(
llm_config=llm_config,
)
agents.append(subcritic)

groupchat = autogen.GroupChat(
agents=agents, messages=[], max_round=max_round, speaker_selection_method="round_robin"
)
critic_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)

critic_user.initiate_chat(critic_manager, message=task.sys_msg)
criteria = critic_user.last_message()
print(criteria["content"])
criteria = Criterion.parse_json_str(criteria["content"])
return criteria


def quantify_criteria(
llm_config: Optional[Union[Dict, bool]] = None,
criteria: Optional[List[Criterion]] = None,
task: Task = None,
test_case: Dict = None,
ground_truth: str = "",
):
"""
Quantifies the performance of a system using the provided criteria.
args:
- llm_config (dict or bool): llm inference configuration.
- criteria ([Criterion]): A list of criteria for evaluating the utility of a given task.
- task (Task): The task to evaluate.
- test_case (dict): The test case to evaluate.
- ground_truth (str): The ground truth for the test case.
returns:
- dict: A dictionary where the keys are the criteria and the values are the assessed performance based on the accepted values for each criterion.
"""
quantifier = QuantifierAgent(
llm_config=llm_config,
)

quantifier_user = autogen.UserProxyAgent(
name="quantifier_user",
max_consecutive_auto_reply=0, # terminate without auto-reply
human_input_mode="NEVER",
code_execution_config={"use_docker": False},
)

cq_results = quantifier_user.initiate_chat( # noqa: F841
quantifier,
message=task.sys_msg
+ "Evaluation dictionary: "
+ Criterion.write_json(criteria)
+ "actual test case to evaluate: "
+ str(test_case),
)
quantified_results = quantifier_user.last_message()
return {"actual_success": ground_truth, "estimated_performance": quantified_results["content"]}
61 changes: 61 additions & 0 deletions autogen/agenteval/criterion.py
@@ -0,0 +1,61 @@
import json
from typing import List, Optional


class Criterion:
"""
A class that represents a criterion for agent evaluation.
"""

def __init__(self, name: str, description: str, accepted_values: List[str], sub_criteria: Optional[List["Criterion"]] = None):
"""
args:
- name (str): The name of the criterion.
- description (str): The description of the criterion.
- accepted_values ([str]): The list of accepted values for the criterion.
- sub_criteria ([Criterion]): An optional list of Criterion objects that further refine this criterion.
"""
self.name = name
self.description = description
self.accepted_values = accepted_values
self.sub_criteria = sub_criteria

def to_json(self):
"""
Create a json object from the criterion, including any sub criteria.
"""
json_dict = {self.name: {"description": self.description, "accepted_values": self.accepted_values}}
if self.sub_criteria:
json_dict[self.name]["sub_criteria"] = {k: v for sub in self.sub_criteria for k, v in sub.to_json().items()}
return json_dict

@staticmethod
def parse_json_str(criteria: str):
"""
Create a list of Criterion objects from a json string.
args:
- criteria (str): Json string that represents the criteria
returns:
- [Criterion]: A list of Criterion objects that represents the json criteria information.
"""
criteria_list = []
parsed_json = json.loads(criteria)
for criterion_name, criterion_data in parsed_json.items():
sub_criteria = None
accepted_values = ""
if criterion_data.get("sub_criteria") is not None:
sub_criteria = Criterion.parse_json_str(json.dumps(criterion_data.get("sub_criteria")))
else:
accepted_values = criterion_data.get("accepted_values")
criterion = Criterion(criterion_name, criterion_data["description"], accepted_values, sub_criteria)
criteria_list.append(criterion)
return criteria_list

@staticmethod
def write_json(criteria):
"""
Create a json string from a list of Criterion objects.
args:
- criteria ([Criterion]): A list of Criterion objects.
returns:
- str: A json string that represents the list of Criterion objects.
"""
criteria_json = {}
for criterion in criteria:
criteria_json.update(criterion.to_json())
return json.dumps(criteria_json, indent=2)
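
A small round-trip sketch of these helpers; the criterion content is invented for illustration:

import json

from autogen.agenteval.criterion import Criterion

raw = json.dumps(
    {
        "clarity": {
            "description": "How clearly the response is written.",
            "accepted_values": ["poor", "fair", "good", "excellent"],
        }
    }
)

criteria = Criterion.parse_json_str(raw)  # -> list of Criterion objects
print(criteria[0].name, criteria[0].accepted_values)
print(Criterion.write_json(criteria))  # back to a json string
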
41 changes: 41 additions & 0 deletions autogen/agenteval/task.py
@@ -0,0 +1,41 @@
import json


class Task:
"""
Task class represents a task that the agent should be able to perform.
"""

def __init__(self, name: str, description: str, successful_response: str, failed_response: str):
"""
args:
- name (str): The name of the task.
- description (str): The description of the task.
- successful_response (str): An example of a successful response execution.
- failed_response (str): An example of a failed response execution.
"""
self.name = name
self.description = description
self.successful_response = successful_response
self.failed_response = failed_response
self.sys_msg = f"""Task: {self.name}.
Task description: {self.description}
Task successful example: {self.successful_response}
Task failed example: {self.failed_response}
"""

@staticmethod
def parse_json_str(task: str):
"""
Create a Task object from a json object.
args:
- task (str): A json string that represents the task.
returns:
- Task: A Task object that represents the json task information.
"""
json_data = json.loads(task)
name = json_data.get("name")
description = json_data.get("description")
successful_response = json_data.get("successful_response")
failed_response = json_data.get("failed_response")
return Task(name, description, successful_response, failed_response)
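
For completeness, a short sketch of building a Task from json, with invented field values; sys_msg is the string that generate_criteria and quantify_criteria feed to the agents:

import json

from autogen.agenteval.task import Task

task_json = json.dumps(
    {
        "name": "Math problem solving",
        "description": "Given a math word problem, produce the correct numerical answer.",
        "successful_response": "The answer is 7.",
        "failed_response": "I am not sure how to solve this.",
    }
)

task = Task.parse_json_str(task_json)
print(task.sys_msg)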