Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

agenteval on autogenstudio #3572

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions autogen/agentchat/contrib/agent_eval/criterion.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import json
from typing import List
from typing import List, Optional

import pydantic_core
from pydantic import BaseModel
Expand All @@ -15,8 +15,8 @@ class Criterion(BaseModel):

name: str
description: str
accepted_values: List[str]
sub_criteria: List[Criterion] = list()
accepted_values: Optional[List[str]] = None
sub_criteria: Optional[List[Criterion]] = None

@staticmethod
def parse_json_str(criteria: str):
Expand All @@ -27,7 +27,14 @@ def parse_json_str(criteria: str):
returns:
[Criterion]: A list of Criterion objects that represents the json criteria information.
"""
return [Criterion(**crit) for crit in json.loads(criteria)]

def parse_dict(crit: dict):
if "sub_criteria" in crit:
crit["sub_criteria"] = [parse_dict(c) for c in crit["sub_criteria"]]
return Criterion(**crit)

criteria_list = json.loads(criteria)
return [parse_dict(crit) for crit in criteria_list]

@staticmethod
def write_json(criteria):
Expand All @@ -38,4 +45,4 @@ def write_json(criteria):
Returns:
str: A json string that represents the list of Criterion objects.
"""
return json.dumps([crit.model_dump() for crit in criteria], indent=2)
return json.dumps([crit.dict(exclude_unset=True) for crit in criteria], indent=2, default=str)
2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/agent_eval/critic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ class CriticAgent(ConversableAgent):
"""

DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant.
Convert the evaluation criteria into a list where each item is a criteria which consists of the following dictionary as follows
Convert the evaluation criteria into a json list where each item is a criteria which consists of the following dictionary as follows
{"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key}
Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description.
Output just the criteria string you have created, no code.
Expand Down
9 changes: 5 additions & 4 deletions autogen/agentchat/contrib/agent_eval/subcritic_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ class SubCriticAgent(ConversableAgent):

DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant to the critic agent. You suggest sub criteria for evaluating different tasks based on the criteria provided by the critic agent (if you feel it is needed).
They should be distinguishable, quantifiable, and related to the overall theme of the critic's provided criteria.
You operate by taking in the description of the criteria. You then create a new key called sub criteria where you provide the sub criteria for the given criteria.
The value of the sub_criteria is a dictionary where the keys are the subcriteria and each value is as follows {"description": sub criteria description , "accepted_values": possible accepted inputs for this key}
Do this for each criteria provided by the critic (removing the criteria's accepted values). "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description.
Once you have created the sub criteria for the given criteria, you return the json (make sure to include the contents of the critic's dictionary in the final dictionary as well).
You operate by taking in the description of the criteria. You then create a new key called sub_criteria where you provide the subcriteria for the given criteria.
The value of sub_criteria is a json list where each item is a subcriterion which consists of the following dictionary {"name": name of the subcriterion, "description": subcriterion description,
"accepted_values": possible accepted inputs for this key; they should be fine-grained and preferably multi-graded levels}
Do this for each criteria provided by the critic (removing the criteria's accepted values).
Once you have created the sub criteria for the given criteria, you return the updated criteria json (make sure to include the contents of the critic's dictionary in the final dictionary as well).
Make sure to return a valid json and no code"""

DEFAULT_DESCRIPTION = "An AI agent for creating subcriteria from a given list of criteria."
Expand Down
9 changes: 9 additions & 0 deletions samples/apps/autogen-studio/autogenstudio/datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,15 @@ class Workflow(SQLModel, table=True):
sample_tasks: Optional[List[str]] = Field(default_factory=list, sa_column=Column(JSON))


class Criteria(SQLModel, table=True):
    """Table model for the ``criteria`` table."""

    __tablename__ = "criteria"
    __table_args__ = {"sqlite_autoincrement": True}
    id: Optional[int] = Field(default=None, primary_key=True)
    # Explicit ``None`` defaults: a bare ``Optional[...]`` annotation is still a
    # required field under pydantic v2, and ``id`` above already follows this form.
    task_name: Optional[str] = None
    task_description: Optional[str] = None
    # Declared as ``str`` but stored in a JSON column.
    # presumably this is the JSON text of a criteria list; verify against the code that builds rows
    criteria: str = Field(sa_column=Column(JSON))


class Response(SQLModel):
message: str
status: bool
Expand Down
142 changes: 140 additions & 2 deletions samples/apps/autogen-studio/autogenstudio/web/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,22 @@
from contextlib import asynccontextmanager
from typing import Any, Union

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi import Body, FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from loguru import logger
from openai import OpenAIError
from pydantic import BaseModel

from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.task import Task
from autogenstudio.utils.utils import sanitize_model

from ..chatmanager import AutoGenChatManager
from ..database import workflow_from_id
from ..database.dbmanager import DBManager
from ..datamodel import Agent, Message, Model, Response, Session, Skill, Workflow
from ..datamodel import Agent, Criteria, Message, Model, Response, Session, Skill, Workflow
from ..profiler import Profiler
from ..utils import check_and_cast_datetime_fields, init_app_folders, md5_hash, test_model
from ..version import VERSION
Expand Down Expand Up @@ -473,6 +479,138 @@ async def run_session_workflow(message: Message, session_id: int, workflow_id: i
}


@api.get("/agenteval/criteria")
async def criteria():
    """List the stored ``Criteria`` entries.

    Returns:
        The result of ``list_entity`` for the ``Criteria`` model, requested
        with ``return_json=True``.
    """
    stored_entries = list_entity(Criteria, return_json=True)
    return stored_entries


@api.delete("/agenteval/criteria/delete/{criteria_id}")
async def delete_agenteval_criteria(criteria_id: int):
    """Delete the ``Criteria`` entry whose ``id`` equals ``criteria_id``.

    Args:
        criteria_id: Primary key of the entry to remove (taken from the URL path).

    Returns:
        The result of ``delete_entity``.
    """
    return delete_entity(Criteria, filters={"id": criteria_id})


class AgentEvalGenerate(BaseModel):
    """Request body accepted by ``POST /agenteval/criteria/generate``."""

    user_id: str
    model_id: int
    task_name: str
    task_description: str
    # Both session ids are optional. The default must be a plain ``None``:
    # a ``(None,)`` tuple is truthy and would defeat the "at least one
    # session" check in the handler.
    success_session_id: Union[int, None] = None
    failure_session_id: Union[int, None] = None
    additional_instructions: str = ""
    # Misspelled legacy name, kept so clients that already send it keep working.
    additonal_instructions: str = ""
    max_round: int = 5
    use_subcritic: bool = False


@api.post("/agenteval/criteria/generate")
async def generate_agenteval_criteria(params: AgentEvalGenerate):
    """Generate evaluation criteria for a task and store them.

    Args:
        params: Task description, model id and the session(s) to use as the
            successful / failed example.

    Returns:
        A ``Response`` with ``status=False`` when the task name is blank, no
        session was selected, or the model lookup failed; otherwise the result
        of ``create_entity`` for the new ``Criteria`` entry.
    """
    if params.task_name == "" or params.task_name.isspace():
        return Response(message="Task name is required.", status=False)
    if not params.success_session_id and not params.failure_session_id:
        return Response(message="At least one session is required to be selected.", status=False)

    task = Task(name=params.task_name, description=params.task_description, successful_response="", failed_response="")
    if params.success_session_id:
        task.successful_response = get_session(params.user_id, params.success_session_id)
    if params.failure_session_id:
        task.failed_response = get_session(params.user_id, params.failure_session_id)

    model = get_model(params.model_id)
    # get_model returns a failure Response instead of raising when the id is unknown.
    if isinstance(model, Response):
        return model

    # Prefer the correctly spelled field; fall back to the legacy misspelling.
    instructions = params.additional_instructions or params.additonal_instructions

    # NOTE(review): generate_criteria is called synchronously inside this async handler.
    generated = generate_criteria(
        llm_config=model,
        task=task,
        additional_instructions=instructions,
        max_round=params.max_round,
        use_subcritic=params.use_subcritic,
    )

    criteria_json = Criterion.write_json(generated)
    criteria_entry = Criteria(task_name=task.name, task_description=task.description, criteria=criteria_json)
    return create_entity(criteria_entry, Criteria)


@api.post("/agenteval/criteria/create")
async def create_agenteval_criteria(criteria: list[Criterion], task: Task):
    """Store a caller-supplied list of criteria for ``task``.

    Args:
        criteria: The criteria to serialise and store.
        task: The task the criteria belong to; its name must not be blank.

    Returns:
        A failure ``Response`` when the task name is missing or whitespace,
        otherwise the JSON string that was stored.
    """
    name_is_blank = not task.name or str.isspace(task.name)
    if name_is_blank:
        return Response(message="Task name is required.", status=False)

    criteria_json = Criterion.write_json(criteria)
    create_entity(
        Criteria(task_name=task.name, task_description=task.description, criteria=criteria_json),
        Criteria,
    )
    return criteria_json


@api.post("/agenteval/criteria/update/{criteria_id}")
async def update_agenteval_criteria(criteria: list[Criterion], task: Task, criteria_id: int):
    """Replace the stored criteria entry ``criteria_id`` with new content.

    Args:
        criteria: The new criteria to serialise and store.
        task: The task the criteria belong to; its name must not be blank.
        criteria_id: Primary key of the entry being replaced (from the URL path).

    Returns:
        A failure ``Response`` when the task name is missing or whitespace,
        otherwise the JSON string that was stored.
    """
    # Same validation as the create endpoint, done before anything is deleted.
    if not task.name or str.isspace(task.name):
        return Response(message="Task name is required.", status=False)

    # Build the replacement first so a serialisation error cannot leave the
    # old entry deleted with nothing stored in its place.
    criteria_json = Criterion.write_json(criteria)
    criteria_entry = Criteria(task_name=task.name, task_description=task.description, criteria=criteria_json)

    # NOTE(review): the replacement is created without an explicit id, so it
    # may not keep ``criteria_id`` - confirm against create_entity.
    delete_entity(Criteria, filters={"id": criteria_id})
    create_entity(criteria_entry, Criteria)
    return criteria_json


@api.post("/agenteval/criteria/validate")
async def validate_agenteval_criteria(criteria: str = Body(...)):
    """Check that ``criteria`` can be parsed by ``Criterion.parse_json_str``.

    Args:
        criteria: The criteria text taken from the request body.

    Returns:
        ``{"status": True}`` when parsing succeeds; otherwise a dict with
        ``"status": False`` and a ``"message"`` describing the error. The
        message is prefixed with ``"Invalid json: "`` for ``ValueError``.
    """
    try:
        Criterion.parse_json_str(criteria)
    except ValueError as ex:
        failure_message = "Invalid json: " + str(ex)
    except Exception as ex:
        failure_message = str(ex)
    else:
        return {"status": True}
    return {"status": False, "message": failure_message}


@api.post("/agenteval/quantify")
async def quantify_agenteval_criteria(criteria_id: int, model_id: int, task: Task, test_session_id: int, user_id: str):
    """Quantify a stored set of criteria against one chat session.

    Args:
        criteria_id: Primary key of the stored ``Criteria`` entry.
        model_id: Primary key of the model to use as ``llm_config``.
        task: The task being evaluated.
        test_session_id: Session whose messages form the test case.
        user_id: Owner of the test session.

    Returns:
        A failure ``Response`` when the criteria entry or the model cannot be
        found, otherwise the result of ``quantify_criteria``.
    """
    matches = list_entity(Criteria, filters={"id": criteria_id}).data
    # An unknown id yields no rows; indexing [0] blindly would raise IndexError.
    if not matches:
        return Response(message="Invalid criteria", status=False)
    criteria = Criterion.parse_json_str(matches[0]["criteria"])

    model = get_model(model_id)
    # get_model signals an unknown model with a failure Response, not an exception.
    if isinstance(model, Response):
        return model

    test_case = get_session(user_id=user_id, session_id=test_session_id)
    return quantify_criteria(llm_config=model, criteria=criteria, task=task, test_case=test_case)


def get_session(user_id: str, session_id: int) -> str:
    """Return the messages of one session as a single string.

    Args:
        user_id: Value matched against the messages' ``user_id``. The callers
            in this module pass a ``str``.
        session_id: Value matched against the messages' ``session_id``.

    Returns:
        ``str()`` of the message list returned by ``list_entity`` (ascending order).
    """
    filters = {"user_id": user_id, "session_id": session_id}
    messages = list_entity(Message, filters=filters, order="asc", return_json=True).data
    return str(messages)


def get_model(model_id: int):
    """Look up a model by id and return it in sanitized form.

    Args:
        model_id: Primary key of the ``Model`` entry.

    Returns:
        ``sanitize_model`` applied to the first match, or a ``Response`` with
        ``status=False`` and message ``"Invalid model"`` when nothing matches.
    """
    matches = list_entity(Model, filters={"id": model_id}).data
    if not (matches and len(matches) > 0):
        return Response(message="Invalid model", status=False)
    return sanitize_model(matches[0])


@api.get("/version")
async def get_version():
return {
Expand Down
21 changes: 21 additions & 0 deletions samples/apps/autogen-studio/frontend/src/components/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,27 @@ export interface IChatSession {
name: string;
}

/**
 * Client-side shape of one AgentEval criteria entry.
 */
export interface IAgentEvalCriteria {
  id?: number;
  task_name: string;
  task_description: string;
  // Criteria carried as a string, not as a structured object.
  criteria: string;
  // NOTE(review): presumably the ids of the model and session used for an evaluation run - confirm against the UI code.
  model_id?: number;
  execution_session_id?: number;
}

/**
 * Client-side shape of a criteria generation request.
 */
export interface IAgentEvalGenerate {
  user_id: string;
  model_id: number;
  task_name: string;
  task_description: string;
  // NOTE(review): both session ids are required here - confirm whether the
  // server needs both or whether these should be optional.
  success_session_id: number;
  failure_session_id: number;
  additional_instructions: string;
  max_round: number;
  use_subcritic: boolean;
}

export interface IGalleryItem {
id: number;
messages: Array<IMessage>;
Expand Down
Loading
Loading