microsoft · jluey1 · Sep 19, 2024 · Sep 26, 2024 · Sep 26, 2024
diff --git a/autogen/agentchat/contrib/agent_eval/criterion.py b/autogen/agentchat/contrib/agent_eval/criterion.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import json
-from typing import List
+from typing import List, Optional
 
 import pydantic_core
 from pydantic import BaseModel
@@ -15,8 +15,8 @@ class Criterion(BaseModel):
 
     name: str
     description: str
-    accepted_values: List[str]
-    sub_criteria: List[Criterion] = list()
+    accepted_values: Optional[List[str]] = None
+    sub_criteria: Optional[List[Criterion]] = None
 
     @staticmethod
     def parse_json_str(criteria: str):
@@ -27,7 +27,14 @@ def parse_json_str(criteria: str):
         returns:
             [Criterion]: A list of Criterion objects that represents the json criteria information.
         """
-        return [Criterion(**crit) for crit in json.loads(criteria)]
+
+        def parse_dict(crit: dict):  # recursively turn one decoded JSON object into a Criterion
+            if crit.get("sub_criteria"):  # absent, null (the field default) or empty: nothing to recurse into
+                crit["sub_criteria"] = [parse_dict(c) for c in crit["sub_criteria"]]
+            return Criterion(**crit)
+
+        criteria_list = json.loads(criteria)
+        return [parse_dict(crit) for crit in criteria_list]
 
     @staticmethod
     def write_json(criteria):
@@ -38,4 +45,4 @@ def write_json(criteria):
         Returns:
             str: A json string that represents the list of Criterion objects.
         """
-        return json.dumps([crit.model_dump() for crit in criteria], indent=2)
+        return json.dumps([crit.model_dump(exclude_unset=True) for crit in criteria], indent=2, default=str)
diff --git a/autogen/agentchat/contrib/agent_eval/critic_agent.py b/autogen/agentchat/contrib/agent_eval/critic_agent.py
@@ -9,7 +9,7 @@ class CriticAgent(ConversableAgent):
     """
 
     DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant.
-    Convert the evaluation criteria into a list where each item is a criteria which consists of the following dictionary as follows
+    Convert the evaluation criteria into a json list where each item is a criteria which consists of the following dictionary as follows
     {"name": name of the criterion, "description": criteria description , "accepted_values": possible accepted inputs for this key}
     Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description.
     Output just the criteria string you have created, no code.

diff --git a/autogen/agentchat/contrib/agent_eval/subcritic_agent.py b/autogen/agentchat/contrib/agent_eval/subcritic_agent.py
@@ -10,10 +10,11 @@ class SubCriticAgent(ConversableAgent):
 
     DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant to the critic agent. You suggest sub criteria for evaluating different tasks based on the criteria provided by the critic agent (if you feel it is needed).
         They should be distinguishable, quantifiable, and related to the overall theme of the critic's provided criteria.
-        You operate by taking in the description of the criteria. You then create a new key called sub criteria where you provide the sub criteria for the given criteria.
-        The value of the sub_criteria is a dictionary where the keys are the subcriteria and each value is as follows {"description": sub criteria description , "accepted_values": possible accepted inputs for this key}
-        Do this for each criteria provided by the critic (removing the criteria's accepted values). "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description.
-        Once you have created the sub criteria for the given criteria, you return the json (make sure to include the contents of the critic's dictionary in the final dictionary as well).
+        You operate by taking in the description of the criteria. You then create a new key called sub_criteria where you provide the subcriteria for the given criteria.
+        The value of the sub_criteria is a into a json list where each item is a subcriterion which consists of the following dictionary {"name": name of the subcriterion, "description": subcriteria description ,
+        "accepted_values": possible accepted inputs for this key. They should be that are fine-grained and preferably multi-graded levels.}
+        Do this for each criteria provided by the critic (removing the criteria's accepted values).
+        Once you have created the sub criteria for the given criteria, you return the updated criteria json (make sure to include the contents of the critic's dictionary in the final dictionary as well).
         Make sure to return a valid json and no code"""
 
     DEFAULT_DESCRIPTION = "An AI agent for creating subcriteria from a given list of criteria."

diff --git a/samples/apps/autogen-studio/autogenstudio/datamodel.py b/samples/apps/autogen-studio/autogenstudio/datamodel.py
@@ -284,6 +284,15 @@ class Workflow(SQLModel, table=True):
     sample_tasks: Optional[List[str]] = Field(default_factory=list, sa_column=Column(JSON))
 
 
+class Criteria(SQLModel, table=True):  # table model for one stored set of evaluation criteria for a task
+    __tablename__ = "criteria"
+    __table_args__ = {"sqlite_autoincrement": True}
+    id: Optional[int] = Field(default=None, primary_key=True)  # primary key; presumably assigned by the database
+    task_name: Optional[str]
+    task_description: Optional[str]
+    criteria: str = Field(sa_column=Column(JSON))  # annotated str, but kept in a JSON-typed column
+
+
 class Response(SQLModel):
     message: str
     status: bool

diff --git a/samples/apps/autogen-studio/autogenstudio/web/app.py b/samples/apps/autogen-studio/autogenstudio/web/app.py
@@ -6,16 +6,22 @@
 from contextlib import asynccontextmanager
 from typing import Any, Union
 
-from fastapi import FastAPI, WebSocket, WebSocketDisconnect
+from fastapi import Body, FastAPI, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from loguru import logger
 from openai import OpenAIError
+from pydantic import BaseModel
+
+from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
+from autogen.agentchat.contrib.agent_eval.criterion import Criterion
+from autogen.agentchat.contrib.agent_eval.task import Task
+from autogenstudio.utils.utils import sanitize_model
 
 from ..chatmanager import AutoGenChatManager
 from ..database import workflow_from_id
 from ..database.dbmanager import DBManager
-from ..datamodel import Agent, Message, Model, Response, Session, Skill, Workflow
+from ..datamodel import Agent, Criteria, Message, Model, Response, Session, Skill, Workflow
 from ..profiler import Profiler
 from ..utils import check_and_cast_datetime_fields, init_app_folders, md5_hash, test_model
 from ..version import VERSION
@@ -473,6 +479,138 @@ async def run_session_workflow(message: Message, session_id: int, workflow_id: i
         }
 
 
+@api.get("/agenteval/criteria")
+async def criteria():  # lists Criteria rows; no filters are passed, so presumably every stored row is returned
+    return list_entity(Criteria, return_json=True)
+
+
+@api.delete("/agenteval/criteria/delete/{criteria_id}")
+async def delete_agenteval_criteria(criteria_id: int):
+    filters = {"id": criteria_id}  # select the row by its id taken from the URL path
+    return delete_entity(Criteria, filters=filters)  # delete_entity's result goes back to the client as-is
+
+
+class AgentEvalGenerate(BaseModel):
+    """Request body of POST /agenteval/criteria/generate."""
+
+    user_id: str
+    model_id: int
+    task_name: str
+    task_description: str
+    # The session ids are optional and default to None. A tuple default such as (None,) is truthy and
+    # would slip past the "at least one session" check in the handler.
+    success_session_id: Union[int, None] = None
+    failure_session_id: Union[int, None] = None
+    additional_instructions: str = ""  # the spelling the frontend's IAgentEvalGenerate sends
+    additonal_instructions: str = ""  # deprecated misspelling, still read as a fallback
+    max_round: int = 5
+    use_subcritic: bool = False
+
+
+@api.post("/agenteval/criteria/generate")
+async def generate_agenteval_criteria(params: AgentEvalGenerate):
+    """Generate criteria for a task from its example sessions and store them.
+
+    Returns a failed Response when the task name is blank, no session is selected, or get_model
+    rejects the model id; otherwise returns whatever create_entity returns for the new row.
+    """
+    if not params.task_name or params.task_name.isspace():
+        return Response(message="Task name is required.", status=False)
+    if not params.success_session_id and not params.failure_session_id:
+        return Response(message="At least one session is required to be selected.", status=False)
+
+    task = Task(name=params.task_name, description=params.task_description, successful_response="", failed_response="")
+    if params.success_session_id:
+        task.successful_response = get_session(params.user_id, params.success_session_id)
+    if params.failure_session_id:
+        task.failed_response = get_session(params.user_id, params.failure_session_id)
+
+    model = get_model(params.model_id)
+    if isinstance(model, Response):  # get_model signals an unknown id with a failed Response
+        return model
+
+    generated = generate_criteria(
+        llm_config=model,
+        task=task,
+        additional_instructions=params.additional_instructions or params.additonal_instructions,
+        max_round=params.max_round,
+        use_subcritic=params.use_subcritic,
+    )
+
+    entry = Criteria(task_name=task.name, task_description=task.description, criteria=Criterion.write_json(generated))
+    return create_entity(entry, Criteria)
+
+
+@api.post("/agenteval/criteria/create")
+async def create_agenteval_criteria(criteria: list[Criterion], task: Task):
+    """Store caller-supplied criteria for a task and return them as a JSON string.
+
+    A blank task name yields a failed Response and nothing is stored.
+    """
+    name_is_blank = not task.name or task.name.isspace()
+    if name_is_blank:
+        return Response(message="Task name is required.", status=False)
+    serialized = Criterion.write_json(criteria)
+    create_entity(Criteria(task_name=task.name, task_description=task.description, criteria=serialized), Criteria)
+    return serialized
+
+
+@api.post("/agenteval/criteria/update/{criteria_id}")
+async def update_agenteval_criteria(criteria: list[Criterion], task: Task, criteria_id: int):
+    if not task.name or task.name.isspace():  # same rule as create; checked before the old row is deleted
+        return Response(message="Task name is required.", status=False)
+    serialized = Criterion.write_json(criteria)  # serialize before the delete so a failure keeps the old row
+    delete_entity(Criteria, filters={"id": criteria_id})
+    create_entity(Criteria(task_name=task.name, task_description=task.description, criteria=serialized), Criteria)
+    return serialized
+
+
+@api.post("/agenteval/criteria/validate")
+async def validate_agenteval_criteria(criteria: str = Body(...)):
+    """Tell whether `criteria` parses via Criterion.parse_json_str; failures carry a "message"."""
+    try:
+        Criterion.parse_json_str(criteria)  # only success or failure matters; the parsed list is discarded
+    except ValueError as err:  # includes json.JSONDecodeError
+        return {"status": False, "message": "Invalid json: " + str(err)}
+    except Exception as err:
+        # Deliberate catch-all: any other parse failure is reported to the caller rather than raised.
+        return {"status": False, "message": str(err)}
+    else:
+        return {"status": True}
+
+
+@api.post("/agenteval/quantify")
+async def quantify_agenteval_criteria(criteria_id: int, model_id: int, task: Task, test_session_id: int, user_id: str):
+    rows = list_entity(Criteria, filters={"id": criteria_id}).data
+    model = get_model(model_id)
+    if not rows or isinstance(model, Response):  # unknown criteria id (was an IndexError) or unknown model id
+        return model if rows else Response(message="Invalid criteria", status=False)
+    criteria = Criterion.parse_json_str(rows[0]["criteria"])  # the row stores the criteria as a JSON string
+    test_case = get_session(user_id=user_id, session_id=test_session_id)
+    return quantify_criteria(llm_config=model, criteria=criteria, task=task, test_case=test_case)
+
+
+def get_session(user_id: str, session_id: int):
+    """Return str() of the Message rows for `user_id` / `session_id`, listed with order="asc"."""
+    rows = list_entity(Message, filters={"user_id": user_id, "session_id": session_id}, order="asc", return_json=True)
+    return str(rows.data)
+
+
+def get_model(model_id: int):
+    """Look up a Model row by id and return it passed through sanitize_model.
+
+    Returns:
+        The sanitized model, or a failed Response ("Invalid model") when no row matches.
+    """
+    matches = list_entity(Model, filters={"id": model_id}).data
+    if not matches:
+        # Nothing found: hand the caller a failed Response instead of raising.
+        return Response(message="Invalid model", status=False)
+
+    first_match = matches[0]
+    return sanitize_model(first_match)
+
+
 @api.get("/version")
 async def get_version():
     return {

diff --git a/samples/apps/autogen-studio/frontend/src/components/types.ts b/samples/apps/autogen-studio/frontend/src/components/types.ts
@@ -105,6 +105,27 @@ export interface IChatSession {
   name: string;
 }
 
+export interface IAgentEvalCriteria { // one criteria record as handled by the frontend
+  id?: number;
+  task_name: string;
+  task_description: string;
+  criteria: string; // NOTE(review): presumably the JSON text of the criteria list — confirm against the API
+  model_id?: number;
+  execution_session_id?: number;
+}
+
+export interface IAgentEvalGenerate { // NOTE(review): appears to be the body for the backend AgentEvalGenerate model — confirm field names match
+  user_id: string;
+  model_id: number;
+  task_name: string;
+  task_description: string;
+  success_session_id: number;
+  failure_session_id: number;
+  additional_instructions: string;
+  max_round: number;
+  use_subcritic: boolean;
+}
+
 export interface IGalleryItem {
   id: number;
   messages: Array<IMessage>;