
Commit

Merge pull request #2071 from Agenta-AI/AGE-587/-implement-evaluation-main-page

[feature] Evaluators - Debugging
mmabrouk authored Sep 23, 2024
2 parents 0ef9e56 + 10797f4 commit 7ad6d7c
Showing 72 changed files with 4,685 additions and 2,842 deletions.
25 changes: 24 additions & 1 deletion agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -1,7 +1,9 @@
from enum import Enum
from datetime import datetime
from pydantic import BaseModel
from typing import Optional, List, Dict, Any, Union

from pydantic import BaseModel, Field, model_validator

from agenta_backend.models.api.api_models import Result


@@ -12,6 +14,8 @@ class Evaluator(BaseModel):
settings_template: dict
description: Optional[str] = None
oss: Optional[bool] = False
requires_llm_api_keys: Optional[bool] = False
tags: List[str]


class EvaluatorConfig(BaseModel):
@@ -80,6 +84,25 @@ class Evaluation(BaseModel):
updated_at: datetime


class EvaluatorInputInterface(BaseModel):
inputs: Dict[str, Any] = Field(default_factory=dict)
settings: Optional[Dict[str, Any]] = None
credentials: Optional[Dict[str, Any]] = None


class EvaluatorOutputInterface(BaseModel):
outputs: Dict[str, Any]


class EvaluatorMappingInputInterface(BaseModel):
inputs: Dict[str, Any]
mapping: Dict[str, Any]


class EvaluatorMappingOutputInterface(BaseModel):
outputs: Dict[str, Any]


class SimpleEvaluationOutput(BaseModel):
id: str
variant_ids: List[str]
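
The four interface models added above define the request/response shapes for the new evaluator debugging endpoints. A minimal sketch of how a client of these models might populate them — the concrete field values and the dotted mapping syntax are illustrative assumptions, not taken from this PR:

```python
from agenta_backend.models.api.evaluation_model import (
    EvaluatorInputInterface,
    EvaluatorMappingInputInterface,
)

# Payload for running a single evaluator: raw inputs plus optional
# evaluator settings and LLM provider credentials.
run_payload = EvaluatorInputInterface(
    inputs={"prediction": "Paris", "ground_truth": "Paris"},  # illustrative keys
    settings={"case_sensitive": False},                       # evaluator-specific, optional
    credentials={"OPENAI_API_KEY": "sk-..."},                 # only for LLM-backed evaluators
)

# Payload for mapping an experiment data tree onto the flat inputs
# an evaluator understands.
map_payload = EvaluatorMappingInputInterface(
    inputs={"trace": {"outputs": {"answer": "Paris"}}},
    mapping={"prediction": "trace.outputs.answer"},           # assumed dotted-path syntax
)
```
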
51 changes: 40 additions & 11 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -29,18 +29,10 @@
"name": "Exact Match",
"key": "auto_exact_match",
"direct_use": True,
"settings_template": {
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"settings_template": {},
"description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.",
"oss": True,
"tags": ["functional"],
},
{
"name": "Contains JSON",
@@ -49,6 +41,7 @@
"settings_template": {},
"description": "'Contains JSON' evaluator checks if the output contains the a valid JSON.",
"oss": True,
"tags": ["functional", "classifiers"],
},
{
"name": "Similarity Match",
@@ -75,11 +68,13 @@
},
"description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
"oss": True,
"tags": ["similarity", "functional"],
},
{
"name": "Semantic Similarity Match",
"key": "auto_semantic_similarity",
"direct_use": False,
"requires_llm_api_keys": True,
"description": "Semantic Similarity Match evaluator measures the similarity between two pieces of text by analyzing their meaning and context. It compares the semantic content, providing a score that reflects how closely the texts match in terms of meaning, rather than just exact word matches.",
"settings_template": {
"correct_answer_key": {
@@ -92,6 +87,7 @@
},
},
"oss": True,
"tags": ["similarity", "ai_llm"],
},
{
"name": "Regex Test",
@@ -114,6 +110,7 @@
},
},
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "JSON Field Match",
@@ -138,6 +135,7 @@
},
"description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
"oss": True,
"tags": ["functional"],
},
{
"name": "JSON Diff Match",
@@ -176,11 +174,13 @@
},
},
"oss": True,
"tags": ["similarity", "functional"],
},
{
"name": "LLM-as-a-judge",
"key": "auto_ai_critique",
"direct_use": False,
"requires_llm_api_keys": True,
"settings_template": {
"prompt_template": {
"label": "Prompt Template",
@@ -200,16 +200,25 @@
},
"description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).",
"oss": True,
"tags": ["ai_llm", "functional"],
},
{
"name": "Code Evaluation",
"key": "auto_custom_code_run",
"direct_use": False,
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
"type": "boolean",
"required": True,
"default": False,
"advanced": True,
"description": "Indicates whether the evaluation requires LLM API key(s) to function.",
},
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n datapoint: Dict[str, str] # contains the testset row \n) -> float:\n if output in datapoint.get('correct_answer', None):\n return 1.0\n else:\n return 0.0\n",
"default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
},
@@ -224,12 +233,21 @@
},
"description": "Code Evaluation allows you to write your own evaluator in Python. You need to provide the Python code for the evaluator.",
"oss": True,
"tags": ["functional"],
},
{
"name": "Webhook test",
"key": "auto_webhook_test",
"direct_use": False,
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
"type": "boolean",
"required": True,
"default": False,
"advanced": True,
"description": "Indicates whether the evaluation requires LLM API key(s) to function.",
},
"webhook_url": {
"label": "Webhook URL",
"type": "string",
@@ -247,6 +265,7 @@
},
"description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response, in JSON format, indicating the correctness of the answer, along with a 200 HTTP status. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
"oss": True,
"tags": ["functional"],
},
{
"name": "Starts With",
@@ -268,6 +287,7 @@
},
"description": "Starts With evaluator checks if the output starts with a specified prefix, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Ends With",
@@ -289,6 +309,7 @@
},
"description": "Ends With evaluator checks if the output ends with a specified suffix, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Contains",
@@ -310,6 +331,7 @@
},
"description": "Contains evaluator checks if the output contains a specified substring, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Contains Any",
@@ -331,6 +353,7 @@
},
"description": "Contains Any evaluator checks if the output contains any of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Contains All",
@@ -352,6 +375,7 @@
},
"description": "Contains All evaluator checks if the output contains all of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Levenshtein Distance",
@@ -375,20 +399,25 @@
},
"description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
"oss": True,
"tags": ["functional"],
},
{
"name": "RAG Faithfulness",
"key": "rag_faithfulness",
"direct_use": False,
"requires_llm_api_keys": True,
"settings_template": rag_evaluator_settings_template,
"description": "RAG Faithfulness evaluator assesses the accuracy and reliability of responses generated by Retrieval-Augmented Generation (RAG) models. It evaluates how faithfully the responses adhere to the retrieved documents or sources, ensuring that the generated text accurately reflects the information from the original sources.",
"tags": ["rag"],
},
{
"name": "RAG Context Relevancy",
"key": "rag_context_relevancy",
"direct_use": False,
"requires_llm_api_keys": True,
"settings_template": rag_evaluator_settings_template,
"description": "RAG Context Relevancy evaluator measures how relevant the retrieved documents or contexts are to the given question or prompt. It ensures that the selected documents provide the necessary information for generating accurate and meaningful responses, improving the overall quality of the RAG model's output.",
"tags": ["rag"],
},
]

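
Each built-in evaluator above now carries a tags list and, where relevant, a top-level requires_llm_api_keys flag (the code and webhook evaluators expose the flag inside their settings_template instead). A small, hypothetical helper sketch showing how that metadata could be filtered — these functions are not part of the PR:

```python
from typing import Any, Dict, List

def evaluators_requiring_llm_keys(evaluators: List[Dict[str, Any]]) -> List[str]:
    """Keys of evaluators that declare a top-level LLM API key requirement."""
    return [e["key"] for e in evaluators if e.get("requires_llm_api_keys", False)]

def evaluators_by_tag(evaluators: List[Dict[str, Any]], tag: str) -> List[str]:
    """Keys of evaluators carrying a given tag (e.g. 'rag' or 'classifiers')."""
    return [e["key"] for e in evaluators if tag in e.get("tags", [])]
```

Applied to the registry above, evaluators_requiring_llm_keys would return auto_semantic_similarity, auto_ai_critique, rag_faithfulness, and rag_context_relevancy.
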
13 changes: 6 additions & 7 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -5,6 +5,7 @@
from fastapi.responses import JSONResponse
from fastapi import HTTPException, Request, status, Response, Query

from agenta_backend.services import helpers
from agenta_backend.models import converters
from agenta_backend.tasks.evaluations import evaluate
from agenta_backend.utils.common import APIRouter, isCloudEE
@@ -14,9 +15,6 @@
NewEvaluation,
DeleteEvaluation,
)
from agenta_backend.services.evaluator_manager import (
check_ai_critique_inputs,
)
from agenta_backend.services import evaluation_service, db_manager, app_manager

if isCloudEE():
@@ -112,8 +110,9 @@ async def create_evaluation(
status_code=403,
)

success, response = await check_ai_critique_inputs(
payload.evaluators_configs, payload.lm_providers_keys
llm_provider_keys = helpers.format_llm_provider_keys(payload.lm_providers_keys)
success, response = await helpers.ensure_required_llm_keys_exist(
payload.evaluators_configs, llm_provider_keys
)
if not success:
return response
@@ -134,8 +133,8 @@ async def create_evaluation(
evaluators_config_ids=payload.evaluators_configs,
testset_id=payload.testset_id,
evaluation_id=evaluation.id,
rate_limit_config=payload.rate_limit.dict(),
lm_providers_keys=payload.lm_providers_keys,
rate_limit_config=payload.rate_limit.model_dump(),
lm_providers_keys=llm_provider_keys,
)
evaluations.append(evaluation)

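
check_ai_critique_inputs is replaced by two helpers in agenta_backend.services.helpers, whose bodies are not shown in this diff excerpt. Judging only from the call sites above, a rough, self-contained approximation of the key check might look like the sketch below; the real ensure_required_llm_keys_exist is async, works on evaluator config IDs, and returns a response object, so treat this purely as a reading aid:

```python
from typing import Any, Dict, List, Optional, Tuple

def ensure_required_llm_keys_exist_sketch(
    selected_evaluators: List[Dict[str, Any]],
    llm_provider_keys: Dict[str, str],
) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """Fail fast when a selected evaluator needs an LLM API key but none was sent."""
    needs_key = any(e.get("requires_llm_api_keys", False) for e in selected_evaluators)
    if needs_key and not llm_provider_keys:
        return False, {"detail": "An LLM provider API key is required for the selected evaluators."}
    return True, None
```

The rate_limit.dict() → rate_limit.model_dump() change in the same hunk is the Pydantic v2 rename of the model serialization method.
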
69 changes: 68 additions & 1 deletion agenta-backend/agenta_backend/routers/evaluators_router.py
@@ -1,17 +1,27 @@
import logging
import traceback

from typing import List
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse

from agenta_backend.utils.common import APIRouter, isCloudEE
from agenta_backend.services import evaluator_manager, db_manager, app_manager
from agenta_backend.services import (
evaluator_manager,
db_manager,
evaluators_service,
app_manager,
)

from agenta_backend.models.api.evaluation_model import (
Evaluator,
EvaluatorConfig,
NewEvaluatorConfig,
UpdateEvaluatorConfig,
EvaluatorInputInterface,
EvaluatorOutputInterface,
EvaluatorMappingInputInterface,
EvaluatorMappingOutputInterface,
)

if isCloudEE():
@@ -47,6 +57,63 @@ async def get_evaluators_endpoint():
raise HTTPException(status_code=500, detail=str(e))


@router.post("/map/", response_model=EvaluatorMappingOutputInterface)
async def evaluator_data_map(request: Request, payload: EvaluatorMappingInputInterface):
"""Endpoint to map the experiment data tree to evaluator interface.
Args:
request (Request): The request object.
payload (EvaluatorMappingInputInterface): The payload containing the request data.
Returns:
EvaluatorMappingOutputInterface: the evaluator mapping output object
"""

try:
mapped_outputs = await evaluators_service.map(mapping_input=payload)
return mapped_outputs
except Exception as e:
logger.error(f"Error mapping data tree: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"message": "Error mapping data tree",
"stacktrace": traceback.format_exc(),
},
)


@router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface)
async def evaluator_run(
request: Request, evaluator_key: str, payload: EvaluatorInputInterface
):
"""Endpoint to evaluate LLM app run
Args:
request (Request): The request object.
evaluator_key (str): The key of the evaluator.
payload (EvaluatorInputInterface): The payload containing the request data.
Returns:
result: EvaluatorOutputInterface object containing the outputs.
"""

try:
result = await evaluators_service.run(
evaluator_key=evaluator_key, evaluator_input=payload
)
return result
except Exception as e:
logger.error(f"Error while running {evaluator_key} evaluator: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"message": f"Error while running {evaluator_key} evaluator",
"stacktrace": traceback.format_exc(),
},
)


@router.get("/configs/", response_model=List[EvaluatorConfig])
async def get_evaluator_configs(app_id: str, request: Request):
"""Endpoint to fetch evaluator configurations for a specific app.
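
Together, the new /map/ and /{evaluator_key}/run/ routes let you debug a single evaluator outside of a full evaluation run. A hedged usage example with httpx — the base URL, route prefix, evaluator key, and payload field names are assumptions for illustration and depend on your deployment:

```python
import httpx

BASE_URL = "http://localhost/api/evaluators"  # assumed API base path

# 1) Map an experiment/trace data tree onto the flat inputs an evaluator expects.
map_response = httpx.post(
    f"{BASE_URL}/map/",
    json={
        "inputs": {"trace": {"outputs": {"answer": "Paris"}}},
        "mapping": {"prediction": "trace.outputs.answer"},  # assumed dotted-path syntax
    },
)
map_response.raise_for_status()
mapped_outputs = map_response.json()["outputs"]

# 2) Run one evaluator on the mapped inputs; add "settings"/"credentials" if it needs them.
run_response = httpx.post(
    f"{BASE_URL}/auto_exact_match/run/",
    json={"inputs": {**mapped_outputs, "ground_truth": "Paris"}},
)
run_response.raise_for_status()
print(run_response.json()["outputs"])
```
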

