
Commit

Merge pull request #2071 from Agenta-AI/AGE-587/-implement-evaluation-main-page

[feature] Evaluators - Debugging
mmabrouk authored Sep 23, 2024
2 parents 0ef9e56 + 10797f4 commit 7ad6d7c
Showing 72 changed files with 4,685 additions and 2,842 deletions.
25 changes: 24 additions & 1 deletion agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -1,7 +1,9 @@
from enum import Enum
from datetime import datetime
from pydantic import BaseModel
from typing import Optional, List, Dict, Any, Union

from pydantic import BaseModel, Field, model_validator

from agenta_backend.models.api.api_models import Result


@@ -12,6 +14,8 @@ class Evaluator(BaseModel):
settings_template: dict
description: Optional[str] = None
oss: Optional[bool] = False
requires_llm_api_keys: Optional[bool] = False
tags: List[str]


class EvaluatorConfig(BaseModel):
@@ -80,6 +84,25 @@ class Evaluation(BaseModel):
updated_at: datetime


class EvaluatorInputInterface(BaseModel):
inputs: Dict[str, Any] = Field(default_factory=dict)
settings: Optional[Dict[str, Any]] = None
credentials: Optional[Dict[str, Any]] = None


class EvaluatorOutputInterface(BaseModel):
outputs: Dict[str, Any]


class EvaluatorMappingInputInterface(BaseModel):
inputs: Dict[str, Any]
mapping: Dict[str, Any]


class EvaluatorMappingOutputInterface(BaseModel):
outputs: Dict[str, Any]


class SimpleEvaluationOutput(BaseModel):
id: str
variant_ids: List[str]
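
The four interface models added above define the request/response shapes for the new evaluator debugging endpoints. A minimal sketch of how a client of these models might populate them — the concrete field values and the dotted mapping syntax are illustrative assumptions, not taken from this PR:

```python
from agenta_backend.models.api.evaluation_model import (
    EvaluatorInputInterface,
    EvaluatorMappingInputInterface,
)

# Payload for running a single evaluator: raw inputs plus optional
# evaluator settings and LLM provider credentials.
run_payload = EvaluatorInputInterface(
    inputs={"prediction": "Paris", "ground_truth": "Paris"},  # illustrative keys
    settings={"case_sensitive": False},                       # evaluator-specific, optional
    credentials={"OPENAI_API_KEY": "sk-..."},                 # only for LLM-backed evaluators
)

# Payload for mapping an experiment data tree onto the flat inputs
# an evaluator understands.
map_payload = EvaluatorMappingInputInterface(
    inputs={"trace": {"outputs": {"answer": "Paris"}}},
    mapping={"prediction": "trace.outputs.answer"},           # assumed dotted-path syntax
)
```
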
51 changes: 40 additions & 11 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
@@ -29,18 +29,10 @@
"name": "Exact Match",
"key": "auto_exact_match",
"direct_use": True,
"settings_template": {
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer",
},
},
"settings_template": {},
"description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.",
"oss": True,
"tags": ["functional"],
},
{
"name": "Contains JSON",
@@ -49,6 +41,7 @@
"settings_template": {},
"description": "'Contains JSON' evaluator checks if the output contains the a valid JSON.",
"oss": True,
"tags": ["functional", "classifiers"],
},
{
"name": "Similarity Match",
@@ -75,11 +68,13 @@
},
"description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.",
"oss": True,
"tags": ["similarity", "functional"],
},
{
"name": "Semantic Similarity Match",
"key": "auto_semantic_similarity",
"direct_use": False,
"requires_llm_api_keys": True,
"description": "Semantic Similarity Match evaluator measures the similarity between two pieces of text by analyzing their meaning and context. It compares the semantic content, providing a score that reflects how closely the texts match in terms of meaning, rather than just exact word matches.",
"settings_template": {
"correct_answer_key": {
@@ -92,6 +87,7 @@
},
},
"oss": True,
"tags": ["similarity", "ai_llm"],
},
{
"name": "Regex Test",
@@ -114,6 +110,7 @@
},
},
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "JSON Field Match",
@@ -138,6 +135,7 @@
},
"description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.",
"oss": True,
"tags": ["functional"],
},
{
"name": "JSON Diff Match",
@@ -176,11 +174,13 @@
},
},
"oss": True,
"tags": ["similarity", "functional"],
},
{
"name": "LLM-as-a-judge",
"key": "auto_ai_critique",
"direct_use": False,
"requires_llm_api_keys": True,
"settings_template": {
"prompt_template": {
"label": "Prompt Template",
@@ -200,16 +200,25 @@
},
"description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).",
"oss": True,
"tags": ["ai_llm", "functional"],
},
{
"name": "Code Evaluation",
"key": "auto_custom_code_run",
"direct_use": False,
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
"type": "boolean",
"required": True,
"default": False,
"advanced": True,
"description": "Indicates whether the evaluation requires LLM API key(s) to function.",
},
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n datapoint: Dict[str, str] # contains the testset row \n) -> float:\n if output in datapoint.get('correct_answer', None):\n return 1.0\n else:\n return 0.0\n",
"default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
},
@@ -224,12 +233,21 @@
},
"description": "Code Evaluation allows you to write your own evaluator in Python. You need to provide the Python code for the evaluator.",
"oss": True,
"tags": ["functional"],
},
{
"name": "Webhook test",
"key": "auto_webhook_test",
"direct_use": False,
"settings_template": {
"requires_llm_api_keys": {
"label": "Requires LLM API Key(s)",
"type": "boolean",
"required": True,
"default": False,
"advanced": True,
"description": "Indicates whether the evaluation requires LLM API key(s) to function.",
},
"webhook_url": {
"label": "Webhook URL",
"type": "string",
@@ -247,6 +265,7 @@
},
"description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response, in JSON format, indicating the correctness of the answer, along with a 200 HTTP status. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.",
"oss": True,
"tags": ["functional"],
},
{
"name": "Starts With",
@@ -268,6 +287,7 @@
},
"description": "Starts With evaluator checks if the output starts with a specified prefix, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Ends With",
@@ -289,6 +309,7 @@
},
"description": "Ends With evaluator checks if the output ends with a specified suffix, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Contains",
@@ -310,6 +331,7 @@
},
"description": "Contains evaluator checks if the output contains a specified substring, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Contains Any",
@@ -331,6 +353,7 @@
},
"description": "Contains Any evaluator checks if the output contains any of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Contains All",
@@ -352,6 +375,7 @@
},
"description": "Contains All evaluator checks if the output contains all of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.",
"oss": True,
"tags": ["classifiers", "functional"],
},
{
"name": "Levenshtein Distance",
@@ -375,20 +399,25 @@
},
"description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.",
"oss": True,
"tags": ["functional"],
},
{
"name": "RAG Faithfulness",
"key": "rag_faithfulness",
"direct_use": False,
"requires_llm_api_keys": True,
"settings_template": rag_evaluator_settings_template,
"description": "RAG Faithfulness evaluator assesses the accuracy and reliability of responses generated by Retrieval-Augmented Generation (RAG) models. It evaluates how faithfully the responses adhere to the retrieved documents or sources, ensuring that the generated text accurately reflects the information from the original sources.",
"tags": ["rag"],
},
{
"name": "RAG Context Relevancy",
"key": "rag_context_relevancy",
"direct_use": False,
"requires_llm_api_keys": True,
"settings_template": rag_evaluator_settings_template,
"description": "RAG Context Relevancy evaluator measures how relevant the retrieved documents or contexts are to the given question or prompt. It ensures that the selected documents provide the necessary information for generating accurate and meaningful responses, improving the overall quality of the RAG model's output.",
"tags": ["rag"],
},
]

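
Each built-in evaluator above now carries a tags list and, where relevant, a top-level requires_llm_api_keys flag (the code and webhook evaluators expose the flag inside their settings_template instead). A small, hypothetical helper sketch showing how that metadata could be filtered — these functions are not part of the PR:

```python
from typing import Any, Dict, List

def evaluators_requiring_llm_keys(evaluators: List[Dict[str, Any]]) -> List[str]:
    """Keys of evaluators that declare a top-level LLM API key requirement."""
    return [e["key"] for e in evaluators if e.get("requires_llm_api_keys", False)]

def evaluators_by_tag(evaluators: List[Dict[str, Any]], tag: str) -> List[str]:
    """Keys of evaluators carrying a given tag (e.g. 'rag' or 'classifiers')."""
    return [e["key"] for e in evaluators if tag in e.get("tags", [])]
```

Applied to the registry above, evaluators_requiring_llm_keys would return auto_semantic_similarity, auto_ai_critique, rag_faithfulness, and rag_context_relevancy.
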
13 changes: 6 additions & 7 deletions agenta-backend/agenta_backend/routers/evaluation_router.py
@@ -5,6 +5,7 @@
from fastapi.responses import JSONResponse
from fastapi import HTTPException, Request, status, Response, Query

from agenta_backend.services import helpers
from agenta_backend.models import converters
from agenta_backend.tasks.evaluations import evaluate
from agenta_backend.utils.common import APIRouter, isCloudEE
@@ -14,9 +15,6 @@
NewEvaluation,
DeleteEvaluation,
)
from agenta_backend.services.evaluator_manager import (
check_ai_critique_inputs,
)
from agenta_backend.services import evaluation_service, db_manager, app_manager

if isCloudEE():
@@ -112,8 +110,9 @@ async def create_evaluation(
status_code=403,
)

success, response = await check_ai_critique_inputs(
payload.evaluators_configs, payload.lm_providers_keys
llm_provider_keys = helpers.format_llm_provider_keys(payload.lm_providers_keys)
success, response = await helpers.ensure_required_llm_keys_exist(
payload.evaluators_configs, llm_provider_keys
)
if not success:
return response
@@ -134,8 +133,8 @@ async def create_evaluation(
evaluators_config_ids=payload.evaluators_configs,
testset_id=payload.testset_id,
evaluation_id=evaluation.id,
rate_limit_config=payload.rate_limit.dict(),
lm_providers_keys=payload.lm_providers_keys,
rate_limit_config=payload.rate_limit.model_dump(),
lm_providers_keys=llm_provider_keys,
)
evaluations.append(evaluation)

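
check_ai_critique_inputs is replaced by two helpers in agenta_backend.services.helpers, whose bodies are not shown in this diff excerpt. Judging only from the call sites above, a rough, self-contained approximation of the key check might look like the sketch below; the real ensure_required_llm_keys_exist is async, works on evaluator config IDs, and returns a response object, so treat this purely as a reading aid:

```python
from typing import Any, Dict, List, Optional, Tuple

def ensure_required_llm_keys_exist_sketch(
    selected_evaluators: List[Dict[str, Any]],
    llm_provider_keys: Dict[str, str],
) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """Fail fast when a selected evaluator needs an LLM API key but none was sent."""
    needs_key = any(e.get("requires_llm_api_keys", False) for e in selected_evaluators)
    if needs_key and not llm_provider_keys:
        return False, {"detail": "An LLM provider API key is required for the selected evaluators."}
    return True, None
```

The rate_limit.dict() → rate_limit.model_dump() change in the same hunk is the Pydantic v2 rename of the model serialization method.
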
69 changes: 68 additions & 1 deletion agenta-backend/agenta_backend/routers/evaluators_router.py
@@ -1,17 +1,27 @@
import logging
import traceback

from typing import List
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse

from agenta_backend.utils.common import APIRouter, isCloudEE
from agenta_backend.services import evaluator_manager, db_manager, app_manager
from agenta_backend.services import (
evaluator_manager,
db_manager,
evaluators_service,
app_manager,
)

from agenta_backend.models.api.evaluation_model import (
Evaluator,
EvaluatorConfig,
NewEvaluatorConfig,
UpdateEvaluatorConfig,
EvaluatorInputInterface,
EvaluatorOutputInterface,
EvaluatorMappingInputInterface,
EvaluatorMappingOutputInterface,
)

if isCloudEE():
@@ -47,6 +57,63 @@ async def get_evaluators_endpoint():
raise HTTPException(status_code=500, detail=str(e))


@router.post("/map/", response_model=EvaluatorMappingOutputInterface)
async def evaluator_data_map(request: Request, payload: EvaluatorMappingInputInterface):
"""Endpoint to map the experiment data tree to evaluator interface.
Args:
request (Request): The request object.
payload (EvaluatorMappingInputInterface): The payload containing the request data.
Returns:
EvaluatorMappingOutputInterface: the evaluator mapping output object
"""

try:
mapped_outputs = await evaluators_service.map(mapping_input=payload)
return mapped_outputs
except Exception as e:
logger.error(f"Error mapping data tree: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"message": "Error mapping data tree",
"stacktrace": traceback.format_exc(),
},
)


@router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface)
async def evaluator_run(
request: Request, evaluator_key: str, payload: EvaluatorInputInterface
):
"""Endpoint to evaluate LLM app run
Args:
request (Request): The request object.
evaluator_key (str): The key of the evaluator.
payload (EvaluatorInputInterface): The payload containing the request data.
Returns:
result: EvaluatorOutputInterface object containing the outputs.
"""

try:
result = await evaluators_service.run(
evaluator_key=evaluator_key, evaluator_input=payload
)
return result
except Exception as e:
logger.error(f"Error while running {evaluator_key} evaluator: {str(e)}")
raise HTTPException(
status_code=500,
detail={
"message": f"Error while running {evaluator_key} evaluator",
"stacktrace": traceback.format_exc(),
},
)


@router.get("/configs/", response_model=List[EvaluatorConfig])
async def get_evaluator_configs(app_id: str, request: Request):
"""Endpoint to fetch evaluator configurations for a specific app.
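
Together, the new /map/ and /{evaluator_key}/run/ routes let you debug a single evaluator outside of a full evaluation run. A hedged usage example with httpx — the base URL, route prefix, evaluator key, and payload field names are assumptions for illustration and depend on your deployment:

```python
import httpx

BASE_URL = "http://localhost/api/evaluators"  # assumed API base path

# 1) Map an experiment/trace data tree onto the flat inputs an evaluator expects.
map_response = httpx.post(
    f"{BASE_URL}/map/",
    json={
        "inputs": {"trace": {"outputs": {"answer": "Paris"}}},
        "mapping": {"prediction": "trace.outputs.answer"},  # assumed dotted-path syntax
    },
)
map_response.raise_for_status()
mapped_outputs = map_response.json()["outputs"]

# 2) Run one evaluator on the mapped inputs; add "settings"/"credentials" if it needs them.
run_response = httpx.post(
    f"{BASE_URL}/auto_exact_match/run/",
    json={"inputs": {**mapped_outputs, "ground_truth": "Paris"}},
)
run_response.raise_for_status()
print(run_response.json()["outputs"])
```
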

