From 5c5ad15caec1bebd09e43477e8e1d1ef4aabcb28 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 1 Aug 2024 21:26:52 +0100 Subject: [PATCH 001/149] feat (backend): add utility function to ensure event loop is retrieved or created --- .../agenta_backend/utils/event_loop_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 agenta-backend/agenta_backend/utils/event_loop_utils.py diff --git a/agenta-backend/agenta_backend/utils/event_loop_utils.py b/agenta-backend/agenta_backend/utils/event_loop_utils.py new file mode 100644 index 0000000000..e95a5c3fb9 --- /dev/null +++ b/agenta-backend/agenta_backend/utils/event_loop_utils.py @@ -0,0 +1,19 @@ +import asyncio + + +def ensure_event_loop() -> asyncio.AbstractEventLoop: + """ + Ensure that there is an event loop available in the current thread. + If there isn't one, create a new event loop and set it. + + Returns: + asyncio.AbstractEventLoop: The event loop for the current thread. + """ + + try: + loop = asyncio.get_event_loop() + except RuntimeError as e: + if "There is no current event loop in thread" in str(e): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop From 60064efb506c117d911a053eb53f201a2cbf0838 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 1 Aug 2024 21:31:30 +0100 Subject: [PATCH 002/149] feat (backend): add endpoint to run evaluation on a specific evaluator --- .../models/api/evaluation_model.py | 9 ++++++ .../routers/evaluators_router.py | 31 ++++++++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 024a92de3c..e241dadf58 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -79,6 +79,15 @@ class Evaluation(BaseModel): updated_at: datetime +class VariantEvaluation(BaseModel): + output: Any + data_point: Any + settings_values: Dict[str, Any] + variant_parameters: Dict[str, Any] + data_point: Dict[str, Any] + llm_provider_keys: Dict[str, Any] + + class SimpleEvaluationOutput(BaseModel): id: str variant_ids: List[str] diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 932bc5a6f8..b74e74b1d1 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -5,11 +5,13 @@ from fastapi.responses import JSONResponse from agenta_backend.utils.common import APIRouter, isCloudEE -from agenta_backend.services import evaluator_manager, db_manager +from agenta_backend.services import evaluator_manager, db_manager, evaluators_service from agenta_backend.models.api.evaluation_model import ( + Result, Evaluator, EvaluatorConfig, + VariantEvaluation, NewEvaluatorConfig, UpdateEvaluatorConfig, ) @@ -47,6 +49,33 @@ async def get_evaluators_endpoint(): raise HTTPException(status_code=500, detail=str(e)) +@router.post("/{evaluator_key}/evaluate/", response_model=Result) +def evaluator_evaluate( + request: Request, evaluator_key: str, payload: VariantEvaluation +): + """Endpoint to evaluate LLM app run + + Args: + request (Request): The request object. + evaluator_key (str): The key of the evaluator. + payload (VariantEvaluation): The payload containing request data. + + Returns: + result: Result object containing the type, value, or error. 
+ """ + + result = evaluators_service.evaluate( + evaluator_key=evaluator_key, + output=payload.output, + data_point=payload.data_point, + settings_values=payload.settings_values, + app_params=payload.variant_parameters, + inputs=payload.data_point, + lm_providers_keys=payload.llm_provider_keys, + ) + return result + + @router.get("/configs/", response_model=List[EvaluatorConfig]) async def get_evaluator_configs(app_id: str, request: Request): """Endpoint to fetch evaluator configurations for a specific app. From 0a66e265bcf165e9ef842393bbb20758cc3e82d1 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 1 Aug 2024 21:32:07 +0100 Subject: [PATCH 003/149] refactor (backend): make use of ensure_event_loop utility function --- agenta-backend/agenta_backend/services/evaluators_service.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index bba32bf61a..66b9e30e6e 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -1,7 +1,6 @@ import re import json import logging -import asyncio import traceback from typing import Any, Dict, Union @@ -12,6 +11,8 @@ from agenta_backend.services.security import sandbox from agenta_backend.models.shared_models import Error, Result +from agenta_backend.utils.event_loop_utils import ensure_event_loop + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -744,7 +745,7 @@ def auto_semantic_similarity( lm_providers_keys: Dict[str, Any], ) -> Result: try: - loop = asyncio.get_event_loop() + loop = ensure_event_loop() openai_api_key = lm_providers_keys["OPENAI_API_KEY"] correct_answer = get_correct_answer(data_point, settings_values) From 9ba496f66bbad26e9156573d3c3ee08f9f7e12ce Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 2 Aug 2024 14:49:18 +0100 Subject: [PATCH 004/149] docs (backend): improve docstring in ensure_event_loop function --- agenta-backend/agenta_backend/utils/event_loop_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agenta-backend/agenta_backend/utils/event_loop_utils.py b/agenta-backend/agenta_backend/utils/event_loop_utils.py index e95a5c3fb9..47e5ab326c 100644 --- a/agenta-backend/agenta_backend/utils/event_loop_utils.py +++ b/agenta-backend/agenta_backend/utils/event_loop_utils.py @@ -6,6 +6,9 @@ def ensure_event_loop() -> asyncio.AbstractEventLoop: Ensure that there is an event loop available in the current thread. If there isn't one, create a new event loop and set it. + Raises: + RuntimeError: There is no current event loop in thread 'AnyIO worker thread'. + Returns: asyncio.AbstractEventLoop: The event loop for the current thread. 
""" From bd9b2d31e6b7f11da2131c7a42d8405b883138f5 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 4 Aug 2024 12:28:45 +0100 Subject: [PATCH 005/149] minor refactor (build): replace use of 'docker-compose' to 'docker compose' --- .github/workflows/run-backend-tests.yml | 2 +- .github/workflows/run-frontend-tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-backend-tests.yml b/.github/workflows/run-backend-tests.yml index 1ecf977642..a76dc51ad8 100644 --- a/.github/workflows/run-backend-tests.yml +++ b/.github/workflows/run-backend-tests.yml @@ -27,7 +27,7 @@ jobs: run: sudo apt install curl -y - name: Start Docker Compose - run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker-compose -f "docker-compose.test.yml" up -d --build + run: OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker compose -f "docker-compose.test.yml" up -d --build - name: Wait for Backend Service run: | diff --git a/.github/workflows/run-frontend-tests.yml b/.github/workflows/run-frontend-tests.yml index 47bcb1342f..ade6318314 100644 --- a/.github/workflows/run-frontend-tests.yml +++ b/.github/workflows/run-frontend-tests.yml @@ -33,7 +33,7 @@ jobs: NEXT_PUBLIC_OPENAI_API_KEY: ${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} run: | sudo apt install curl -y - OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker-compose -f "docker-compose.test.yml" up -d --build + OPENAI_API_KEY=${{ secrets.NEXT_PUBLIC_OPENAI_API_KEY }} ENVIRONMENT=github docker compose -f "docker-compose.test.yml" up -d --build - name: Restart Backend Service To Fetch Template(s) run: docker container restart agenta-backend-test From 3f6b507ffecb38724db436f5bf76af2f14b0129b Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 8 Aug 2024 21:44:06 +0100 Subject: [PATCH 006/149] feat (backend): created evaluator mapping and input interfaces --- .../models/api/evaluation_model.py | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 45591bfd3e..f7cc7489f0 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -1,7 +1,9 @@ from enum import Enum from datetime import datetime -from pydantic import BaseModel from typing import Optional, List, Dict, Any, Union + +from pydantic import BaseModel, Field, model_validator + from agenta_backend.models.api.api_models import Result @@ -80,13 +82,39 @@ class Evaluation(BaseModel): updated_at: datetime -class VariantEvaluation(BaseModel): - output: Any - data_point: Any - settings_values: Dict[str, Any] - variant_parameters: Dict[str, Any] - data_point: Dict[str, Any] - llm_provider_keys: Dict[str, Any] +class SimpleEvaluatorMappingInterface(BaseModel): + ground_truth: Any + prediction: Any + + +class RagEvaluatorMappingInterface(BaseModel): + question: Any + contexts: Any + answer: Any + + +class EvaluationSettingsInterface(BaseModel): + llm_provider: str + llm_api_key: str + + +class EvaluatorInputInterface(BaseModel): + inputs: Dict[str, Any] = Field(default_factory=dict) + settings: Optional[Dict[str, Any]] = None + credentials: Optional[Dict[str, Any]] = None + + +class EvaluatorOutputInterface(BaseModel): + outputs: Dict[str, Any] + + +class EvaluatorMappingInputInterface(BaseModel): + inputs: Dict[str, Any] + mapping: Dict[str, Any] + + 
+class EvaluatorMappingOutputInterface(BaseModel): + outputs: Dict[str, Any] class SimpleEvaluationOutput(BaseModel): From 4a1604d59696bd58b9c326a4dad1231d3503e127 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 8 Aug 2024 21:44:58 +0100 Subject: [PATCH 007/149] feat (backend): implemented endpoints to map experiment data tree to evaluator interface and modified endpoint to evaluate llm app run --- .../routers/evaluators_router.py | 75 ++++++++++++++----- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index b74e74b1d1..676c68b5f6 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -1,19 +1,26 @@ import logging +import traceback from typing import List from fastapi import HTTPException, Request from fastapi.responses import JSONResponse from agenta_backend.utils.common import APIRouter, isCloudEE -from agenta_backend.services import evaluator_manager, db_manager, evaluators_service +from agenta_backend.services import ( + evaluator_manager, + db_manager, + evaluators_service, +) from agenta_backend.models.api.evaluation_model import ( - Result, Evaluator, EvaluatorConfig, - VariantEvaluation, NewEvaluatorConfig, UpdateEvaluatorConfig, + EvaluatorInputInterface, + EvaluatorOutputInterface, + EvaluatorMappingInputInterface, + EvaluatorMappingOutputInterface, ) if isCloudEE(): @@ -49,31 +56,63 @@ async def get_evaluators_endpoint(): raise HTTPException(status_code=500, detail=str(e)) -@router.post("/{evaluator_key}/evaluate/", response_model=Result) -def evaluator_evaluate( - request: Request, evaluator_key: str, payload: VariantEvaluation +@router.post("/map/", response_model=EvaluatorMappingOutputInterface) +async def evaluator_data_map( + request: Request, payload: EvaluatorMappingInputInterface +): + """Endpoint to map the experiment data tree to evaluator interface. + + Args: + request (Request): The request object. + payload (EvaluatorMappingInputInterface): The payload containing the request data. + + Returns: + EvaluatorMappingOutputInterface: the evaluator mapping output object + """ + + try: + mapped_outputs = evaluators_service.map(mapping_input=payload) + return mapped_outputs + except Exception as e: + logger.error(f"Error mapping data tree: {str(e)}") + raise HTTPException( + status_code=500, + detail={ + "message": "Error mapping data tree", + "stacktrace": traceback.format_exc(), + }, + ) + + +@router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface) +def evaluator_run( + request: Request, evaluator_key: str, payload: EvaluatorInputInterface ): """Endpoint to evaluate LLM app run Args: request (Request): The request object. evaluator_key (str): The key of the evaluator. - payload (VariantEvaluation): The payload containing request data. + payload (EvaluatorInputInterface): The payload containing the request data. Returns: - result: Result object containing the type, value, or error. + result: EvaluatorOutputInterface object containing the outputs. 
""" - result = evaluators_service.evaluate( - evaluator_key=evaluator_key, - output=payload.output, - data_point=payload.data_point, - settings_values=payload.settings_values, - app_params=payload.variant_parameters, - inputs=payload.data_point, - lm_providers_keys=payload.llm_provider_keys, - ) - return result + try: + result = evaluators_service.run( + evaluator_key=evaluator_key, evaluator_input=payload + ) + return result + except Exception as e: + logger.error(f"Error while running evaluator: {str(e)}") + raise HTTPException( + status_code=500, + detail={ + "message": "Error while running evaluator", + "stacktrace": traceback.format_exc(), + }, + ) @router.get("/configs/", response_model=List[EvaluatorConfig]) From 55c727e8ce843f8957d5ae35d06a579616112f93 Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 8 Aug 2024 21:46:19 +0100 Subject: [PATCH 008/149] refactor (backend): update evaluator handlers to make use of new handlers --- .../services/evaluators_service.py | 566 ++++++++++++------ 1 file changed, 399 insertions(+), 167 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index efe594a0ee..9c6b57cb6d 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -14,6 +14,12 @@ from agenta_backend.services.security import sandbox from agenta_backend.models.shared_models import Error, Result from agenta_backend.utils.event_loop_utils import ensure_event_loop +from agenta_backend.models.api.evaluation_model import ( + EvaluatorInputInterface, + EvaluatorOutputInterface, + EvaluatorMappingInputInterface, + EvaluatorMappingOutputInterface, +) from agenta_backend.utils.traces import process_distributed_trace_into_trace_tree @@ -21,6 +27,63 @@ logger.setLevel(logging.DEBUG) +def map( + mapping_input: EvaluatorMappingInputInterface, +) -> EvaluatorMappingOutputInterface: + """ + Maps the evaluator inputs based on the provided mapping and data tree. + + Returns: + EvaluatorMappingOutputInterface: A dictionary containing the mapped evaluator inputs. + """ + + def get_nested_value(data: Dict[str, Any], key: str) -> Any: + """ + Retrieves the nested value from a dictionary based on a dotted key path, + where list indices can be included in square brackets. + + Args: + data (Dict[str, Any]): The data dictionary to retrieve the value from. + key (str): The key path to the desired value, with possible list indices. + + Returns: + Any: The value found at the specified key path, or None if not found. + + Example: + >>> data = { + ... 'rag': { + ... 'summarizer': [{'outputs': {'report': 'The answer is 42'}}] + ... } + ... 
} + >>> key = 'rag.summarizer[0].outputs.report' + >>> get_nested_value(data, key) + 'The answer is 42' + """ + + pattern = re.compile(r"([^\[\].]+|\[\d+\])") + keys = pattern.findall(key) + + for k in keys: + if k.startswith("[") and k.endswith("]"): + # Convert list index from '[index]' to integer + k = int(k[1:-1]) + if isinstance(data, list): + data = data[k] if 0 <= k < len(data) else None + else: + return None + else: + if isinstance(data, dict): + data = data.get(k, None) + else: + return None + return data + + mapping_outputs = {} + for to_key, from_key in mapping_input.mapping.items(): + mapping_outputs[to_key] = get_nested_value(mapping_input.inputs, from_key) + return {"outputs": mapping_outputs} + + def get_correct_answer( data_point: Dict[str, Any], settings_values: Dict[str, Any] ) -> Any: @@ -139,8 +202,9 @@ def auto_exact_match( """ try: correct_answer = get_correct_answer(data_point, settings_values) - exact_match = True if output == correct_answer else False - result = Result(type="bool", value=exact_match) + inputs = {"ground_truth": correct_answer, "prediction": output} + response = exact_match(input=EvaluatorInputInterface(**{"inputs": inputs})) + result = Result(type="bool", value=response["outputs"]["success"]) return result except ValueError as e: return Result( @@ -161,6 +225,13 @@ def auto_exact_match( ) +def exact_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + prediction = input.inputs.get("prediction", "") + ground_truth = input.inputs.get("ground_truth", "") + success = True if prediction == ground_truth else False + return {"outputs": {"success": success}} + + def auto_regex_test( inputs: Dict[str, Any], # pylint: disable=unused-argument output: str, @@ -170,11 +241,13 @@ def auto_regex_test( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - re_pattern = re.compile(settings_values["regex_pattern"], re.IGNORECASE) - result = ( - bool(re_pattern.search(output)) == settings_values["regex_should_match"] + inputs = {"ground_truth": data_point, "prediction": output} + response = regex_test( + input=EvaluatorInputInterface( + **{"inputs": inputs, "settings": settings_values} + ) ) - return Result(type="bool", value=result) + return Result(type="bool", value=response["outputs"]["success"]) except Exception as e: # pylint: disable=broad-except return Result( type="error", @@ -186,7 +259,16 @@ def auto_regex_test( ) -def field_match_test( +def regex_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + pattern = re.compile(input.settings["regex_pattern"], re.IGNORECASE) + result = ( + bool(pattern.search(input.inputs["prediction"])) + == input.settings["regex_should_match"] + ) + return {"outputs": {"success": result}} + + +def auto_field_match_test( inputs: Dict[str, Any], # pylint: disable=unused-argument output: str, data_point: Dict[str, Any], @@ -196,9 +278,9 @@ def field_match_test( ) -> Result: try: correct_answer = get_correct_answer(data_point, settings_values) - output_json = json.loads(output) - result = output_json[settings_values["json_field"]] == correct_answer - return Result(type="bool", value=result) + inputs = {"ground_truth": correct_answer, "prediction": output} + response = field_match_test(input=EvaluatorInputInterface(**{"inputs": inputs})) + return Result(type="bool", value=response["outputs"]["success"]) except ValueError as e: return Result( type="error", @@ -212,6 +294,12 @@ def field_match_test( return Result(type="bool", value=False) +def field_match_test(input: 
EvaluatorInputInterface) -> EvaluatorOutputInterface: + prediction_json = json.loads(input.inputs["prediction"]) + result = prediction_json == input.inputs["ground_truth"] + return {"outputs": {"success": result}} + + def auto_webhook_test( inputs: Dict[str, Any], output: str, @@ -222,34 +310,13 @@ def auto_webhook_test( ) -> Result: try: correct_answer = get_correct_answer(data_point, settings_values) - - with httpx.Client() as client: - payload = { - "correct_answer": correct_answer, - "output": output, - "inputs": inputs, - } - response = client.post(url=settings_values["webhook_url"], json=payload) - response.raise_for_status() - response_data = response.json() - score = response_data.get("score", None) - if score is None and not isinstance(score, (int, float)): - return Result( - type="error", - value=None, - error=Error( - message="Error during Auto Webhook evaluation; Webhook did not return a score", - ), - ) - if score < 0 or score > 1: - return Result( - type="error", - value=None, - error=Error( - message="Error during Auto Webhook evaluation; Webhook returned an invalid score. Score must be between 0 and 1", - ), - ) - return Result(type="number", value=score) + inputs = {"prediction": output, "ground_truth": correct_answer} + response = webhook_test( + input=EvaluatorInputInterface( + **{"inputs": inputs, "settings": settings_values} + ) + ) + return Result(type="number", value=response["outputs"]["score"]) except httpx.HTTPError as e: return Result( type="error", @@ -279,6 +346,20 @@ def auto_webhook_test( ) +def webhook_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + with httpx.Client() as client: + payload = { + "correct_answer": input.inputs["ground_truth"], + "output": input.inputs["prediction"], + "inputs": input.inputs, + } + response = client.post(url=input.settings["webhook_url"], json=payload) + response.raise_for_status() + response_data = response.json() + score = response_data.get("score", None) + return {"outputs": {"score": score}} + + def auto_custom_code_run( inputs: Dict[str, Any], output: str, @@ -288,17 +369,18 @@ def auto_custom_code_run( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - result = sandbox.execute_code_safely( - app_params=app_params, - inputs=inputs, - output=output, - correct_answer=data_point.get( - "correct_answer", None - ), # for backward compatibility - code=settings_values["code"], - datapoint=data_point, + correct_answer = get_correct_answer(data_point, settings_values) + inputs = { + "app_config": app_params, + "prediction": output, + "ground_truth": correct_answer, + } + response = custom_code_run( + input=EvaluatorInputInterface( + **{"inputs": inputs, "settings": {"code": settings_values["code"]}} + ) ) - return Result(type="number", value=result) + return Result(type="number", value=response["outputs"]["score"]) except Exception as e: # pylint: disable=broad-except return Result( type="error", @@ -310,6 +392,18 @@ def auto_custom_code_run( ) +def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + result = sandbox.execute_code_safely( + app_params=input.inputs["app_config"], + inputs=input.inputs, + output=input.inputs["prediction"], + correct_answer=input.inputs["ground_truth"], + code=input.settings["code"], + datapoint=input.inputs["ground_truth"], + ) + return {"outputs": {"score": result}} + + def auto_ai_critique( inputs: Dict[str, Any], output: str, @@ -334,30 +428,17 @@ def auto_ai_critique( """ try: correct_answer = 
get_correct_answer(data_point, settings_values) - openai_api_key = lm_providers_keys["OPENAI_API_KEY"] - - chain_run_args = { - "llm_app_prompt_template": app_params.get("prompt_user", ""), - "variant_output": output, - "correct_answer": correct_answer, + inputs = { + "prompt_user": app_params.get("prompt_user", ""), + "prediction": output, + "ground_truth": correct_answer, } - - for key, value in inputs.items(): - chain_run_args[key] = value - - prompt_template = settings_values["prompt_template"] - messages = [ - {"role": "system", "content": prompt_template}, - {"role": "user", "content": str(chain_run_args)}, - ] - - client = OpenAI(api_key=openai_api_key) - response = client.chat.completions.create( - model="gpt-3.5-turbo", messages=messages, temperature=0.8 + response = ai_critique( + input=EvaluatorInputInterface( + **{"inputs": inputs, "credentials": lm_providers_keys} + ) ) - - evaluation_output = response.choices[0].message.content.strip() - return Result(type="text", value=evaluation_output) + return Result(type="text", value=response["outputs"]["score"]) except Exception as e: # pylint: disable=broad-except return Result( type="error", @@ -369,6 +450,32 @@ def auto_ai_critique( ) +def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + openai_api_key = input.credentials["OPENAI_API_KEY"] + + chain_run_args = { + "llm_app_prompt_template": input.inputs.get("prompt_user", ""), + "variant_output": input.inputs["prediction"], + "correct_answer": input.inputs["ground_truth"], + } + + for key, value in input.inputs.items(): + chain_run_args[key] = value + + prompt_template = input.settings["prompt_template"] + messages = [ + {"role": "system", "content": prompt_template}, + {"role": "user", "content": str(chain_run_args)}, + ] + + client = OpenAI(api_key=openai_api_key) + response = client.chat.completions.create( + model="gpt-3.5-turbo", messages=messages, temperature=0.8 + ) + evaluation_output = response.choices[0].message.content.strip() + return {"outputs": {"score": evaluation_output}} + + def auto_starts_with( inputs: Dict[str, Any], # pylint: disable=unused-argument output: str, @@ -378,15 +485,13 @@ def auto_starts_with( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - prefix = settings_values.get("prefix", "") - case_sensitive = settings_values.get("case_sensitive", True) - - if not case_sensitive: - output = output.lower() - prefix = prefix.lower() - - result = Result(type="bool", value=output.startswith(prefix)) - return result + inputs = {"prediction": output} + response = starts_with( + input=EvaluatorInputInterface( + **{"inputs": inputs, "settings": settings_values} + ) + ) + return Result(type="text", value=response["outputs"]["success"]) except Exception as e: # pylint: disable=broad-except return Result( type="error", @@ -398,6 +503,18 @@ def auto_starts_with( ) +def starts_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + prefix = input.settings.get("prefix", "") + case_sensitive = input.settings.get("case_sensitive", True) + + if not case_sensitive: + output = str(input.inputs["prediction"]).lower() + prefix = prefix.lower() + + result = output.startswith(prefix) + return {"outputs": {"success": result}} + + def auto_ends_with( inputs: Dict[str, Any], # pylint: disable=unused-argument output: str, @@ -407,14 +524,13 @@ def auto_ends_with( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - suffix = settings_values.get("suffix", "") - 
       case_sensitive = settings_values.get("case_sensitive", True)
-
-        if not case_sensitive:
-            output = output.lower()
-            suffix = suffix.lower()
-
-        result = Result(type="bool", value=output.endswith(suffix))
+        inputs = {"prediction": output}
+        response = ends_with(
+            input=EvaluatorInputInterface(
+                **{"inputs": inputs, "settings": settings_values}
+            )
+        )
+        result = Result(type="bool", value=response["outputs"]["success"])
         return result
     except Exception as e:  # pylint: disable=broad-except
         return Result(
@@ -427,6 +543,18 @@ def auto_ends_with(
     )
 
 
+def ends_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
+    suffix = input.settings.get("suffix", "")
+    case_sensitive = input.settings.get("case_sensitive", True)
+
+    if not case_sensitive:
+        output = str(input.inputs["prediction"]).lower()
+        suffix = suffix.lower()
+
+    result = output.endswith(suffix)
+    return {"outputs": {"success": result}}
+
+
 def auto_contains(
     inputs: Dict[str, Any],  # pylint: disable=unused-argument
     output: str,
@@ -436,14 +564,13 @@ def auto_contains(
     lm_providers_keys: Dict[str, Any],  # pylint: disable=unused-argument
 ) -> Result:
     try:
-        substring = settings_values.get("substring", "")
-        case_sensitive = settings_values.get("case_sensitive", True)
-
-        if not case_sensitive:
-            output = output.lower()
-            substring = substring.lower()
-
-        result = Result(type="bool", value=substring in output)
+        inputs = {"prediction": output}
+        response = contains(
+            input=EvaluatorInputInterface(
+                **{"inputs": inputs, "settings": settings_values}
+            )
+        )
+        result = Result(type="bool", value=response["outputs"]["success"])
         return result
     except Exception as e:  # pylint: disable=broad-except
         return Result(
@@ -456,6 +583,18 @@ def auto_contains(
     )
 
 
+def contains(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
+    substring = input.settings.get("substring", "")
+    case_sensitive = input.settings.get("case_sensitive", True)
+
+    if not case_sensitive:
+        output = str(input.inputs["prediction"]).lower()
+        substring = substring.lower()
+
+    result = substring in output
+    return {"outputs": {"success": result}}
+
+
 def auto_contains_any(
     inputs: Dict[str, Any],  # pylint: disable=unused-argument
     output: str,
     data_point: Dict[str, Any],  # pylint: disable=unused-argument
@@ -465,17 +604,13 @@ def auto_contains_any(
     lm_providers_keys: Dict[str, Any],  # pylint: disable=unused-argument
 ) -> Result:
     try:
-        substrings_str = settings_values.get("substrings", "")
-        substrings = [substring.strip() for substring in substrings_str.split(",")]
-        case_sensitive = settings_values.get("case_sensitive", True)
-
-        if not case_sensitive:
-            output = output.lower()
-            substrings = [substring.lower() for substring in substrings]
-
-        result = Result(
-            type="bool", value=any(substring in output for substring in substrings)
+        inputs = {"prediction": output}
+        response = contains_any(
+            input=EvaluatorInputInterface(
+                **{"inputs": inputs, "settings": settings_values}
+            )
         )
+        result = Result(type="bool", value=response["outputs"]["success"])
         return result
     except Exception as e:  # pylint: disable=broad-except
         return Result(
@@ -488,6 +623,20 @@ def auto_contains_any(
     )
 
 
+def contains_any(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
+    substrings_str = input.settings.get("substrings", "")
+    substrings = [substring.strip() for substring in substrings_str.split(",")]
+    case_sensitive = input.settings.get("case_sensitive", True)
+
+    if not case_sensitive:
+        output = str(input.inputs["prediction"]).lower()
+        substrings = [substring.lower() for substring in substrings]
+
+    return {
+        "outputs": {"success": 
any(substring in output for substring in substrings)} + } + + def auto_contains_all( inputs: Dict[str, Any], # pylint: disable=unused-argument output: str, @@ -497,17 +646,12 @@ def auto_contains_all( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - substrings_str = settings_values.get("substrings", "") - substrings = [substring.strip() for substring in substrings_str.split(",")] - case_sensitive = settings_values.get("case_sensitive", True) - - if not case_sensitive: - output = output.lower() - substrings = [substring.lower() for substring in substrings] - - result = Result( - type="bool", value=all(substring in output for substring in substrings) + response = contains_all( + input=EvaluatorInputInterface( + **{"inputs": {"prediction": output}, "settings": settings_values} + ) ) + result = Result(type="bool", value=response["outputs"]["success"]) return result except Exception as e: # pylint: disable=broad-except return Result( @@ -520,6 +664,19 @@ def auto_contains_all( ) +def contains_all(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + substrings_str = input.settings.get("substrings", "") + substrings = [substring.strip() for substring in substrings_str.split(",")] + case_sensitive = input.settings.get("case_sensitive", True) + + if not case_sensitive: + output = str(input.inputs["prediction"]).lower() + substrings = [substring.lower() for substring in substrings] + + result = all(substring in output for substring in substrings) + return {"outputs": {"success": result}} + + def auto_contains_json( inputs: Dict[str, Any], # pylint: disable=unused-argument output: str, @@ -529,16 +686,9 @@ def auto_contains_json( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - try: - start_index = output.index("{") - end_index = output.rindex("}") + 1 - potential_json = output[start_index:end_index] - - json.loads(potential_json) - contains_json = True - except (ValueError, json.JSONDecodeError): - contains_json = False - + response = contains_json( + input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) + ) return Result(type="bool", value=contains_json) except Exception as e: # pylint: disable=broad-except return Result( @@ -551,6 +701,20 @@ def auto_contains_json( ) +def contains_json(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + start_index = str(input.inputs["prediction"]).index("{") + end_index = str(input.inputs["prediction"]).rindex("}") + 1 + potential_json = str(input.inputs["prediction"])[start_index:end_index] + + try: + json.loads(potential_json) + contains_json = True + except (ValueError, json.JSONDecodeError): + contains_json = False + + return {"outputs": {"success": contains_json}} + + def flatten_json(json_obj: Union[list, dict]) -> Dict[str, Any]: """ This function takes a (nested) JSON object and flattens it into a single-level dictionary where each key represents the path to the value in the original JSON structure. This is done recursively, ensuring that the full hierarchical context is preserved in the keys. 
@@ -662,12 +826,15 @@ def auto_json_diff( ) -> Result: try: correct_answer = get_correct_answer(data_point, settings_values) - average_score = compare_jsons( - ground_truth=correct_answer, - app_output=json.loads(output), - settings_values=settings_values, + response = json_diff( + input=EvaluatorInputInterface( + **{ + "inputs": {"prediction": output, "ground_truth": correct_answer}, + "settings": settings_values, + } + ) ) - return Result(type="number", value=average_score) + return Result(type="number", value=response["outputs"]["score"]) except (ValueError, json.JSONDecodeError, Exception): return Result( type="error", @@ -679,6 +846,15 @@ def auto_json_diff( ) +def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + average_score = compare_jsons( + ground_truth=input.inputs["ground_truth"], + app_output=json.loads(input.inputs["prediction"]), + settings_values=input.settings, + ) + return {"outputs": {"score": average_score}} + + def rag_faithfulness( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Dict[str, Any], @@ -848,24 +1024,36 @@ def rag_context_relevancy( ) -def levenshtein_distance(s1, s2): - if len(s1) < len(s2): - return levenshtein_distance(s2, s1) # pylint: disable=arguments-out-of-order +def levenshtein_distance(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + prediction = input.inputs["prediction"] + ground_truth = input.inputs["ground_truth"] + if len(prediction) < len(ground_truth): + return levenshtein_distance( + input=EvaluatorInputInterface( + **{"inputs": {"prediction": prediction, "ground_truth": ground_truth}} + ) + ) # pylint: disable=arguments-out-of-order - if len(s2) == 0: + if len(ground_truth) == 0: return len(s1) - previous_row = range(len(s2) + 1) - for i, c1 in enumerate(s1): + previous_row = range(len(ground_truth) + 1) + for i, c1 in enumerate(prediction): current_row = [i + 1] - for j, c2 in enumerate(s2): + for j, c2 in enumerate(ground_truth): insertions = previous_row[j + 1] + 1 deletions = current_row[j] + 1 substitutions = previous_row[j] + (c1 != c2) current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row - return previous_row[-1] + result = previous_row[-1] + if "threshold" in input.settings: + threshold = input.settings["threshold"] + is_within_threshold = distance <= threshold + return {"outputs": {"success": is_within_threshold}} + + return {"outputs": {"score": distance}} def auto_levenshtein_distance( @@ -878,15 +1066,12 @@ def auto_levenshtein_distance( ) -> Result: try: correct_answer = get_correct_answer(data_point, settings_values) - - distance = levenshtein_distance(output, correct_answer) - - if "threshold" in settings_values: - threshold = settings_values["threshold"] - is_within_threshold = distance <= threshold - return Result(type="bool", value=is_within_threshold) - - return Result(type="number", value=distance) + response = levenshtein_distance( + input=EvaluatorInputInterface( + **{"inputs": {"prediction": output, "ground_truth": correct_answer}} + ) + ) + return Result(type="number", value=response["outputs"].get("score", "success")) except ValueError as e: return Result( @@ -917,17 +1102,15 @@ def auto_similarity_match( ) -> Result: try: correct_answer = get_correct_answer(data_point, settings_values) - set1 = set(output.split()) - set2 = set(correct_answer.split()) - intersect = set1.intersection(set2) - union = set1.union(set2) - - similarity = len(intersect) / len(union) - - is_similar = ( - True if similarity > 
settings_values["similarity_threshold"] else False + response = similarity_match( + input=EvaluatorInputInterface( + **{ + "inputs": {"prediction": output, "ground_truth": correct_answer}, + "settings": settings_values, + } + ) ) - result = Result(type="bool", value=is_similar) + result = Result(type="bool", value=response["outputs"]["success"]) return result except ValueError as e: return Result( @@ -948,7 +1131,20 @@ def auto_similarity_match( ) -async def semantic_similarity(output: str, correct_answer: str, api_key: str) -> float: +def similarity_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: + set1 = set(input.inputs["prediction"].split()) + set2 = set(input.inputs["ground_truth"].split()) + intersect = set1.intersection(set2) + union = set1.union(set2) + + similarity = len(intersect) / len(union) + is_similar = True if similarity > input.settings["similarity_threshold"] else False + return {"outputs": {"success": is_similar}} + + +async def semantic_similarity( + input: EvaluatorInputInterface, +) -> EvaluatorOutputInterface: """Calculate the semantic similarity score of the LLM app using OpenAI's Embeddings API. Args: @@ -959,6 +1155,7 @@ async def semantic_similarity(output: str, correct_answer: str, api_key: str) -> float: the semantic similarity score """ + api_key = input.credentials["OPENAI_API_KEY"] openai = AsyncOpenAI(api_key=api_key) async def encode(text: str): @@ -970,10 +1167,10 @@ async def encode(text: str): def cosine_similarity(output_vector: array, correct_answer_vector: array) -> float: return np.dot(output_vector, correct_answer_vector) - output_vector = await encode(output) - correct_answer_vector = await encode(correct_answer) + output_vector = await encode(input.inputs["prediction"]) + correct_answer_vector = await encode(input.inputs["ground_truth"]) similarity_score = cosine_similarity(output_vector, correct_answer_vector) - return similarity_score + return {"outputs": {"score": similarity_score}} def auto_semantic_similarity( @@ -986,15 +1183,20 @@ def auto_semantic_similarity( ) -> Result: try: loop = ensure_event_loop() - openai_api_key = lm_providers_keys["OPENAI_API_KEY"] - correct_answer = get_correct_answer(data_point, settings_values) - score = loop.run_until_complete( + correct_answer = get_correct_answer(data_point, settings_values) + inputs = {"prediction": output, "ground_truth": correct_answer} + response = loop.run_until_complete( semantic_similarity( - output=output, correct_answer=correct_answer, api_key=openai_api_key + input=EvaluatorInputInterface( + **{ + "inputs": inputs, + "credentials": lm_providers_keys, + } + ) ) ) - return Result(type="number", value=score) + return Result(type="number", value=response["outputs"]["score"]) except Exception: return Result( type="error", @@ -1009,7 +1211,7 @@ def auto_semantic_similarity( EVALUATOR_FUNCTIONS = { "auto_exact_match": auto_exact_match, "auto_regex_test": auto_regex_test, - "field_match_test": field_match_test, + "field_match_test": auto_field_match_test, "auto_webhook_test": auto_webhook_test, "auto_custom_code_run": auto_custom_code_run, "auto_ai_critique": auto_ai_critique, @@ -1027,6 +1229,25 @@ def auto_semantic_similarity( "rag_context_relevancy": rag_context_relevancy, } +NEW_EVALUATOR_FUNCTIONS = { + "auto_exact_match": exact_match, + "auto_regex_test": regex_test, + "auto_field_match_test": field_match_test, + "auto_webhook_test": webhook_test, + "auto_custom_code_run": custom_code_run, + "auto_ai_critique": ai_critique, + "auto_starts_with": starts_with, + 
"auto_ends_with": ends_with, + "auto_contains": contains, + "auto_contains_any": contains_any, + "auto_contains_all": contains_all, + "auto_contains_json": contains_json, + "auto_json_diff": json_diff, + "auto_levenshtein_distance": levenshtein_distance, + "auto_similarity_match": similarity_match, + "auto_semantic_similarity": semantic_similarity, +} + def evaluate( evaluator_key: str, @@ -1064,3 +1285,14 @@ def evaluate( stacktrace=str(exc), ), ) + + +def run( + evaluator_key: str, evaluator_input: EvaluatorInputInterface +) -> EvaluatorOutputInterface: + evaluator_function = NEW_EVALUATOR_FUNCTIONS.get(evaluator_key, None) + if not evaluator_function: + raise NotImplementedError(f"Evaluator {evaluator_key} not found") + + output = evaluator_function(evaluator_input) + return output From 6fb0b02d76501897678ab347859ea1cab2c8f27d Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 9 Aug 2024 11:34:56 +0100 Subject: [PATCH 009/149] chore (backend): remove interfaces that are redundant --- .../models/api/evaluation_model.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index f7cc7489f0..363706702e 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -82,22 +82,6 @@ class Evaluation(BaseModel): updated_at: datetime -class SimpleEvaluatorMappingInterface(BaseModel): - ground_truth: Any - prediction: Any - - -class RagEvaluatorMappingInterface(BaseModel): - question: Any - contexts: Any - answer: Any - - -class EvaluationSettingsInterface(BaseModel): - llm_provider: str - llm_api_key: str - - class EvaluatorInputInterface(BaseModel): inputs: Dict[str, Any] = Field(default_factory=dict) settings: Optional[Dict[str, Any]] = None From 0facf64a99086225f825c5977bfaeaef6fc9a214 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 11:04:16 +0100 Subject: [PATCH 010/149] refactor (backend): convert evaluator functions to asynchronous --- .../services/evaluators_service.py | 174 +++++++++--------- 1 file changed, 83 insertions(+), 91 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index ab76c251c1..d317df8955 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -13,7 +13,6 @@ from agenta_backend.services.security import sandbox from agenta_backend.models.shared_models import Error, Result -from agenta_backend.utils.event_loop_utils import ensure_event_loop from agenta_backend.models.api.evaluation_model import ( EvaluatorInputInterface, EvaluatorOutputInterface, @@ -30,7 +29,7 @@ logger.setLevel(logging.DEBUG) -def map( +async def map( mapping_input: EvaluatorMappingInputInterface, ) -> EvaluatorMappingOutputInterface: """ @@ -40,7 +39,7 @@ def map( EvaluatorMappingOutputInterface: A dictionary containing the mapped evaluator inputs. """ - def get_nested_value(data: Dict[str, Any], key: str) -> Any: + async def get_nested_value(data: Dict[str, Any], key: str) -> Any: """ Retrieves the nested value from a dictionary based on a dotted key path, where list indices can be included in square brackets. 
@@ -87,7 +86,7 @@ def get_nested_value(data: Dict[str, Any], key: str) -> Any: return {"outputs": mapping_outputs} -def get_correct_answer( +async def get_correct_answer( data_point: Dict[str, Any], settings_values: Dict[str, Any] ) -> Any: """ @@ -113,7 +112,7 @@ def get_correct_answer( return data_point[correct_answer_key] -def auto_exact_match( +async def auto_exact_match( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -169,7 +168,7 @@ def exact_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"success": success}} -def auto_regex_test( +async def auto_regex_test( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -181,7 +180,7 @@ def auto_regex_test( output = output.get("data", "") try: inputs = {"ground_truth": data_point, "prediction": output} - response = regex_test( + response = await regex_test( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": settings_values} ) @@ -198,7 +197,7 @@ def auto_regex_test( ) -def regex_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def regex_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: pattern = re.compile(input.settings["regex_pattern"], re.IGNORECASE) result = ( bool(pattern.search(input.inputs["prediction"])) @@ -207,7 +206,7 @@ def regex_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"success": result}} -def auto_field_match_test( +async def auto_field_match_test( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -220,7 +219,9 @@ def auto_field_match_test( try: correct_answer = get_correct_answer(data_point, settings_values) inputs = {"ground_truth": correct_answer, "prediction": output} - response = field_match_test(input=EvaluatorInputInterface(**{"inputs": inputs})) + response = await field_match_test( + input=EvaluatorInputInterface(**{"inputs": inputs}) + ) return Result(type="bool", value=response["outputs"]["success"]) except ValueError as e: return Result( @@ -235,13 +236,13 @@ def auto_field_match_test( return Result(type="bool", value=False) -def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def field_match_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: prediction_json = json.loads(input.inputs["prediction"]) result = prediction_json == input.inputs["ground_truth"] return {"outputs": {"success": result}} -def auto_webhook_test( +async def auto_webhook_test( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -254,7 +255,7 @@ def auto_webhook_test( try: correct_answer = get_correct_answer(data_point, settings_values) inputs = {"prediction": output, "ground_truth": correct_answer} - response = webhook_test( + response = await webhook_test( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": settings_values} ) @@ -289,7 +290,7 @@ def auto_webhook_test( ) -def webhook_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def webhook_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: with httpx.Client() as client: payload = { "correct_answer": input.inputs["ground_truth"], @@ -303,7 +304,7 @@ def webhook_test(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return 
{"outputs": {"score": score}} -def auto_custom_code_run( +async def auto_custom_code_run( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -320,7 +321,7 @@ def auto_custom_code_run( "prediction": output, "ground_truth": correct_answer, } - response = custom_code_run( + response = await custom_code_run( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": {"code": settings_values["code"]}} ) @@ -337,7 +338,7 @@ def auto_custom_code_run( ) -def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: result = sandbox.execute_code_safely( app_params=input.inputs["app_config"], inputs=input.inputs, @@ -349,7 +350,7 @@ def custom_code_run(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"score": result}} -def auto_ai_critique( +async def auto_ai_critique( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -380,7 +381,7 @@ def auto_ai_critique( "prediction": output, "ground_truth": correct_answer, } - response = ai_critique( + response = await ai_critique( input=EvaluatorInputInterface( **{"inputs": inputs, "credentials": lm_providers_keys} ) @@ -397,7 +398,7 @@ def auto_ai_critique( ) -def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: openai_api_key = input.credentials["OPENAI_API_KEY"] chain_run_args = { @@ -405,7 +406,6 @@ def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: "variant_output": input.inputs["prediction"], "correct_answer": input.inputs["ground_truth"], } - for key, value in input.inputs.items(): chain_run_args[key] = value @@ -415,15 +415,15 @@ def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: {"role": "user", "content": str(chain_run_args)}, ] - client = OpenAI(api_key=openai_api_key) - response = client.chat.completions.create( + client = AsyncOpenAI(api_key=openai_api_key) + response = await client.chat.completions.create( model="gpt-3.5-turbo", messages=messages, temperature=0.8 ) evaluation_output = response.choices[0].message.content.strip() - return {"outputs": {"score": evaluation_output}} + return {"outputs": {"score": float(evaluation_output)}} -def auto_starts_with( +async def auto_starts_with( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -435,7 +435,7 @@ def auto_starts_with( output = output.get("data", "") try: inputs = {"prediction": output} - response = starts_with( + response = await starts_with( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": settings_values} ) @@ -452,7 +452,7 @@ def auto_starts_with( ) -def starts_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def starts_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: prefix = input.settings.get("prefix", "") case_sensitive = input.settings.get("case_sensitive", True) @@ -464,7 +464,7 @@ def starts_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"success": result}} -def auto_ends_with( +async def auto_ends_with( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -476,7 +476,7 @@ def auto_ends_with( output = 
output.get("data", "") try: inputs = {"prediction": output} - response = ends_with( + response = await ends_with( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": settings_values} ) @@ -494,7 +494,7 @@ def auto_ends_with( ) -def ends_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def ends_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: suffix = input.settings.get("suffix", "") case_sensitive = input.settings.get("case_sensitive", True) @@ -506,7 +506,7 @@ def ends_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"success": result}} -def auto_contains( +async def auto_contains( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -518,7 +518,7 @@ def auto_contains( output = output.get("data", "") try: inputs = {"prediction": output} - response = contains( + response = await contains( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": settings_values} ) @@ -536,7 +536,7 @@ def auto_contains( ) -def contains(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def contains(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: substring = input.settings.get("substring", "") case_sensitive = input.settings.get("case_sensitive", True) @@ -548,7 +548,7 @@ def contains(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"success": result}} -def auto_contains_any( +async def auto_contains_any( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -560,7 +560,7 @@ def auto_contains_any( output = output.get("data", "") try: inputs = {"prediction": output} - response = contains_any( + response = await contains_any( input=EvaluatorInputInterface( **{"inputs": inputs, "settings": settings_values} ) @@ -578,7 +578,7 @@ def auto_contains_any( ) -def contains_any(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def contains_any(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: substrings_str = input.settings.get("substrings", "") substrings = [substring.strip() for substring in substrings_str.split(",")] case_sensitive = input.settings.get("case_sensitive", True) @@ -592,7 +592,7 @@ def contains_any(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: } -def auto_contains_all( +async def auto_contains_all( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -603,7 +603,7 @@ def auto_contains_all( if not isinstance(output, str): output = output.get("data", "") try: - response = contains_all( + response = await contains_all( input=EvaluatorInputInterface( **{"inputs": {"prediction": output}, "settings": settings_values} ) @@ -621,7 +621,7 @@ def auto_contains_all( ) -def contains_all(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def contains_all(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: substrings_str = input.settings.get("substrings", "") substrings = [substring.strip() for substring in substrings_str.split(",")] case_sensitive = input.settings.get("case_sensitive", True) @@ -634,7 +634,7 @@ def contains_all(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"success": result}} -def auto_contains_json( +async def 
auto_contains_json( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -645,7 +645,7 @@ def auto_contains_json( if not isinstance(output, str): output = output.get("data", "") try: - response = contains_json( + response = await contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) return Result(type="bool", value=contains_json) @@ -660,7 +660,7 @@ def auto_contains_json( ) -def contains_json(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def contains_json(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: start_index = str(input.inputs["prediction"]).index("{") end_index = str(input.inputs["prediction"]).rindex("}") + 1 potential_json = str(input.inputs["prediction"])[start_index:end_index] @@ -775,7 +775,7 @@ def diff(ground_truth: Any, app_output: Any, compare_schema_only: bool) -> float return average_score -def auto_json_diff( +async def auto_json_diff( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Any, data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -785,7 +785,7 @@ def auto_json_diff( ) -> Result: try: correct_answer = get_correct_answer(data_point, settings_values) - response = json_diff( + response = await json_diff( input=EvaluatorInputInterface( **{ "inputs": {"prediction": output, "ground_truth": correct_answer}, @@ -805,7 +805,7 @@ def auto_json_diff( ) -def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: average_score = compare_jsons( ground_truth=input.inputs["ground_truth"], app_output=json.loads(input.inputs["prediction"]), @@ -814,7 +814,7 @@ def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"score": average_score}} -def rag_faithfulness( +async def rag_faithfulness( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -878,12 +878,9 @@ def rag_faithfulness( ) # Initialize RAG evaluator to calculate faithfulness score - loop = ensure_event_loop() faithfulness = Faithfulness(api_key=openai_api_key) - eval_score = loop.run_until_complete( - faithfulness._run_eval_async( - output=answer_val, input=question_val, context=contexts_val - ) + eval_score = await faithfulness._run_eval_async( + output=answer_val, input=question_val, context=contexts_val ) return Result(type="number", value=eval_score.score) @@ -899,7 +896,7 @@ def rag_faithfulness( ) -def rag_context_relevancy( +async def rag_context_relevancy( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], # pylint: disable=unused-argument @@ -963,12 +960,9 @@ def rag_context_relevancy( ) # Initialize RAG evaluator to calculate context relevancy score - loop = ensure_event_loop() context_rel = ContextRelevancy(api_key=openai_api_key) - eval_score = loop.run_until_complete( - context_rel._run_eval_async( - output=answer_val, input=question_val, context=contexts_val - ) + eval_score = await context_rel._run_eval_async( + output=answer_val, input=question_val, context=contexts_val ) return Result(type="number", value=eval_score.score) @@ -983,15 +977,17 @@ def rag_context_relevancy( ) -def levenshtein_distance(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def levenshtein_distance( + 
input: EvaluatorInputInterface, +) -> EvaluatorOutputInterface: prediction = input.inputs["prediction"] ground_truth = input.inputs["ground_truth"] - if len(prediction) < len(ground_truth): - return levenshtein_distance( - input=EvaluatorInputInterface( - **{"inputs": {"prediction": prediction, "ground_truth": ground_truth}} - ) - ) # pylint: disable=arguments-out-of-order + # if len(prediction) < len(ground_truth): + # return await levenshtein_distance( + # input=EvaluatorInputInterface( + # **{"inputs": {"prediction": prediction, "ground_truth": ground_truth}} + # ) + # ) # pylint: disable=arguments-out-of-order if len(ground_truth) == 0: return len(s1) @@ -1006,7 +1002,7 @@ def levenshtein_distance(input: EvaluatorInputInterface) -> EvaluatorOutputInter current_row.append(min(insertions, deletions, substitutions)) previous_row = current_row - result = previous_row[-1] + distance = previous_row[-1] if "threshold" in input.settings: threshold = input.settings["threshold"] is_within_threshold = distance <= threshold @@ -1015,7 +1011,7 @@ def levenshtein_distance(input: EvaluatorInputInterface) -> EvaluatorOutputInter return {"outputs": {"score": distance}} -def auto_levenshtein_distance( +async def auto_levenshtein_distance( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -1027,7 +1023,7 @@ def auto_levenshtein_distance( output = output.get("data", "") try: correct_answer = get_correct_answer(data_point, settings_values) - response = levenshtein_distance( + response = await levenshtein_distance( input=EvaluatorInputInterface( **{"inputs": {"prediction": output, "ground_truth": correct_answer}} ) @@ -1053,7 +1049,7 @@ def auto_levenshtein_distance( ) -def auto_similarity_match( +async def auto_similarity_match( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -1065,7 +1061,7 @@ def auto_similarity_match( output = output.get("data", "") try: correct_answer = get_correct_answer(data_point, settings_values) - response = similarity_match( + response = await similarity_match( input=EvaluatorInputInterface( **{ "inputs": {"prediction": output, "ground_truth": correct_answer}, @@ -1094,7 +1090,7 @@ def auto_similarity_match( ) -def similarity_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def similarity_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: set1 = set(input.inputs["prediction"].split()) set2 = set(input.inputs["ground_truth"].split()) intersect = set1.intersection(set2) @@ -1111,14 +1107,14 @@ async def semantic_similarity( """Calculate the semantic similarity score of the LLM app using OpenAI's Embeddings API. 
Args: - output (str): the output text - correct_answer (str): the correct answer text + input (EvaluatorInputInterface): the evaluator input Returns: float: the semantic similarity score """ - if not isinstance(output, str): - output = output.get("data", "") + + if not isinstance(input.inputs["prediction"], str): + output = input.inputs["prediction"].get("data", "") api_key = input.credentials["OPENAI_API_KEY"] openai = AsyncOpenAI(api_key=api_key) @@ -1138,7 +1134,7 @@ def cosine_similarity(output_vector: array, correct_answer_vector: array) -> flo return {"outputs": {"score": similarity_score}} -def auto_semantic_similarity( +async def auto_semantic_similarity( inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], data_point: Dict[str, Any], @@ -1149,18 +1145,14 @@ def auto_semantic_similarity( if not isinstance(output, str): output = output.get("data", "") try: - loop = ensure_event_loop() - correct_answer = get_correct_answer(data_point, settings_values) inputs = {"prediction": output, "ground_truth": correct_answer} - response = loop.run_until_complete( - semantic_similarity( - input=EvaluatorInputInterface( - **{ - "inputs": inputs, - "credentials": lm_providers_keys, - } - ) + response = await semantic_similarity( + input=EvaluatorInputInterface( + **{ + "inputs": inputs, + "credentials": lm_providers_keys, + } ) ) return Result(type="number", value=response["outputs"]["score"]) @@ -1175,7 +1167,7 @@ def auto_semantic_similarity( ) -EVALUATOR_FUNCTIONS = { +RUN_EVALUATOR_FUNCTIONS = { "auto_exact_match": auto_exact_match, "auto_regex_test": auto_regex_test, "field_match_test": auto_field_match_test, @@ -1216,7 +1208,7 @@ def auto_semantic_similarity( } -def evaluate( +async def evaluate( evaluator_key: str, inputs: Dict[str, Any], output: Union[str, Dict[str, Any]], @@ -1235,7 +1227,7 @@ def evaluate( ), ) try: - return evaluation_function( + return await evaluation_function( inputs, output, data_point, @@ -1254,12 +1246,12 @@ def evaluate( ) -def run( +async def run( evaluator_key: str, evaluator_input: EvaluatorInputInterface ) -> EvaluatorOutputInterface: - evaluator_function = NEW_EVALUATOR_FUNCTIONS.get(evaluator_key, None) + evaluator_function = RUN_EVALUATOR_FUNCTIONS.get(evaluator_key, None) if not evaluator_function: raise NotImplementedError(f"Evaluator {evaluator_key} not found") - output = evaluator_function(evaluator_input) + output = await evaluator_function(evaluator_input) return output From b81d01bdbe73242dc73f8b8ea68b548b357edf80 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 11:22:00 +0100 Subject: [PATCH 011/149] refactor (backend): remove event_loop_utils module --- .../agenta_backend/utils/event_loop_utils.py | 22 ------------------- 1 file changed, 22 deletions(-) delete mode 100644 agenta-backend/agenta_backend/utils/event_loop_utils.py diff --git a/agenta-backend/agenta_backend/utils/event_loop_utils.py b/agenta-backend/agenta_backend/utils/event_loop_utils.py deleted file mode 100644 index 47e5ab326c..0000000000 --- a/agenta-backend/agenta_backend/utils/event_loop_utils.py +++ /dev/null @@ -1,22 +0,0 @@ -import asyncio - - -def ensure_event_loop() -> asyncio.AbstractEventLoop: - """ - Ensure that there is an event loop available in the current thread. - If there isn't one, create a new event loop and set it. - - Raises: - RuntimeError: There is no current event loop in thread 'AnyIO worker thread'. - - Returns: - asyncio.AbstractEventLoop: The event loop for the current thread. 
- """ - - try: - loop = asyncio.get_event_loop() - except RuntimeError as e: - if "There is no current event loop in thread" in str(e): - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop From 53bb2ed72835217a99121fefa25e6ebeb3b97972 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 11:25:09 +0100 Subject: [PATCH 012/149] refactor (backend): improve run evaluator endpoint to be asynchronous --- agenta-backend/agenta_backend/routers/evaluators_router.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index 676c68b5f6..d43d0b67a9 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -85,7 +85,7 @@ async def evaluator_data_map( @router.post("/{evaluator_key}/run/", response_model=EvaluatorOutputInterface) -def evaluator_run( +async def evaluator_run( request: Request, evaluator_key: str, payload: EvaluatorInputInterface ): """Endpoint to evaluate LLM app run @@ -100,7 +100,7 @@ def evaluator_run( """ try: - result = evaluators_service.run( + result = await evaluators_service.run( evaluator_key=evaluator_key, evaluator_input=payload ) return result From 6f876f4be467f94419b60695b6d68849ac555a72 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 14:16:11 +0100 Subject: [PATCH 013/149] refactor (backend): update rag faithfulness and context relevancy evaluators --- .../services/evaluators_service.py | 266 ++++++++++-------- 1 file changed, 146 insertions(+), 120 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index d317df8955..67fc44701b 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -523,7 +523,7 @@ async def auto_contains( **{"inputs": inputs, "settings": settings_values} ) ) - result = Result(type="bool", value=response["outputs"["success"]]) + result = Result(type="bool", value=response["outputs"]["success"]) return result except Exception as e: # pylint: disable=broad-except return Result( @@ -814,6 +814,67 @@ async def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: return {"outputs": {"score": average_score}} +async def measure_rag_consistency( + input: EvaluatorInputInterface, +) -> EvaluatorOutputInterface: + if "prediction" in input.inputs and isinstance(input.inputs["prediction"], str): + logging.error("'prediction' is most likely not BaseResponse.") + raise NotImplementedError( + "Please update the SDK to the latest version, which supports RAG evaluators." + ) + + # Get required keys for rag evaluator + question_key: Union[str, None] = input.settings.get("question_key", None) + answer_key: Union[str, None] = input.settings.get("answer_key", None) + contexts_key: Union[str, None] = input.settings.get("contexts_key", None) + + if None in [question_key, answer_key, contexts_key]: + logging.error( + f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" + ) + raise ValueError( + "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." 
+ ) + + # Turn distributed trace into trace tree + trace = process_distributed_trace_into_trace_tree(input.inputs["trace"]) + + # Get value of required keys for rag evaluator + question_val: Any = get_field_value_from_trace_tree(trace, question_key) + answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) + contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) + + if None in [question_val, answer_val, contexts_val]: + logging.error( + f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" + ) + + message = "" + if question_val is None: + message += f"'question_key' is set to {question_key} which can't be found. " + if answer_val is None: + message += f"'answer_key' is set to {answer_key} which can't be found. " + if contexts_val is None: + message += f"'contexts_key' is set to {contexts_key} which can't be found. " + message += "Please check your evaluator settings and try again." + + raise ValueError(message) + + openai_api_key = input.credentials.get("OPENAI_API_KEY", None) + + if not openai_api_key: + raise Exception( + "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." + ) + + # Initialize RAG evaluator to calculate faithfulness score + faithfulness = Faithfulness(api_key=openai_api_key) + eval_score = await faithfulness._run_eval_async( + output=answer_val, input=question_val, context=contexts_val + ) + return {"outputs": {"score": eval_score.score}} + + async def rag_faithfulness( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], @@ -823,67 +884,16 @@ async def rag_faithfulness( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - if isinstance(output, str): - logging.error("'output' is most likely not BaseResponse.") - raise NotImplementedError( - "Please update the SDK to the latest version, which supports RAG evaluators." - ) - - # Get required keys for rag evaluator - question_key: Union[str, None] = settings_values.get("question_key", None) - answer_key: Union[str, None] = settings_values.get("answer_key", None) - contexts_key: Union[str, None] = settings_values.get("contexts_key", None) - - if None in [question_key, answer_key, contexts_key]: - logging.error( - f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" - ) - raise ValueError( - "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." - ) - - # Turn distributed trace into trace tree - trace = process_distributed_trace_into_trace_tree(output["trace"]) - - # Get value of required keys for rag evaluator - question_val: Any = get_field_value_from_trace_tree(trace, question_key) - answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) - contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) - - if None in [question_val, answer_val, contexts_val]: - logging.error( - f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" - ) - - message = "" - if question_val is None: - message += ( - f"'question_key' is set to {question_key} which can't be found. " - ) - if answer_val is None: - message += f"'answer_key' is set to {answer_key} which can't be found. " - if contexts_val is None: - message += ( - f"'contexts_key' is set to {contexts_key} which can't be found. 
" - ) - message += "Please check your evaluator settings and try again." - - raise ValueError(message) - - openai_api_key = lm_providers_keys.get("OPENAI_API_KEY", None) - - if not openai_api_key: - raise Exception( - "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." + faithfulness_score = await measure_rag_consistency( + input=EvaluatorInputInterface( + **{ + "inputs": {"prediction": output}, + "settings": settings_values, + "credentials": lm_providers_keys, + } ) - - # Initialize RAG evaluator to calculate faithfulness score - faithfulness = Faithfulness(api_key=openai_api_key) - eval_score = await faithfulness._run_eval_async( - output=answer_val, input=question_val, context=contexts_val ) - - return Result(type="number", value=eval_score.score) + return Result(type="number", value=faithfulness_score) except Exception: return Result( @@ -896,6 +906,70 @@ async def rag_faithfulness( ) +async def measure_context_coherence( + input: EvaluatorInputInterface, +) -> EvaluatorOutputInterface: + if "prediction" in input.inputs and isinstance(input.inputs["prediction"], str): + logging.error("'prediction' is most likely not BaseResponse.") + raise NotImplementedError( + "Please update the SDK to the latest version, which supports RAG evaluators." + ) + + # Get required keys for rag evaluator + question_key: Union[str, None] = input.settings.get("question_key", None) + answer_key: Union[str, None] = input.settings.get("answer_key", None) + contexts_key: Union[str, None] = input.settings.get("contexts_key", None) + + if None in [question_key, answer_key, contexts_key]: + logging.error( + f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" + ) + raise ValueError( + "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." + ) + + # Turn distributed trace into trace tree + trace = process_distributed_trace_into_trace_tree(input.inputs["trace"]) + + # Get value of required keys for rag evaluator + question_val: Any = get_field_value_from_trace_tree(trace, question_key) + answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) + contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) + + if None in [question_val, answer_val, contexts_val]: + logging.error( + f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" + ) + + message = "" + if question_val is None: + message += f"'question_key' is set to {question_key} which can't be found. " + if answer_val is None: + message += f"'answer_key' is set to {answer_key} which can't be found. " + if contexts_val is None: + message += f"'contexts_key' is set to {contexts_key} which can't be found. " + message += "Please check your evaluator settings and try again." + + raise ValueError(message) + + openai_api_key = input.credentials.get("OPENAI_API_KEY", None) + + if not openai_api_key: + raise Exception( + "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." 
+ ) + + print("QV: ", question_val) + print("AV: ", answer_val) + print("CV: ", contexts_val) + # Initialize RAG evaluator to calculate context relevancy score + context_rel = ContextRelevancy(api_key=openai_api_key) + eval_score = await context_rel._run_eval_async( + output=answer_val, input=question_val, context=contexts_val + ) + return eval_score.score + + async def rag_context_relevancy( inputs: Dict[str, Any], # pylint: disable=unused-argument output: Union[str, Dict[str, Any]], @@ -905,66 +979,16 @@ async def rag_context_relevancy( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - if isinstance(output, str): - logging.error("'output' is most likely not BaseResponse.") - raise NotImplementedError( - "Please update the SDK to the latest version, which supports RAG evaluators." - ) - - # Get required keys for rag evaluator - question_key: Union[str, None] = settings_values.get("question_key", None) - answer_key: Union[str, None] = settings_values.get("answer_key", None) - contexts_key: Union[str, None] = settings_values.get("contexts_key", None) - - if None in [question_key, answer_key, contexts_key]: - logging.error( - f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" - ) - raise ValueError( - "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." - ) - - # Turn distributed trace into trace tree - trace = process_distributed_trace_into_trace_tree(output["trace"]) - - # Get value of required keys for rag evaluator - question_val: Any = get_field_value_from_trace_tree(trace, question_key) - answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) - contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) - - if None in [question_val, answer_val, contexts_val]: - logging.error( - f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" - ) - - message = "" - if question_val is None: - message += ( - f"'question_key' is set to {question_key} which can't be found. " - ) - if answer_val is None: - message += f"'answer_key' is set to {answer_key} which can't be found. " - if contexts_val is None: - message += ( - f"'contexts_key' is set to {contexts_key} which can't be found. " - ) - message += "Please check your evaluator settings and try again." - - raise ValueError(message) - - openai_api_key = lm_providers_keys.get("OPENAI_API_KEY", None) - - if not openai_api_key: - raise Exception( - "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." 
+ context_relevancy_score = await measure_context_coherence( + input=EvaluatorInputInterface( + **{ + "inputs": {"prediction": output}, + "settings": settings_values, + "credentials": lm_providers_keys, + } ) - - # Initialize RAG evaluator to calculate context relevancy score - context_rel = ContextRelevancy(api_key=openai_api_key) - eval_score = await context_rel._run_eval_async( - output=answer_val, input=question_val, context=contexts_val ) - return Result(type="number", value=eval_score.score) + return Result(type="number", value=context_relevancy_score.score) except Exception: return Result( @@ -1167,7 +1191,7 @@ async def auto_semantic_similarity( ) -RUN_EVALUATOR_FUNCTIONS = { +EVALUATOR_FUNCTIONS = { "auto_exact_match": auto_exact_match, "auto_regex_test": auto_regex_test, "field_match_test": auto_field_match_test, @@ -1188,7 +1212,7 @@ async def auto_semantic_similarity( "rag_context_relevancy": rag_context_relevancy, } -NEW_EVALUATOR_FUNCTIONS = { +RUN_EVALUATOR_FUNCTIONS = { "auto_exact_match": exact_match, "auto_regex_test": regex_test, "auto_field_match_test": field_match_test, @@ -1205,6 +1229,8 @@ async def auto_semantic_similarity( "auto_levenshtein_distance": levenshtein_distance, "auto_similarity_match": similarity_match, "auto_semantic_similarity": semantic_similarity, + "rag_faithfulness": measure_rag_consistency, + "rag_context_relevancy": measure_context_coherence } From 52141083b93159cf193713a2304f7811fa630dc7 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 16:46:20 +0100 Subject: [PATCH 014/149] refactor (backend): ensure rag evaluators is compatible with evaluate and run interface handlers --- .../services/evaluators_service.py | 253 ++++++++++-------- 1 file changed, 140 insertions(+), 113 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 67fc44701b..f1dad33059 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -86,7 +86,7 @@ async def get_nested_value(data: Dict[str, Any], key: str) -> Any: return {"outputs": mapping_outputs} -async def get_correct_answer( +def get_correct_answer( data_point: Dict[str, Any], settings_values: Dict[str, Any] ) -> Any: """ @@ -456,8 +456,9 @@ async def starts_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac prefix = input.settings.get("prefix", "") case_sensitive = input.settings.get("case_sensitive", True) + output = str(input.inputs["prediction"]) if not case_sensitive: - output = str(input.inputs["prediction"]).lower() + output = output.lower() prefix = prefix.lower() result = output.startswith(prefix) @@ -498,8 +499,9 @@ async def ends_with(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: suffix = input.settings.get("suffix", "") case_sensitive = input.settings.get("case_sensitive", True) + output = str(input.inputs["prediction"]) if not case_sensitive: - output = str(input.inputs["prediction"]).lower() + output = output.lower() suffix = suffix.lower() result = output.endswith(suffix) @@ -540,8 +542,9 @@ async def contains(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: substring = input.settings.get("substring", "") case_sensitive = input.settings.get("case_sensitive", True) + output = str(input.inputs["prediction"]) if not case_sensitive: - output = str(input.inputs["prediction"]).lower() + output = output.lower() substring = substring.lower() result = substring in output @@ 
-583,8 +586,9 @@ async def contains_any(input: EvaluatorInputInterface) -> EvaluatorOutputInterfa substrings = [substring.strip() for substring in substrings_str.split(",")] case_sensitive = input.settings.get("case_sensitive", True) + output = str(input.inputs["prediction"]) if not case_sensitive: - output = str(input.inputs["prediction"]).lower() + output = output.lower() substrings = [substring.lower() for substring in substrings] return { @@ -626,8 +630,9 @@ async def contains_all(input: EvaluatorInputInterface) -> EvaluatorOutputInterfa substrings = [substring.strip() for substring in substrings_str.split(",")] case_sensitive = input.settings.get("case_sensitive", True) + output = str(input.inputs["prediction"]) if not case_sensitive: - output = str(input.inputs["prediction"]).lower() + output = output.lower() substrings = [substring.lower() for substring in substrings] result = all(substring in output for substring in substrings) @@ -648,7 +653,7 @@ async def auto_contains_json( response = await contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) - return Result(type="bool", value=contains_json) + return Result(type="bool", value=response["outputs"]["success"]) except Exception as e: # pylint: disable=broad-except return Result( type="error", @@ -661,11 +666,10 @@ async def auto_contains_json( async def contains_json(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: - start_index = str(input.inputs["prediction"]).index("{") - end_index = str(input.inputs["prediction"]).rindex("}") + 1 - potential_json = str(input.inputs["prediction"])[start_index:end_index] - try: + start_index = str(input.inputs["prediction"]).index("{") + end_index = str(input.inputs["prediction"]).rindex("}") + 1 + potential_json = str(input.inputs["prediction"])[start_index:end_index] json.loads(potential_json) contains_json = True except (ValueError, json.JSONDecodeError): @@ -817,51 +821,7 @@ async def json_diff(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: async def measure_rag_consistency( input: EvaluatorInputInterface, ) -> EvaluatorOutputInterface: - if "prediction" in input.inputs and isinstance(input.inputs["prediction"], str): - logging.error("'prediction' is most likely not BaseResponse.") - raise NotImplementedError( - "Please update the SDK to the latest version, which supports RAG evaluators." - ) - - # Get required keys for rag evaluator - question_key: Union[str, None] = input.settings.get("question_key", None) - answer_key: Union[str, None] = input.settings.get("answer_key", None) - contexts_key: Union[str, None] = input.settings.get("contexts_key", None) - - if None in [question_key, answer_key, contexts_key]: - logging.error( - f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" - ) - raise ValueError( - "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." - ) - - # Turn distributed trace into trace tree - trace = process_distributed_trace_into_trace_tree(input.inputs["trace"]) - - # Get value of required keys for rag evaluator - question_val: Any = get_field_value_from_trace_tree(trace, question_key) - answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) - contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) - - if None in [question_val, answer_val, contexts_val]: - logging.error( - f"Missing trace field ? 
{['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" - ) - - message = "" - if question_val is None: - message += f"'question_key' is set to {question_key} which can't be found. " - if answer_val is None: - message += f"'answer_key' is set to {answer_key} which can't be found. " - if contexts_val is None: - message += f"'contexts_key' is set to {contexts_key} which can't be found. " - message += "Please check your evaluator settings and try again." - - raise ValueError(message) - openai_api_key = input.credentials.get("OPENAI_API_KEY", None) - if not openai_api_key: raise Exception( "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." @@ -870,7 +830,9 @@ async def measure_rag_consistency( # Initialize RAG evaluator to calculate faithfulness score faithfulness = Faithfulness(api_key=openai_api_key) eval_score = await faithfulness._run_eval_async( - output=answer_val, input=question_val, context=contexts_val + output=input.inputs["answer"], + input=input.inputs["question"], + context=input.inputs["context"], ) return {"outputs": {"score": eval_score.score}} @@ -884,16 +846,67 @@ async def rag_faithfulness( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - faithfulness_score = await measure_rag_consistency( + if isinstance(output, str): + logging.error("'output' is most likely not BaseResponse.") + raise NotImplementedError( + "Please update the SDK to the latest version, which supports RAG evaluators." + ) + + # Get required keys for rag evaluator + question_key: Union[str, None] = settings_values.get("question_key", None) + answer_key: Union[str, None] = settings_values.get("answer_key", None) + contexts_key: Union[str, None] = settings_values.get("contexts_key", None) + + if None in [question_key, answer_key, contexts_key]: + logging.error( + f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" + ) + raise ValueError( + "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." + ) + + # Turn distributed trace into trace tree + trace = process_distributed_trace_into_trace_tree(output["trace"]) + + # Get value of required keys for rag evaluator + question_val: Any = get_field_value_from_trace_tree(trace, question_key) + answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) + contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) + + if None in [question_val, answer_val, contexts_val]: + logging.error( + f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" + ) + + message = "" + if question_val is None: + message += ( + f"'question_key' is set to {question_key} which can't be found. " + ) + if answer_val is None: + message += f"'answer_key' is set to {answer_key} which can't be found. " + if contexts_val is None: + message += ( + f"'contexts_key' is set to {contexts_key} which can't be found. " + ) + message += "Please check your evaluator settings and try again." 
+ + raise ValueError(message) + + measurement = await measure_rag_consistency( input=EvaluatorInputInterface( **{ - "inputs": {"prediction": output}, + "inputs": { + "question": question_val, + "context": contexts_val, + "answer": answer_val, + }, "settings": settings_values, "credentials": lm_providers_keys, } ) ) - return Result(type="number", value=faithfulness_score) + return Result(type="number", value=measurement["outputs"]["score"]) except Exception: return Result( @@ -909,49 +922,6 @@ async def rag_faithfulness( async def measure_context_coherence( input: EvaluatorInputInterface, ) -> EvaluatorOutputInterface: - if "prediction" in input.inputs and isinstance(input.inputs["prediction"], str): - logging.error("'prediction' is most likely not BaseResponse.") - raise NotImplementedError( - "Please update the SDK to the latest version, which supports RAG evaluators." - ) - - # Get required keys for rag evaluator - question_key: Union[str, None] = input.settings.get("question_key", None) - answer_key: Union[str, None] = input.settings.get("answer_key", None) - contexts_key: Union[str, None] = input.settings.get("contexts_key", None) - - if None in [question_key, answer_key, contexts_key]: - logging.error( - f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" - ) - raise ValueError( - "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." - ) - - # Turn distributed trace into trace tree - trace = process_distributed_trace_into_trace_tree(input.inputs["trace"]) - - # Get value of required keys for rag evaluator - question_val: Any = get_field_value_from_trace_tree(trace, question_key) - answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) - contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) - - if None in [question_val, answer_val, contexts_val]: - logging.error( - f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" - ) - - message = "" - if question_val is None: - message += f"'question_key' is set to {question_key} which can't be found. " - if answer_val is None: - message += f"'answer_key' is set to {answer_key} which can't be found. " - if contexts_val is None: - message += f"'contexts_key' is set to {contexts_key} which can't be found. " - message += "Please check your evaluator settings and try again." - - raise ValueError(message) - openai_api_key = input.credentials.get("OPENAI_API_KEY", None) if not openai_api_key: @@ -959,15 +929,14 @@ async def measure_context_coherence( "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." 
) - print("QV: ", question_val) - print("AV: ", answer_val) - print("CV: ", contexts_val) # Initialize RAG evaluator to calculate context relevancy score context_rel = ContextRelevancy(api_key=openai_api_key) eval_score = await context_rel._run_eval_async( - output=answer_val, input=question_val, context=contexts_val + output=input.inputs["answer"], + input=input.inputs["question"], + context=input.inputs["context"], ) - return eval_score.score + return {"outputs": {"score": eval_score.score}} async def rag_context_relevancy( @@ -979,16 +948,67 @@ async def rag_context_relevancy( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - context_relevancy_score = await measure_context_coherence( + if isinstance(output, str): + logging.error("'output' is most likely not BaseResponse.") + raise NotImplementedError( + "Please update the SDK to the latest version, which supports RAG evaluators." + ) + + # Get required keys for rag evaluator + question_key: Union[str, None] = settings_values.get("question_key", None) + answer_key: Union[str, None] = settings_values.get("answer_key", None) + contexts_key: Union[str, None] = settings_values.get("contexts_key", None) + + if None in [question_key, answer_key, contexts_key]: + logging.error( + f"Missing evaluator settings ? {['question', question_key is None, 'answer', answer_key is None, 'context', contexts_key is None]}" + ) + raise ValueError( + "Missing required configuration keys: 'question_key', 'answer_key', or 'contexts_key'. Please check your evaluator settings and try again." + ) + + # Turn distributed trace into trace tree + trace = process_distributed_trace_into_trace_tree(output["trace"]) + + # Get value of required keys for rag evaluator + question_val: Any = get_field_value_from_trace_tree(trace, question_key) + answer_val: Any = get_field_value_from_trace_tree(trace, answer_key) + contexts_val: Any = get_field_value_from_trace_tree(trace, contexts_key) + + if None in [question_val, answer_val, contexts_val]: + logging.error( + f"Missing trace field ? {['question', question_val is None, 'answer', answer_val is None, 'context', contexts_val is None]}" + ) + + message = "" + if question_val is None: + message += ( + f"'question_key' is set to {question_key} which can't be found. " + ) + if answer_val is None: + message += f"'answer_key' is set to {answer_key} which can't be found. " + if contexts_val is None: + message += ( + f"'contexts_key' is set to {contexts_key} which can't be found. " + ) + message += "Please check your evaluator settings and try again." 
+ + raise ValueError(message) + + measurement = await measure_context_coherence( input=EvaluatorInputInterface( **{ - "inputs": {"prediction": output}, + "inputs": { + "question": question_val, + "context": contexts_val, + "answer": answer_val, + }, "settings": settings_values, "credentials": lm_providers_keys, } ) ) - return Result(type="number", value=context_relevancy_score.score) + return Result(type="number", value=measurement["outputs"]["score"]) except Exception: return Result( @@ -1049,12 +1069,18 @@ async def auto_levenshtein_distance( correct_answer = get_correct_answer(data_point, settings_values) response = await levenshtein_distance( input=EvaluatorInputInterface( - **{"inputs": {"prediction": output, "ground_truth": correct_answer}} + **{ + "inputs": {"prediction": output, "ground_truth": correct_answer}, + "settings": settings_values, + } ) ) - return Result(type="number", value=response["outputs"].get("score", "success")) + if "success" in response["outputs"]: + return Result(type="number", value=response["outputs"]["success"]) + return Result(type="number", value=response["outputs"]["score"]) except ValueError as e: + print("Exception: ", traceback.format_exc()) return Result( type="error", value=None, @@ -1063,6 +1089,7 @@ async def auto_levenshtein_distance( ), ) except Exception as e: # pylint: disable=broad-except + print("Exception: ", traceback.format_exc()) return Result( type="error", value=None, @@ -1230,7 +1257,7 @@ async def auto_semantic_similarity( "auto_similarity_match": similarity_match, "auto_semantic_similarity": semantic_similarity, "rag_faithfulness": measure_rag_consistency, - "rag_context_relevancy": measure_context_coherence + "rag_context_relevancy": measure_context_coherence, } From 133fc739e3f3c9f79ea0eaab1801f21ebb1452ea Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 16:46:50 +0100 Subject: [PATCH 015/149] refactor (tests): convert unit tests to async using pytest.mark.asyncio --- .../routers/evaluators_router.py | 4 +- .../tests/unit/test_evaluators.py | 57 ++++++++++++------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index d43d0b67a9..a3fa097c64 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -57,9 +57,7 @@ async def get_evaluators_endpoint(): @router.post("/map/", response_model=EvaluatorMappingOutputInterface) -async def evaluator_data_map( - request: Request, payload: EvaluatorMappingInputInterface -): +async def evaluator_data_map(request: Request, payload: EvaluatorMappingInputInterface): """Endpoint to map the experiment data tree to evaluator interface. 
Args: diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py index 809ab94271..c0bfbfade8 100644 --- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py +++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py @@ -59,8 +59,9 @@ ), ], ) -def test_auto_starts_with(output, settings_values, expected): - result = auto_starts_with( +@pytest.mark.asyncio +async def test_auto_starts_with(output, settings_values, expected): + result = await auto_starts_with( inputs={}, output=output, data_point={}, @@ -83,8 +84,9 @@ def test_auto_starts_with(output, settings_values, expected): ("Hello world", "Hello", True, False), ], ) -def test_auto_ends_with(output, suffix, case_sensitive, expected): - result = auto_ends_with( +@pytest.mark.asyncio +async def test_auto_ends_with(output, suffix, case_sensitive, expected): + result = await auto_ends_with( {}, output, {}, @@ -106,8 +108,9 @@ def test_auto_ends_with(output, suffix, case_sensitive, expected): ("Hello world", "abc", True, False), ], ) -def test_auto_contains(output, substring, case_sensitive, expected): - result = auto_contains( +@pytest.mark.asyncio +async def test_auto_contains(output, substring, case_sensitive, expected): + result = await auto_contains( {}, output, {}, @@ -130,8 +133,9 @@ def test_auto_contains(output, substring, case_sensitive, expected): ("Hello world", "abc,xyz", True, False), ], ) -def test_auto_contains_any(output, substrings, case_sensitive, expected): - result = auto_contains_any( +@pytest.mark.asyncio +async def test_auto_contains_any(output, substrings, case_sensitive, expected): + result = await auto_contains_any( {}, output, {}, @@ -154,8 +158,9 @@ def test_auto_contains_any(output, substrings, case_sensitive, expected): ("Hello world", "world,universe", True, False), ], ) -def test_auto_contains_all(output, substrings, case_sensitive, expected): - result = auto_contains_all( +@pytest.mark.asyncio +async def test_auto_contains_all(output, substrings, case_sensitive, expected): + result = await auto_contains_all( {}, output, {}, @@ -176,8 +181,9 @@ def test_auto_contains_all(output, substrings, case_sensitive, expected): ('{"valid": "json", "number": 123}', True), ], ) -def test_auto_contains_json(output, expected): - result = auto_contains_json({}, output, {}, {}, {}, {}) +@pytest.mark.asyncio +async def test_auto_contains_json(output, expected): + result = await auto_contains_json({}, output, {}, {}, {}, {}) assert result.value == expected @@ -243,10 +249,11 @@ def test_auto_contains_json(output, expected): ), ], ) -def test_auto_json_diff( +@pytest.mark.asyncio +async def test_auto_json_diff( ground_truth, app_output, settings_values, expected_min, expected_max ): - result = auto_json_diff({}, app_output, ground_truth, {}, settings_values, {}) + result = await auto_json_diff({}, app_output, ground_truth, {}, settings_values, {}) assert expected_min <= result.value <= expected_max @@ -282,10 +289,11 @@ def test_auto_json_diff( ), ], ) -def test_auto_semantic_similarity_match( +@pytest.mark.asyncio +async def test_auto_semantic_similarity_match( ground_truth, app_output, settings_values, expected_min, expected_max ): - result = auto_semantic_similarity( + result = await auto_semantic_similarity( {}, app_output, ground_truth, @@ -337,8 +345,9 @@ def test_auto_semantic_similarity_match( ), ], ) -def test_auto_levenshtein_distance(output, data_point, settings_values, expected): - result = auto_levenshtein_distance( 
+@pytest.mark.asyncio +async def test_auto_levenshtein_distance(output, data_point, settings_values, expected): + result = await auto_levenshtein_distance( inputs={}, output=output, data_point=data_point, @@ -364,8 +373,9 @@ def test_auto_levenshtein_distance(output, data_point, settings_values, expected # add more use cases ], ) -def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max): - result = rag_faithfulness( +@pytest.mark.asyncio +async def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max): + result = await rag_faithfulness( {}, simple_rag_trace, {}, @@ -392,8 +402,11 @@ def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max) # add more use cases ], ) -def test_rag_context_relevancy_evaluator(settings_values, expected_min, expected_max): - result = rag_context_relevancy( +@pytest.mark.asyncio +async def test_rag_context_relevancy_evaluator( + settings_values, expected_min, expected_max +): + result = await rag_context_relevancy( {}, simple_rag_trace, {}, From 7e229b3cc08e263f64834f880de89555a5623322 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 18:39:09 +0100 Subject: [PATCH 016/149] refactor (backend): resolve pr#1956 comments (r1714066376, r1714068517, r1714064544, r1714061425) --- .../agenta_backend/routers/evaluators_router.py | 2 +- .../agenta_backend/services/evaluators_service.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index a3fa097c64..afc8258541 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -69,7 +69,7 @@ async def evaluator_data_map(request: Request, payload: EvaluatorMappingInputInt """ try: - mapped_outputs = evaluators_service.map(mapping_input=payload) + mapped_outputs = await evaluators_service.map(mapping_input=payload) return mapped_outputs except Exception as e: logger.error(f"Error mapping data tree: {str(e)}") diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index f1dad33059..65a96da165 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -81,8 +81,9 @@ async def get_nested_value(data: Dict[str, Any], key: str) -> Any: return data mapping_outputs = {} + trace = process_distributed_trace_into_trace_tree(mapping_input.inputs["trace"]) for to_key, from_key in mapping_input.mapping.items(): - mapping_outputs[to_key] = get_nested_value(mapping_input.inputs, from_key) + mapping_outputs[to_key] = get_field_value_from_trace_tree(trace, from_key) return {"outputs": mapping_outputs} @@ -1080,7 +1081,6 @@ async def auto_levenshtein_distance( return Result(type="number", value=response["outputs"]["score"]) except ValueError as e: - print("Exception: ", traceback.format_exc()) return Result( type="error", value=None, @@ -1089,7 +1089,6 @@ async def auto_levenshtein_distance( ), ) except Exception as e: # pylint: disable=broad-except - print("Exception: ", traceback.format_exc()) return Result( type="error", value=None, @@ -1164,9 +1163,6 @@ async def semantic_similarity( float: the semantic similarity score """ - if not isinstance(input.inputs["prediction"], str): - output = input.inputs["prediction"].get("data", "") - api_key = 
input.credentials["OPENAI_API_KEY"] openai = AsyncOpenAI(api_key=api_key) @@ -1195,6 +1191,7 @@ async def auto_semantic_similarity( ) -> Result: if not isinstance(output, str): output = output.get("data", "") + try: correct_answer = get_correct_answer(data_point, settings_values) inputs = {"prediction": output, "ground_truth": correct_answer} @@ -1242,7 +1239,7 @@ async def auto_semantic_similarity( RUN_EVALUATOR_FUNCTIONS = { "auto_exact_match": exact_match, "auto_regex_test": regex_test, - "auto_field_match_test": field_match_test, + "field_match_test": field_match_test, "auto_webhook_test": webhook_test, "auto_custom_code_run": custom_code_run, "auto_ai_critique": ai_critique, From bbbb9258c2dfea00b8229df04713eb2032f0b69c Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 12 Aug 2024 18:41:45 +0100 Subject: [PATCH 017/149] minor refactor (backend): remove redundant 'get_nested_value' function --- .../services/evaluators_service.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 65a96da165..e0d0a2cc34 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -39,47 +39,6 @@ async def map( EvaluatorMappingOutputInterface: A dictionary containing the mapped evaluator inputs. """ - async def get_nested_value(data: Dict[str, Any], key: str) -> Any: - """ - Retrieves the nested value from a dictionary based on a dotted key path, - where list indices can be included in square brackets. - - Args: - data (Dict[str, Any]): The data dictionary to retrieve the value from. - key (str): The key path to the desired value, with possible list indices. - - Returns: - Any: The value found at the specified key path, or None if not found. - - Example: - >>> data = { - ... 'rag': { - ... 'summarizer': [{'outputs': {'report': 'The answer is 42'}}] - ... } - ... 
} - >>> key = 'rag.summarizer[0].outputs.report' - >>> get_nested_value(data, key) - 'The answer is 42' - """ - - pattern = re.compile(r"([^\[\].]+|\[\d+\])") - keys = pattern.findall(key) - - for k in keys: - if k.startswith("[") and k.endswith("]"): - # Convert list index from '[index]' to integer - k = int(k[1:-1]) - if isinstance(data, list): - data = data[k] if 0 <= k < len(data) else None - else: - return None - else: - if isinstance(data, dict): - data = data.get(k, None) - else: - return None - return data - mapping_outputs = {} trace = process_distributed_trace_into_trace_tree(mapping_input.inputs["trace"]) for to_key, from_key in mapping_input.mapping.items(): From dd805a2c01f63f0502dd0319d9c516935547bfd8 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 13 Aug 2024 11:40:37 +0100 Subject: [PATCH 018/149] refactor (backend): run evaluator_service 'evaluate' asynchronous from celery task --- .../services/evaluators_service.py | 2 +- .../agenta_backend/tasks/evaluations.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 8b61a2cb14..f158253571 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -283,7 +283,7 @@ async def auto_custom_code_run( } response = await custom_code_run( input=EvaluatorInputInterface( - **{"inputs": inputs, "settings": {"code": settings_values["code"]}} + **{"inputs": inputs, "settings": settings_values} ) ) return Result(type="number", value=response["outputs"]["score"]) diff --git a/agenta-backend/agenta_backend/tasks/evaluations.py b/agenta-backend/agenta_backend/tasks/evaluations.py index aa1a7f9727..8a1f662c41 100644 --- a/agenta-backend/agenta_backend/tasks/evaluations.py +++ b/agenta-backend/agenta_backend/tasks/evaluations.py @@ -233,14 +233,16 @@ def evaluate( ) logger.debug(f"Evaluating with evaluator: {evaluator_config_db}") - result = evaluators_service.evaluate( - evaluator_key=evaluator_config_db.evaluator_key, - output=app_output.result.value, - data_point=data_point, - settings_values=evaluator_config_db.settings_values, - app_params=app_variant_parameters, # type: ignore - inputs=data_point, - lm_providers_keys=lm_providers_keys, + result = loop.run_until_complete( + evaluators_service.evaluate( + evaluator_key=evaluator_config_db.evaluator_key, + output=app_output.result.value, + data_point=data_point, + settings_values=evaluator_config_db.settings_values, + app_params=app_variant_parameters, # type: ignore + inputs=data_point, + lm_providers_keys=lm_providers_keys, + ) ) # Update evaluators aggregated data From d3f73159fcac64edfe04df304796c594d65c6873 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 13 Aug 2024 15:15:56 +0100 Subject: [PATCH 019/149] refactor (backend): improve error handling for auto_contains_json evaluator --- .../agenta_backend/routers/evaluators_router.py | 4 ++-- .../agenta_backend/services/evaluators_service.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluators_router.py b/agenta-backend/agenta_backend/routers/evaluators_router.py index afc8258541..dcfe672d5b 100644 --- a/agenta-backend/agenta_backend/routers/evaluators_router.py +++ b/agenta-backend/agenta_backend/routers/evaluators_router.py @@ -103,11 +103,11 @@ async def evaluator_run( ) return result except Exception as e: - 
logger.error(f"Error while running evaluator: {str(e)}") + logger.error(f"Error while running {evaluator_key} evaluator: {str(e)}") raise HTTPException( status_code=500, detail={ - "message": "Error while running evaluator", + "message": f"Error while running {evaluator_key} evaluator", "stacktrace": traceback.format_exc(), }, ) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index f158253571..2e378c2bad 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -607,9 +607,18 @@ async def auto_contains_json( settings_values: Dict[str, Any], # pylint: disable=unused-argument lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + if not isinstance(output, str): + # Attempt to retrieve 'data' key from output if it's a dictionary + output = output.get("data", "") if isinstance(output, dict) else output + + # If output is still not a string, raise an exception + if not isinstance(output, str): + raise Exception( + f"Evaluator 'contains_json' requires the output to be a string, but received {type(output).__name__} instead. " + f"Please ensure the output of the application is a valid string, or that the 'data' key in the dictionary contains a string." + ) + response = await contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) From 08b9e8774d02c477a2f58be6b01ecc7f177ac176 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 13 Aug 2024 15:17:56 +0100 Subject: [PATCH 020/149] feat (tests): add tests for dictionary-based output handling in contains_json evaluator --- agenta-backend/agenta_backend/tests/unit/test_evaluators.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py index c0bfbfade8..418299ddf6 100644 --- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py +++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py @@ -179,6 +179,9 @@ async def test_auto_contains_all(output, substrings, case_sensitive, expected): ("No JSON here!", False), ("{Malformed JSON, nope!}", False), ('{"valid": "json", "number": 123}', True), + ({"data": {"message": "The capital of Azerbaijan is Baku."}}, None), + ({"data": '{"message": "The capital of Azerbaijan is Baku."}'}, True), + ({"data": "The capital of Azerbaijan is Baku."}, False), ], ) @pytest.mark.asyncio From 28320b8e52142b6c642445f2dfd48283689ddb37 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 14 Aug 2024 07:58:23 +0100 Subject: [PATCH 021/149] refactor (backend): add check for OpenAI API key with clear exception message --- .../services/evaluators_service.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index f158253571..b2a3f2c5e0 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -359,7 +359,12 @@ async def auto_ai_critique( async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: - openai_api_key = input.credentials["OPENAI_API_KEY"] + openai_api_key = input.credentials.get("OPENAI_API_KEY", None) + + if not openai_api_key: + raise 
Exception( + "No OpenAI key was found. AI Critique evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." + ) chain_run_args = { "llm_app_prompt_template": input.inputs.get("prompt_user", ""), @@ -786,7 +791,7 @@ async def measure_rag_consistency( openai_api_key = input.credentials.get("OPENAI_API_KEY", None) if not openai_api_key: raise Exception( - "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." + "No OpenAI key was found. RAG evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." ) # Initialize RAG evaluator to calculate faithfulness score @@ -885,10 +890,9 @@ async def measure_context_coherence( input: EvaluatorInputInterface, ) -> EvaluatorOutputInterface: openai_api_key = input.credentials.get("OPENAI_API_KEY", None) - if not openai_api_key: raise Exception( - "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." + "No OpenAI key was found. RAG evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." ) # Initialize RAG evaluator to calculate context relevancy score @@ -1124,8 +1128,13 @@ async def semantic_similarity( float: the semantic similarity score """ - api_key = input.credentials["OPENAI_API_KEY"] - openai = AsyncOpenAI(api_key=api_key) + openai_api_key = input.credentials.get("OPENAI_API_KEY", None) + if not openai_api_key: + raise Exception( + "No OpenAI key was found. Semantic evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." + ) + + openai = AsyncOpenAI(api_key=openai_api_key) async def encode(text: str): response = await openai.embeddings.create( From d8a1bbdc265805c62373e1197af0a78fa2c0ccf6 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 14 Aug 2024 08:00:10 +0100 Subject: [PATCH 022/149] feat (tests): add test case for auto_ai_critique and evaluators requiring OpenAI API key --- .../tests/unit/test_evaluators.py | 107 ++++++++++++++++-- 1 file changed, 99 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py index c0bfbfade8..7fa391ccad 100644 --- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py +++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py @@ -5,6 +5,7 @@ from agenta_backend.services.evaluators_service import ( auto_levenshtein_distance, + auto_ai_critique, auto_starts_with, auto_ends_with, auto_contains, @@ -18,6 +19,53 @@ ) +@pytest.mark.parametrize( + "ground_truth, output, settings_values, openai_api_key, expected_min, expected_max", + [ + ( + {"correct_answer": "The capital of Kiribati is Tarawa."}, + "The capital of Kiribati is South Tarawa.", + { + "prompt_template": "We have an LLM App that we want to evaluate its outputs. 
Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.", + "correct_answer_key": "correct_answer", + }, + os.environ.get("OPENAI_API_KEY"), + 0, + 10, + ), + ( + {"correct_answer": "The capital of Kiribati is Tarawa."}, + "The capital of Kiribati is South Tarawa.", + { + "prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.", + "correct_answer_key": "correct_answer", + }, + None, + None, + None, + ), + ], +) +@pytest.mark.asyncio +async def test_auto_ai_critique_evaluator( + ground_truth, output, settings_values, openai_api_key, expected_min, expected_max +): + result = await auto_ai_critique( + {}, + output, + ground_truth, + {}, + settings_values, + {"OPENAI_API_KEY": openai_api_key}, + ) + try: + assert expected_min <= round(result.value, 1) <= expected_max + except TypeError as error: + # exceptions + # - raised by evaluator (agenta) -> TypeError + assert not isinstance(result.value, float) or not isinstance(result.value, int) + + @pytest.mark.parametrize( "output, settings_values, expected", [ @@ -287,6 +335,15 @@ async def test_auto_json_diff( 0.0, 1.0, ), + ( + {"correct_answer": "The capital of Namibia is Windhoek."}, + "Windhoek is the capital of Namibia.", + { + "correct_answer_key": "correct_answer", + }, + None, + None, + ), ], ) @pytest.mark.asyncio @@ -301,7 +358,12 @@ async def test_auto_semantic_similarity_match( settings_values, {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")}, ) - assert expected_min <= round(result.value, 3) <= expected_max + try: + assert expected_min <= round(result.value, 1) <= expected_max + except TypeError as error: + # exceptions + # - raised by evaluator (agenta) -> TypeError + assert not isinstance(result.value, float) or not isinstance(result.value, int) @pytest.mark.parametrize( @@ -359,7 +421,7 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex @pytest.mark.parametrize( - "settings_values, expected_min, expected_max", + "settings_values, expected_min, openai_api_key, expected_max", [ ( { @@ -367,28 +429,46 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex "answer_key": "rag.reporter.outputs.report", "contexts_key": "rag.retriever.outputs.movies", }, + os.environ.get("OPENAI_API_KEY"), 0.0, 1.0, ), + ( + { + "question_key": "rag.retriever.internals.prompt", + "answer_key": "rag.reporter.outputs.report", + "contexts_key": "rag.retriever.outputs.movies", + }, + None, + None, + None, + ), # add more use cases ], ) @pytest.mark.asyncio -async def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max): +async def test_rag_faithfulness_evaluator( + settings_values, expected_min, openai_api_key, expected_max +): result = await rag_faithfulness( {}, simple_rag_trace, {}, {}, settings_values, - {"OPENAI_API_KEY": 
os.environ.get("OPENAI_API_KEY")}, + {"OPENAI_API_KEY": openai_api_key}, ) - assert expected_min <= round(result.value, 1) <= expected_max + try: + assert expected_min <= round(result.value, 1) <= expected_max + except TypeError as error: + # exceptions + # - raised by evaluator (agenta) -> TypeError + assert not isinstance(result.value, float) or not isinstance(result.value, int) @pytest.mark.parametrize( - "settings_values, expected_min, expected_max", + "settings_values, expected_min, openai_api_key, expected_max", [ ( { @@ -396,15 +476,26 @@ async def test_rag_faithfulness_evaluator(settings_values, expected_min, expecte "answer_key": "rag.reporter.outputs.report", "contexts_key": "rag.retriever.outputs.movies", }, + os.environ.get("OPENAI_API_KEY"), 0.0, 1.0, ), + ( + { + "question_key": "rag.retriever.internals.prompt", + "answer_key": "rag.reporter.outputs.report", + "contexts_key": "rag.retriever.outputs.movies", + }, + None, + None, + None, + ), # add more use cases ], ) @pytest.mark.asyncio async def test_rag_context_relevancy_evaluator( - settings_values, expected_min, expected_max + settings_values, expected_min, openai_api_key, expected_max ): result = await rag_context_relevancy( {}, @@ -412,7 +503,7 @@ async def test_rag_context_relevancy_evaluator( {}, {}, settings_values, - {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")}, + {"OPENAI_API_KEY": openai_api_key}, ) try: From 9c10025310c20e4e91e87312c580b63ba0032d55 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 18 Aug 2024 19:02:40 +0100 Subject: [PATCH 023/149] feat (tests): added mock trace data for a simple finance assisstant --- .../agenta_backend/tests/unit/test_traces.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/agenta-backend/agenta_backend/tests/unit/test_traces.py b/agenta-backend/agenta_backend/tests/unit/test_traces.py index 2357ee22ed..664eb4a4bb 100644 --- a/agenta-backend/agenta_backend/tests/unit/test_traces.py +++ b/agenta-backend/agenta_backend/tests/unit/test_traces.py @@ -69,3 +69,71 @@ ], }, } + + +simple_finance_assisstant_trace = { + "data": {}, + "trace": { + "trace_id": "66a61777a1e481ab498bc7b5", + "cost": None, + "usage": None, + "latency": 12.372497, + "spans": [ + { + "id": "66a61777a1e481ab498bc7b4", + "name": "diversify", + "parent_span_id": None, + "start_time": "2024-07-25T17:06:46.141563Z", + "end_time": "2024-07-25T17:06:46.885700Z", + "spankind": "WORKFLOW", + "metadata": {"cost": None, "latency": 2.641, "usage": None}, + "user_id": "—", + "inputs": { + "currency": "USD", + "amount": 800000, + "stocks": [], + "real_estate_properties": "Konga KFI, Almord City, Cambridge Lounge", + "percentage_returns": "6%, 9%, 15%", + "durations": "6 months, 9 months, 15 months", + }, + "internals": None, + "outputs": { + "report": [ + "**Investment Amount:**\nUSD 800,000\n\n**Real Estate Properties:**\n1. Konga KFI: 6% return, 6 months duration\n2. Almord City: 9% return, 9 months duration\n3. Cambridge Lounge: 15% return, 15 months duration\n\n**Allocation Strategy:**\nTo optimize the investment by balancing risk and return potential, I will allocate a higher percentage to properties with higher returns and longer durations while still maintaining diversification.\n\n**Allocation Breakdown:**\n1. Konga KFI: 30%\n2. Almord City: 30%\n3. Cambridge Lounge: 40%\n\n**Final Allocation:**\n1. Konga KFI: USD 240,000\n2. Almord City: USD 240,000\n3. 
Cambridge Lounge: USD 320,000" + ] + }, + "config": { + "temperature": 0.7, + "prompt_system": "You are a financial advisor that helps users allocate their investments. Users will provide an amount of money they wish to invest along with details about stocks and real estate properties. Your goal is to diversify this amount effectively.\n\nUser Inputs: Investment Amount: The total amount the user wants to invest.\nStocks: A list of stocks the user is interested in.\nReal Estate Properties: A list of properties, including their expected returns and investment durations.", + "prompt_user": "\nMy currency is {currency}. The total amount I want to invest is {amount}.\n", + "max_tokens": 2000, + "model": "gpt-4o", + "top_p": 1.0, + "invest_in_stocks": 0, + "invest_in_realestate": 1, + "frequence_penalty": 0.0, + "presence_penalty": 0.0, + }, + }, + { + "id": "66a61777a1e481ab498bc7b6", + "name": "reporter", + "parent_span_id": "66a61777a1e481ab498bc7b4", + "start_time": "2024-07-25T17:06:46.141563Z", + "end_time": "2024-07-25T17:06:46.885700Z", + "spankind": "LLM", + "metadata": {"cost": None, "latency": 2.64, "usage": None}, + "user_id": "—", + "inputs": { + "user_prompt": "\nMy currency is USD. The total amount I want to invest is 800000.\n\nThe user wants to invest in the following stocks: [].\n\nThe user wants to invest in the following real estate properties: Konga KFI, Almord City, Cambridge Lounge. The percentage returns for these properties are 6%, 9%, 15%, and the investment durations are 6 months, 9 months, 15 months.\n" + }, + "internals": None, + "outputs": { + "report": [ + "**Investment Amount:**\nUSD 800,000\n\n**Real Estate Properties:**\n1. Konga KFI: 6% return, 6 months duration\n2. Almord City: 9% return, 9 months duration\n3. Cambridge Lounge: 15% return, 15 months duration\n\n**Allocation Strategy:**\nTo optimize the investment by balancing risk and return potential, I will allocate a higher percentage to properties with higher returns and longer durations while still maintaining diversification.\n\n**Allocation Breakdown:**\n1. Konga KFI: 30%\n2. Almord City: 30%\n3. Cambridge Lounge: 40%\n\n**Final Allocation:**\n1. Konga KFI: USD 240,000\n2. Almord City: USD 240,000\n3. 
Cambridge Lounge: USD 320,000" + ] + }, + }, + ], + }, +} From 03e28ee081ce1caf37d5779609d719d0d636f6ad Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 18 Aug 2024 19:04:44 +0100 Subject: [PATCH 024/149] feat (tests): created fixtures for evaluator experiment tree map and run endpoints --- .../tests/variants_main_router/conftest.py | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 0d86e074c9..7c3d20f50b 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -13,6 +13,10 @@ ImageDB, AppVariantDB, ) +from agenta_backend.tests.unit.test_traces import ( + simple_rag_trace, + simple_finance_assisstant_trace, +) import httpx from sqlalchemy.future import select @@ -280,3 +284,175 @@ def auto_ai_critique_evaluator_config(): @pytest.fixture() def deploy_to_environment_payload(): return {"environment_name": "string", "variant_id": "string"} + + +@pytest.fixture() +def rag_experiment_data_tree(): + return simple_rag_trace + + +@pytest.fixture() +def simple_experiment_data_tree(): + return simple_finance_assisstant_trace + + +@pytest.fixture() +def mapper_to_run_auto_exact_match_evaluation(): + return { + "prediction": "diversify.reporter.outputs.report[0]", + } + + +@pytest.fixture() +def mapper_to_run_rag_faithfulness_evaluation(): + return { + "question": "rag.retriever.internals.prompt", + "contexts": "rag.retriever.outputs.movies", + "answer": "rag.reporter.outputs.report", + } + + +@pytest.fixture() +def rag_faithfulness_evaluator_run_inputs(): + return { + "question": "List 6 movies about witches in the genre of fiction.", + "context": [ + "The Craft (1996) in ['Drama', 'Fantasy', 'Horror']: A newcomer to a Catholic prep high school falls in with a trio of outcast teenage girls who practice witchcraft and they all soon conjure up various spells and curses against those who even slightly anger them.", + "Oz the Great and Powerful (2013) in ['Adventure', 'Family', 'Fantasy']: A small-time magician is swept away to an enchanted land and is forced into a power struggle between three witches.", + "Snow White: A Tale of Terror (1997) in ['Fantasy', 'Horror']: In this dark take on the fairy tale, the growing hatred of a noblewoman, secretly a practitioner of the dark arts, for her stepdaughter, and the witch's horrifying attempts to kill her.", + "Into the Woods (2014) in ['Adventure', 'Fantasy', 'Musical']: A witch tasks a childless baker and his wife with procuring magical items from classic fairy tales to reverse the curse put on their family tree.", + "Wicked Stepmother (1989) in ['Comedy', 'Fantasy']: A mother/daughter pair of witches descend on a yuppie family's home and cause havoc, one at a time since they share one body & the other must live in a cat the rest of the time. 
Now it's up...", + "Hocus Pocus (1993) in ['Comedy', 'Family', 'Fantasy']: After three centuries, three witch sisters are resurrected in Salem Massachusetts on Halloween night, and it is up to two teen-agers, a young girl, and an immortal cat to put an end to the witches' reign of terror once and for all.", + "Warlock (1989) in ['Action', 'Fantasy', 'Horror']: A warlock flees from the 17th to the 20th century, with a witch-hunter in hot pursuit.", + "The Hexer (2001) in ['Adventure', 'Fantasy']: The adventures of Geralt of Rivea, \"The Witcher\".", + "Heavy Metal (1981) in ['Animation', 'Adventure', 'Fantasy']: A glowing orb terrorizes a young girl with a collection of stories of dark fantasy, eroticism and horror.", + ], + "answer": 'Witches in fiction are depicted through a mix of horror, fantasy, and dark comedy. \n\n"The Craft" (1996) delves into the complexities of teenage witchcraft, showcasing both empowerment and the darker repercussions of their actions. \n"Snow White: A Tale of Terror" (1997) offers a sinister twist on the classic story, highlighting the witch\'s envy and vengeful nature. \n"Hocus Pocus" (1993) delivers a comedic and adventurous take on witchcraft, as three resurrected witches wreak havoc in contemporary Salem', + } + + +@pytest.fixture() +def custom_code_snippet(): + return "from typing import Dict\nfrom random import uniform\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: str, # output of the llm app\n datapoint: Dict[str, str] # contains the testset row\n) -> float:\n return uniform(0.1, 0.9)" + + +@pytest.fixture() +def evaluators_payload_data(custom_code_snippet): + prompt_template = "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options." 
+ return { + "auto_regex_test": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": { + "regex_pattern": r"The\s+answer\s+is\s+42[.,]?", + "regex_should_match": True, + }, + }, + "field_match_test": { + "inputs": { + "ground_truth": {"message": "The correct answer is 42"}, + "prediction": '{"message": "The correct answer is 42"}', + }, + "settings": {"json_field": "ground_truth"}, + }, + "auto_custom_code_run": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + "app_config": {}, + }, + "settings": { + "code": custom_code_snippet, + "correct_answer_key": "correct_answer", + }, + }, + "auto_ai_critique": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": { + "prompt_template": prompt_template, + "correct_answer_key": "correct_answer", + }, + "credentials": {"OPENAI_API_KEY": os.environ["OPENAI_API_KEY"]}, + }, + "auto_starts_with": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": {"prefix": "The", "case_sensitive": False}, + }, + "auto_ends_with": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": {"suffix": "42", "case_sensitive": False}, + }, + "auto_contains": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": {"substring": "answer is", "case_sensitive": False}, + }, + "auto_contains_any": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": {"substrings": "The,answer,42", "case_sensitive": False}, + }, + "auto_contains_all": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": {"substrings": "The,answer,is,42", "case_sensitive": False}, + }, + "auto_contains_json": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": '{"message": "The answer is 42"}', + }, + }, + "auto_json_diff": { + "inputs": { + "ground_truth": '{"message": "The correct answer is 42"}', + "prediction": '{"message": "The answer is 42"}', + }, + "settings": { + "compare_schema_only": True, + "predict_keys": True, + "case_insensitive_keys": False, + }, + }, + "auto_levenshtein_distance": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": {"threshold": 0.4}, + }, + "auto_similarity_match": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "settings": { + "similarity_threshold": 0.4, + "correct_answer_key": "correct_answer", + }, + }, + "auto_semantic_similarity": { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + }, + "credentials": {"OPENAI_API_KEY": os.environ["OPENAI_API_KEY"]}, + }, + } From 6b6b6d6e3d3d0faacc5ec391850f50dbaf911bf6 Mon Sep 17 00:00:00 2001 From: Abram Date: Sun, 18 Aug 2024 19:05:22 +0100 Subject: [PATCH 025/149] feat (tests): created tests for evaluator experiment tree map and run endpoint --- .../test_variant_evaluators_router.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index ecd5e0a02d..4d28655b43 
100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -358,3 +358,103 @@ async def test_remove_running_template_app_container(): assert True except: assert False + + +@pytest.mark.asyncio +async def test_rag_experiment_tree_maps_correctly( + rag_experiment_data_tree, mapper_to_run_rag_faithfulness_evaluation +): + payload = { + "inputs": rag_experiment_data_tree, + "mapping": mapper_to_run_rag_faithfulness_evaluation, + } + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/map/", + json=payload, + timeout=timeout, + ) + response_data = response.json() + assert response.status_code == 200 + assert ( + "question" in response_data["outputs"] + and "contexts" in response_data["outputs"] + and "answer" in response_data["outputs"] + ) == True + + +@pytest.mark.asyncio +async def test_simple_experiment_tree_maps_correctly( + simple_experiment_data_tree, mapper_to_run_auto_exact_match_evaluation +): + payload = { + "inputs": simple_experiment_data_tree, + "mapping": mapper_to_run_auto_exact_match_evaluation, + } + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/map/", + json=payload, + timeout=timeout, + ) + response_data = response.json() + assert response.status_code == 200 + assert ( + "prediction" in response_data["outputs"] + and isinstance(response_data["outputs"]["prediction"], str) + ) == True + + +@pytest.mark.asyncio +async def test_rag_faithfulness_evaluator_run( + rag_faithfulness_evaluator_run_inputs, +): + payload = { + "inputs": rag_faithfulness_evaluator_run_inputs, + "credentials": {"OPENAI_API_KEY": os.environ["OPENAI_API_KEY"]}, + } + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/rag_faithfulness/run/", + json=payload, + timeout=timeout, + ) + assert response.status_code == 200 + assert 0.0 <= response.json()["outputs"]["score"] <= 1.0 + assert isinstance(response.json()["outputs"]["score"], float) + + +@pytest.mark.asyncio +async def test_custom_code_evaluator_run(custom_code_snippet): + payload = { + "inputs": { + "ground_truth": "The correct answer is 42", + "prediction": "The answer is 42", + "app_config": {}, + }, + "settings": { + "code": custom_code_snippet, + "correct_answer_key": "correct_answer", + }, + } + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/auto_custom_code_run/run/", + json=payload, + timeout=timeout, + ) + assert response.status_code == 200 + assert 0.0 <= response.json()["outputs"]["score"] <= 1.0 + assert isinstance(response.json()["outputs"]["score"], float) + + +@pytest.mark.asyncio +async def test_run_evaluators_via_api( + evaluators_payload_data, +): + evaluators_response_status_code = [] + for evaluator_key, evaluator_payload in evaluators_payload_data.items(): + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluators/{evaluator_key}/run/", + json=evaluator_payload, + timeout=timeout, + ) + evaluators_response_status_code.append(response.status_code) + + assert evaluators_response_status_code.count(200) == 14 From e02fefaa53491d15f77eb2de1f9ee40681006aaf Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:50:52 +0100 Subject: [PATCH 026/149] refactor (backend): rewrite db function to check if evaluators exist in evaluators --- .../agenta_backend/services/db_manager.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git 
a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 08ac7b00f9..077fb07ef4 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -2963,13 +2963,17 @@ async def fetch_evaluator_config(evaluator_config_id: str): return evaluator_config -async def check_if_ai_critique_exists_in_list_of_evaluators_configs( - evaluators_configs_ids: List[str], +async def check_if_evaluators_exist_in_list_of_evaluators_configs( + evaluators_configs_ids: List[str], evaluators_keys: List[str] ) -> bool: - """Fetch evaluator configurations from the database. + """Check if the provided evaluators exist in the database within the given evaluator configurations. + + Arguments: + evaluators_configs_ids (List[str]): List of evaluator configuration IDs to search within. + evaluators_keys (List[str]): List of evaluator keys to check for existence. Returns: - EvaluatorConfigDB: the evaluator configuration object. + bool: True if all evaluators exist, False otherwise. """ async with db_engine.get_session() as session: @@ -2978,15 +2982,18 @@ async def check_if_ai_critique_exists_in_list_of_evaluators_configs( for evaluator_config_id in evaluators_configs_ids ] - query = select(EvaluatorConfigDB).where( + query = select(EvaluatorConfigDB.id, EvaluatorConfigDB.evaluator_key).where( EvaluatorConfigDB.id.in_(evaluator_config_uuids), - EvaluatorConfigDB.evaluator_key == "auto_ai_critique", + EvaluatorConfigDB.evaluator_key.in_(evaluators_keys), ) - result = await session.execute(query) - evaluators_configs = result.scalars().all() - return bool(evaluators_configs) + # NOTE: result.all() returns the records as a list of tuples + # 0 is the evaluator_id and 1 is evaluator_key + fetched_evaluators_keys = {config[1] for config in result.all()} + + # Ensure the passed evaluators are found in the fetched evaluator keys + return any(key in fetched_evaluators_keys for key in evaluators_keys) async def fetch_evaluator_config_by_appId( From 4cee49fa9acc04e635e321e854e1a6b27b7fbc74 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:51:24 +0100 Subject: [PATCH 027/149] chore (backend): remove deprecated function 'check_ai_critique_inputs' --- .../services/evaluator_manager.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 586c59b282..84dd456e2d 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -166,28 +166,3 @@ async def create_ready_to_use_evaluators(app: AppDB): evaluator_key=evaluator.key, settings_values=settings_values, ) - - -async def check_ai_critique_inputs( - evaluators_configs: List[str], lm_providers_keys: Optional[Dict[str, Any]] -) -> Tuple[bool, Optional[JSONResponse]]: - """ - Checks if AI critique exists in evaluators configs and validates lm_providers_keys. - - Args: - evaluators_configs (List[str]): List of evaluator configurations. - lm_providers_keys (Optional[Dict[str, Any]]): Language model provider keys. - - Returns: - Tuple[bool, Optional[JSONResponse]]: Returns a tuple containing a boolean indicating success, - and a JSONResponse in case of error. 
- """ - if await db_manager.check_if_ai_critique_exists_in_list_of_evaluators_configs( - evaluators_configs - ): - if not lm_providers_keys: - return False, JSONResponse( - {"detail": "Missing LM provider Key"}, - status_code=400, - ) - return True, None From c6ee3c8c8989f1bedb5e6e4b695fc0c7dfebf333 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:53:54 +0100 Subject: [PATCH 028/149] feat (backend): implemented helper functions to: - format llm provider keys - and to ensure required llm keys exist in the provided evaluator configs --- .../agenta_backend/services/helpers.py | 66 ++++++++++++++++++- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 7b9510a0b7..04208291f3 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -1,6 +1,9 @@ import json -from typing import List, Dict, Any, Tuple, Union -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone +from typing import List, Dict, Any, Union, Tuple + +from agenta_backend.services import db_manager +from agenta_backend.models.api.evaluation_model import LMProvidersEnum def format_inputs(list_of_dictionaries: List[Dict[str, Any]]) -> Dict: @@ -76,3 +79,62 @@ def convert_to_utc_datetime(dt: Union[datetime, str, None]) -> datetime: if dt.tzinfo is None: return dt.replace(tzinfo=timezone.utc) return dt + + +def format_llm_provider_keys( + llm_provider_keys: Dict[LMProvidersEnum, str] +) -> Dict[str, str]: + """Formats a dictionary of LLM provider keys into a dictionary of strings. + + Args: + llm_provider_keys (Dict[LMProvidersEnum, str]): LLM provider keys + + Returns: + Dict[str, str]: formatted llm provider keys + + Example: + Input: {LMProvidersEnum.MISTRAL_API_KEY: '...', ...} + Output: {'MISTRAL_API_KEY': '...', ...} + """ + + llm_provider_keys = {key.value: value for key, value in llm_provider_keys.items()} + return llm_provider_keys + + +async def ensure_required_llm_keys_exist( + evaluator_configs: List[str], llm_provider_keys: Dict[str, str] +) -> Tuple[bool, None]: + """ + Validates if necessary LLM API keys are present when required evaluators are used. + + Args: + evaluator_configs (List[str]): List of evaluator configurations to check. + llm_provider_keys (Dict[str, str]): Dictionary of LLM provider keys (e.g., {"OPENAI_API_KEY": "your-key"}). + + Returns: + Tuple[bool, None]: Returns (True, None) if validation passes. + + Raises: + ValueError: If an evaluator requiring LLM keys is configured but no LLM API key is provided. + + """ + + evaluators_requiring_llm_keys = [ + "rag_context_relevancy", + "rag_faithfulness", + "auto_ai_critique", + "auto_semantic_similarity", + ] + + evaluators_found = ( + await db_manager.check_if_evaluators_exist_in_list_of_evaluators_configs( + evaluator_configs, evaluators_requiring_llm_keys + ) + ) + + if evaluators_found and "OPENAI_API_KEY" not in llm_provider_keys: + raise ValueError( + "OpenAI API key is required to run one or more of the specified evaluators." 
+ ) + + return True, None From a8c1273bbc5e0da03c2aef0edc3be65a36e79070 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:54:52 +0100 Subject: [PATCH 029/149] refactor (backend): update evaluator_router to: - properly format llm provider keys - and check that the required llm keys exists --- .../agenta_backend/routers/evaluation_router.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 3ebe171772..19bb6ec2af 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -5,6 +5,7 @@ from fastapi.responses import JSONResponse from fastapi import HTTPException, Request, status, Response, Query +from agenta_backend.services import helpers from agenta_backend.models import converters from agenta_backend.tasks.evaluations import evaluate from agenta_backend.utils.common import APIRouter, isCloudEE @@ -15,9 +16,6 @@ NewEvaluation, DeleteEvaluation, ) -from agenta_backend.services.evaluator_manager import ( - check_ai_critique_inputs, -) if isCloudEE(): from agenta_backend.commons.models.shared_models import Permission @@ -112,8 +110,9 @@ async def create_evaluation( status_code=403, ) - success, response = await check_ai_critique_inputs( - payload.evaluators_configs, payload.lm_providers_keys + llm_provider_keys = helpers.format_llm_provider_keys(payload.lm_providers_keys) + success, response = await helpers.ensure_required_llm_keys_exist( + payload.evaluators_configs, llm_provider_keys ) if not success: return response @@ -134,8 +133,8 @@ async def create_evaluation( evaluators_config_ids=payload.evaluators_configs, testset_id=payload.testset_id, evaluation_id=evaluation.id, - rate_limit_config=payload.rate_limit.dict(), - lm_providers_keys=payload.lm_providers_keys, + rate_limit_config=payload.rate_limit.model_dump(), + lm_providers_keys=llm_provider_keys, ) evaluations.append(evaluation) From f3367ef4476fd5f20aa7cfa7bac60b290aefafcb Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 20 Aug 2024 08:16:32 +0100 Subject: [PATCH 030/149] feat (tests): added test to create evaluation with no llm keys --- .../tests/variants_main_router/conftest.py | 10 ++++ .../test_variant_evaluators_router.py | 57 +++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 0d86e074c9..d636052b93 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -217,6 +217,16 @@ def app_variant_parameters_updated(): } +@pytest.fixture() +def evaluators_requiring_llm_keys(): + return [ + "rag_context_relevancy", + "rag_faithfulness", + "auto_ai_critique", + "auto_semantic_similarity", + ] + + @pytest.fixture() def auto_exact_match_evaluator_config(): return { diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index ecd5e0a02d..15ef905f29 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -269,6 +269,63 @@ async def 
create_evaluation_with_evaluator(evaluator_config_name): await wait_for_evaluation_to_finish(evaluation_id) +@pytest.mark.asyncio +async def test_create_evaluation_with_no_llm_keys(evaluators_requiring_llm_keys): + async with db_engine.get_session() as session: + app_result = await session.execute(select(AppDB).filter_by(app_name=APP_NAME)) + app = app_result.scalars().first() + + app_variant_result = await session.execute( + select(AppVariantDB).filter_by(app_id=app.id) + ) + app_variant = app_variant_result.scalars().first() + + testset_result = await session.execute( + select(TestSetDB).filter_by(app_id=app.id) + ) + testset = testset_result.scalars().first() + + # Prepare payload + payload = { + "app_id": str(app.id), + "variant_ids": [str(app_variant.id)], + "evaluators_configs": [], + "testset_id": str(testset.id), + "lm_providers_keys": {"MISTRAL_API_KEY": OPEN_AI_KEY}, + "rate_limit": { + "batch_size": 10, + "max_retries": 3, + "retry_delay": 3, + "delay_between_batches": 5, + }, + } + + # Fetch evaluator configs + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={payload['app_id']}", + timeout=timeout, + ) + list_of_configs_ids = [] + evaluator_configs = response.json() + for evaluator_config in evaluator_configs: + if evaluator_config["evaluator_key"] in evaluators_requiring_llm_keys: + list_of_configs_ids.append(evaluator_config["id"]) + + # Update payload with list of configs ids + payload["evaluators_configs"] = list_of_configs_ids + + # Make request to create evaluation + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout + ) + + assert response.status_code == 500 + assert ( + response.json()["detail"] + == "OpenAI API key is required to run one or more of the specified evaluators." 
+ ) + + @pytest.mark.asyncio async def test_create_evaluation_auto_exact_match(): await create_evaluation_with_evaluator("auto_exact_match_evaluator_config") From c499a192a3cb21044e00cb72526ae865dae814fb Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 20 Aug 2024 10:33:16 +0100 Subject: [PATCH 031/149] refactor (backend): added - configurable setting to evaluators requiring llm api keys - update fixture to make use of centralized evaluators --- .../resources/evaluators/evaluators.py | 139 +++++++++++++++++- .../agenta_backend/services/helpers.py | 10 +- .../tests/variants_main_router/conftest.py | 12 +- 3 files changed, 150 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py index 55e1105e16..c902bf025a 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py @@ -1,4 +1,12 @@ rag_evaluator_settings_template = { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": True, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "question_key": { "label": "Question Key", "default": "", @@ -30,6 +38,14 @@ "key": "auto_exact_match", "direct_use": True, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -46,7 +62,16 @@ "name": "Contains Json", "key": "auto_contains_json", "direct_use": True, - "settings_template": {}, + "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, + }, "description": "Contains Json evaluator checks if the output contains the specified JSON structure.", "oss": True, }, @@ -55,6 +80,14 @@ "key": "auto_similarity_match", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "similarity_threshold": { "label": "Similarity Threshold", "type": "number", @@ -82,6 +115,14 @@ "direct_use": False, "description": "Semantic Similarity Match evaluator measures the similarity between two pieces of text by analyzing their meaning and context. It compares the semantic content, providing a score that reflects how closely the texts match in terms of meaning, rather than just exact word matches.", "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "advanced": True, + "default": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -99,6 +140,14 @@ "direct_use": False, "description": "Regex Test evaluator checks if the generated answer matches a regular expression pattern. 
You need to provide the regex expression and specify whether an answer is correct if it matches or does not match the regex.", "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "regex_pattern": { "label": "Regex Pattern", "type": "regex", @@ -120,6 +169,14 @@ "key": "field_match_test", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "json_field": { "label": "JSON Field", "type": "string", @@ -145,6 +202,14 @@ "direct_use": False, "description": "Compares the generated JSON output to a ground truth JSON and returns a normalized score between 0 and 1 based on their differences.", "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "compare_schema_only": { "label": "Compare Schema Only", "type": "boolean", @@ -182,6 +247,14 @@ "key": "auto_ai_critique", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": True, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "prompt_template": { "label": "Prompt Template", "type": "text", @@ -206,6 +279,14 @@ "key": "auto_custom_code_run", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "code": { "label": "Evaluation Code", "type": "code", @@ -230,6 +311,14 @@ "key": "auto_webhook_test", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "webhook_url": { "label": "Webhook URL", "type": "string", @@ -253,6 +342,14 @@ "key": "auto_starts_with", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "prefix": { "label": "prefix", "type": "string", @@ -274,6 +371,14 @@ "key": "auto_ends_with", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -295,6 +400,14 @@ "key": "auto_contains", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + 
"default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -316,6 +429,14 @@ "key": "auto_contains_any", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -337,6 +458,14 @@ "key": "auto_contains_all", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -358,6 +487,14 @@ "key": "auto_levenshtein_distance", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "threshold": { "label": "Threshold", "type": "number", diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 04208291f3..1731dfe64f 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -4,6 +4,7 @@ from agenta_backend.services import db_manager from agenta_backend.models.api.evaluation_model import LMProvidersEnum +from agenta_backend.resources.evaluators.evaluators import get_all_evaluators def format_inputs(list_of_dictionaries: List[Dict[str, Any]]) -> Dict: @@ -120,12 +121,11 @@ async def ensure_required_llm_keys_exist( """ evaluators_requiring_llm_keys = [ - "rag_context_relevancy", - "rag_faithfulness", - "auto_ai_critique", - "auto_semantic_similarity", + evaluator["key"] + for evaluator in get_all_evaluators() + if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) + is True ] - evaluators_found = ( await db_manager.check_if_evaluators_exist_in_list_of_evaluators_configs( evaluator_configs, evaluators_requiring_llm_keys diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index d636052b93..f0bb764814 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -13,6 +13,7 @@ ImageDB, AppVariantDB, ) +from agenta_backend.resources.evaluators.evaluators import get_all_evaluators import httpx from sqlalchemy.future import select @@ -219,12 +220,13 @@ def app_variant_parameters_updated(): @pytest.fixture() def evaluators_requiring_llm_keys(): - return [ - "rag_context_relevancy", - "rag_faithfulness", - "auto_ai_critique", - "auto_semantic_similarity", + evaluators_requiring_llm_keys = [ + evaluator["key"] + for evaluator in get_all_evaluators() + if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) + is True ] + return evaluators_requiring_llm_keys @pytest.fixture() From d1fe5aac7b58535dd52c2254f1b3b10d411b92b9 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 20 Aug 2024 11:25:19 
+0100 Subject: [PATCH 032/149] chore (backend): remove redundant error message --- agenta-backend/agenta_backend/services/evaluators_service.py | 1 - 1 file changed, 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 2e378c2bad..e4d96e72cd 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -616,7 +616,6 @@ async def auto_contains_json( if not isinstance(output, str): raise Exception( f"Evaluator 'contains_json' requires the output to be a string, but received {type(output).__name__} instead. " - f"Please ensure the output of the application is a valid string, or that the 'data' key in the dictionary contains a string." ) response = await contains_json( From 169a9946d6131e9faefbc9d6690cb9c66c374246 Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 20 Aug 2024 14:35:48 +0100 Subject: [PATCH 033/149] chore (backend): cleanup in levenshtein distance evaluator --- .../agenta_backend/services/evaluators_service.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index f158253571..417c874899 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -988,15 +988,9 @@ async def levenshtein_distance( ) -> EvaluatorOutputInterface: prediction = input.inputs["prediction"] ground_truth = input.inputs["ground_truth"] - # if len(prediction) < len(ground_truth): - # return await levenshtein_distance( - # input=EvaluatorInputInterface( - # **{"inputs": {"prediction": prediction, "ground_truth": ground_truth}} - # ) - # ) # pylint: disable=arguments-out-of-order if len(ground_truth) == 0: - return len(s1) + return len(prediction) previous_row = range(len(ground_truth) + 1) for i, c1 in enumerate(prediction): From c663fb4b08fad5c89d708cd1ebc49cf6ad453b43 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 21 Aug 2024 18:19:11 +0100 Subject: [PATCH 034/149] style (website): format cookbooks with black@23.12.0 --- website/scripts/generate_cookbooks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/website/scripts/generate_cookbooks.py b/website/scripts/generate_cookbooks.py index b84cd2ee3d..6b8a809224 100644 --- a/website/scripts/generate_cookbooks.py +++ b/website/scripts/generate_cookbooks.py @@ -5,6 +5,7 @@ import os import sys + def make_header(notebook_path): github_uri = "Agenta-AI/agenta/blob/main/cookbook" github_path = f"https://github.com/{github_uri}/{os.path.basename(notebook_path)}" @@ -52,11 +53,11 @@ def export_notebook(notebook_path, output_path): title = convert_to_title_case(os.path.basename(notebook_path)) # Add the title to the top of the markdown file - title_header = f"---\ntitle: \"{title}\"\n---\n\n" - + title_header = f'---\ntitle: "{title}"\n---\n\n' + # Add the header below the title header = make_header(notebook_path) - + # Combine the title, header, and the output markdown content output = title_header + header + output @@ -103,4 +104,4 @@ def main(notebook_filename=None): if __name__ == "__main__": # Get the filename argument from the command line notebook_filename = sys.argv[1] if len(sys.argv) > 1 else None - main(notebook_filename) \ No newline at end of file + main(notebook_filename) From 23be8b6edd5dab2c0af9e30dadd3fe68ff53636e 
Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 21 Aug 2024 21:04:07 +0100 Subject: [PATCH 035/149] refactor (backend): centralize validation of string and json output and use functions in evaluators --- .../services/evaluators_service.py | 136 +++++++++++------- 1 file changed, 81 insertions(+), 55 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 4132388716..e3bd9ae6a7 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -29,6 +29,69 @@ logger.setLevel(logging.DEBUG) +def validate_string_output( + evaluator_key: str, output: Union[str, Dict[str, Any]] +) -> str: + """Checks and validate the output to be of type string. + + Args: + evaluator_key (str): the key of the evaluator + output (Union[str, Dict[str, Any]]): the llm response + + Raises: + Exception: requires output to be a string + + Returns: + str: output + """ + + output = output.get("data", "") if isinstance(output, dict) else output + if not isinstance(output, str): + raise Exception( + f"Evaluator {evaluator_key} requires the output to be a string, but received {type(output).__name__} instead. " + ) + return output + + +def validate_json_output(evaluator_key: str, output: Union[str, Dict[str, Any]]) -> str: + """Checks and validate the output to be of type JSON string. + + Args: + evaluator_key (str): the key of the evaluator + output (Union[str, Dict[str, Any]]): the llm response + + Raises: + Exception: requires output to be a JSON string + + Returns: + str: output + """ + + output = output.get("data", "") if isinstance(output, dict) else output + if isinstance(output, dict): + output = json.dumps(output) + elif isinstance(output, str): + try: + json.loads(output) + except json.JSONDecodeError: + raise Exception( + f"Evaluator {evaluator_key} requires the output to be a JSON string." + ) + + if not isinstance( + output, + ( + str, + dict, + ), + ): + raise Exception( + f"Evaluator {evaluator_key} requires the output to be either a JSON string or object, but received {type(output).__name__} instead." + ) + + return output + + async def map( mapping_input: EvaluatorMappingInputInterface, ) -> EvaluatorMappingOutputInterface: @@ -94,9 +157,9 @@ async def auto_exact_match( Returns: Result: A Result object containing the evaluation result. 
""" - if not isinstance(output, str): - output = output.get("data", "") + try: + output = validate_string_output("exact_match", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = {"ground_truth": correct_answer, "prediction": output} response = exact_match(input=EvaluatorInputInterface(**{"inputs": inputs})) @@ -136,9 +199,8 @@ async def auto_regex_test( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("regex_test", output) inputs = {"ground_truth": data_point, "prediction": output} response = await regex_test( input=EvaluatorInputInterface( @@ -174,9 +236,8 @@ async def auto_field_match_test( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("field_match_test", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = {"ground_truth": correct_answer, "prediction": output} response = await field_match_test( @@ -210,9 +271,8 @@ async def auto_webhook_test( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("webhook_test", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = {"prediction": output, "ground_truth": correct_answer} response = await webhook_test( @@ -272,9 +332,8 @@ async def auto_custom_code_run( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("custom_code_run", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = { "app_config": app_params, @@ -332,9 +391,9 @@ async def auto_ai_critique( Returns: Result: Evaluation result. 
""" - if not isinstance(output, str): - output = output.get("data", "") + try: + output = validate_string_output("ai_critique", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = { "prompt_user": app_params.get("prompt_user", ""), @@ -391,9 +450,8 @@ async def auto_starts_with( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("starts_with", output) inputs = {"prediction": output} response = await starts_with( input=EvaluatorInputInterface( @@ -433,9 +491,8 @@ async def auto_ends_with( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("ends_with", output) inputs = {"prediction": output} response = await ends_with( input=EvaluatorInputInterface( @@ -476,9 +533,8 @@ async def auto_contains( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("contains", output) inputs = {"prediction": output} response = await contains( input=EvaluatorInputInterface( @@ -519,9 +575,8 @@ async def auto_contains_any( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("contains_any", output) inputs = {"prediction": output} response = await contains_any( input=EvaluatorInputInterface( @@ -564,9 +619,8 @@ async def auto_contains_all( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("contains_all", output) response = await contains_all( input=EvaluatorInputInterface( **{"inputs": {"prediction": output}, "settings": settings_values} @@ -608,16 +662,7 @@ async def auto_contains_json( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - if not isinstance(output, str): - # Attempt to retrieve 'data' key from output if it's a dictionary - output = output.get("data", "") if isinstance(output, dict) else output - - # If output is still not a string, raise an exception - if not isinstance(output, str): - raise Exception( - f"Evaluator 'contains_json' requires the output to be a string, but received {type(output).__name__} instead. " - ) - + output = validate_json_output("contains_json", output) response = await contains_json( input=EvaluatorInputInterface(**{"inputs": {"prediction": output}}) ) @@ -758,22 +803,7 @@ async def auto_json_diff( lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: try: - output = output.get("data", "") if isinstance(output, dict) else output - - if isinstance(output, dict): - output = json.dumps(output) - elif isinstance(output, str): - try: - json.loads(output) - except: - raise Exception( - f"Evaluator 'auto_json_diff' requires string outputs to be JSON strings." - ) - else: - raise Exception( - f"Evaluator 'auto_json_diff' requires the output to be either a JSON string or a JSON object, but received {type(output).__name__} instead." 
- ) - + output = validate_json_output("json_diff", output) correct_answer = get_correct_answer(data_point, settings_values) response = await json_diff( input=EvaluatorInputInterface( @@ -1043,9 +1073,8 @@ async def auto_levenshtein_distance( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], # pylint: disable=unused-argument ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("levenshtein_distance", output) correct_answer = get_correct_answer(data_point, settings_values) response = await levenshtein_distance( input=EvaluatorInputInterface( @@ -1086,9 +1115,8 @@ async def auto_similarity_match( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") try: + output = validate_string_output("similarity_match", output) correct_answer = get_correct_answer(data_point, settings_values) response = await similarity_match( input=EvaluatorInputInterface( @@ -1168,10 +1196,8 @@ async def auto_semantic_similarity( settings_values: Dict[str, Any], lm_providers_keys: Dict[str, Any], ) -> Result: - if not isinstance(output, str): - output = output.get("data", "") - try: + output = validate_string_output("semantic_similarity", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = {"prediction": output, "ground_truth": correct_answer} response = await semantic_similarity( From b6db4f15d4891bdf97fc9d4540a17af53a589ede Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 21 Aug 2024 21:09:19 +0100 Subject: [PATCH 036/149] feat (tests): update parameters for BaseResponse compatibility and reflect changes in test cases - Added parameters in 'test_auto_json_diff' for BaseResponse compatibility - Updated parameters in 'test_auto_contains_json' to align with recent changes --- .../tests/unit/test_evaluators.py | 47 ++++++++++++++++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py index 87388eca4b..eba88ac1f4 100644 --- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py +++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py @@ -1,8 +1,7 @@ import os import pytest -from test_traces import simple_rag_trace - +from agenta_backend.tests.unit.test_traces import simple_rag_trace from agenta_backend.services.evaluators_service import ( auto_levenshtein_distance, auto_starts_with, @@ -175,13 +174,13 @@ async def test_auto_contains_all(output, substrings, case_sensitive, expected): @pytest.mark.parametrize( "output, expected", [ - ('Some random text {"key": "value"} more text', True), - ("No JSON here!", False), - ("{Malformed JSON, nope!}", False), + ('Some random text {"key": "value"} more text', None), + ("No JSON here!", None), + ("{Malformed JSON, nope!}", None), ('{"valid": "json", "number": 123}', True), - ({"data": {"message": "The capital of Azerbaijan is Baku."}}, None), + ({"data": {"message": "The capital of Azerbaijan is Baku."}}, True), ({"data": '{"message": "The capital of Azerbaijan is Baku."}'}, True), - ({"data": "The capital of Azerbaijan is Baku."}, False), + ({"data": "The capital of Azerbaijan is Baku."}, None), ], ) @pytest.mark.asyncio @@ -235,6 +234,40 @@ async def test_auto_contains_json(output, expected): 0.0, 1.0, ), + ( + { + "correct_answer": '{"user": {"name": "John", "details": {"age": 30, "location": "New York"}}}' + }, + { + "data": 
'{"USER": {"NAME": "John", "DETAILS": {"AGE": 30, "LOCATION": "New York"}}}' + }, + { + "predict_keys": True, + "compare_schema_only": False, + "case_insensitive_keys": True, + "correct_answer_key": "correct_answer", + }, + 0.0, + 1.0, + ), + ( + { + "correct_answer": '{"user": {"name": "John", "details": {"age": 30, "location": "New York"}}}' + }, + { + "data": { + "output": '{"USER": {"NAME": "John", "DETAILS": {"AGE": 30, "LOCATION": "New York"}}}' + } + }, + { + "predict_keys": True, + "compare_schema_only": False, + "case_insensitive_keys": True, + "correct_answer_key": "correct_answer", + }, + 0.0, + 1.0, + ), ], ) @pytest.mark.asyncio From 80f3effb11db8cc992d6ba7171e44a2bbe3b896a Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 21 Aug 2024 21:10:58 +0100 Subject: [PATCH 037/149] minor refactor (backend): update 'validate_json_output' function return-type and docstring --- .../agenta_backend/services/evaluators_service.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index e3bd9ae6a7..2078601462 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -53,8 +53,8 @@ def validate_string_output( return output -def validate_json_output(evaluator_key: str, output: Union[str, Dict[str, Any]]) -> str: - """Checks and validate the output to be of type JSON string. +def validate_json_output(evaluator_key: str, output: Union[str, Dict[str, Any]]) -> Union[str, dict]: + """Checks and validate the output to be of type JSON string or dictionary. Args: evaluator_key (str): the key of the evaluator @@ -64,7 +64,7 @@ def validate_json_output(evaluator_key: str, output: Union[str, Dict[str, Any]]) Exception: requires output to be a JSON string Returns: - str: output + str, dict: output """ output = output.get("data", "") if isinstance(output, dict) else output From 892a351be6d1fb72b68d9020e275e6b017e1de13 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 21 Aug 2024 21:12:19 +0100 Subject: [PATCH 038/149] chore (style): format evaluators_service with black@23.12.0 --- agenta-backend/agenta_backend/services/evaluators_service.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 2078601462..0f70a32a51 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -53,7 +53,9 @@ def validate_string_output( return output -def validate_json_output(evaluator_key: str, output: Union[str, Dict[str, Any]]) -> Union[str, dict]: +def validate_json_output( + evaluator_key: str, output: Union[str, Dict[str, Any]] +) -> Union[str, dict]: """Checks and validate the output to be of type JSON string or dictionary. 
Args: From 33e6e170267061c1f9df73d6eca8974f1caa59bd Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 22 Aug 2024 01:04:21 +0100 Subject: [PATCH 039/149] refactor (backend): clean up LLM key checks in evaluators - Removed `requires_llm_api_keys` from evaluators that don't require LLM API keys - Ensured evaluators requiring LLM keys have `requires_llm_api_keys` set to `True` by default --- .../resources/evaluators/evaluators.py | 127 +----------------- .../agenta_backend/services/helpers.py | 8 +- .../tests/variants_main_router/conftest.py | 8 +- 3 files changed, 17 insertions(+), 126 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py index 99cd1006c2..1a8f6f5b77 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py @@ -1,12 +1,4 @@ rag_evaluator_settings_template = { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": True, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "question_key": { "label": "Question Key", "default": "", @@ -38,14 +30,6 @@ "key": "auto_exact_match", "direct_use": True, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -62,16 +46,7 @@ "name": "Contains JSON", "key": "auto_contains_json", "direct_use": True, - "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, - }, + "settings_template": {}, "description": "'Contains JSON' evaluator checks if the output contains the a valid JSON.", "oss": True, }, @@ -80,14 +55,6 @@ "key": "auto_similarity_match", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "similarity_threshold": { "label": "Similarity Threshold", "type": "number", @@ -113,16 +80,9 @@ "name": "Semantic Similarity Match", "key": "auto_semantic_similarity", "direct_use": False, + "requires_llm_api_keys": True, "description": "Semantic Similarity Match evaluator measures the similarity between two pieces of text by analyzing their meaning and context. 
It compares the semantic content, providing a score that reflects how closely the texts match in terms of meaning, rather than just exact word matches.", "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "advanced": True, - "default": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -140,14 +100,6 @@ "direct_use": False, "description": "Regex Test evaluator checks if the generated answer matches a regular expression pattern. You need to provide the regex expression and specify whether an answer is correct if it matches or does not match the regex.", "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "regex_pattern": { "label": "Regex Pattern", "type": "regex", @@ -169,14 +121,6 @@ "key": "field_match_test", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "json_field": { "label": "JSON Field", "type": "string", @@ -202,14 +146,6 @@ "direct_use": False, "description": "Compares the generated JSON output to a ground truth JSON and returns a normalized score between 0 and 1 based on their differences.", "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "compare_schema_only": { "label": "Compare Schema Only", "type": "boolean", @@ -246,15 +182,8 @@ "name": "LLM-as-a-judge", "key": "auto_ai_critique", "direct_use": False, + "requires_llm_api_keys": True, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": True, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "prompt_template": { "label": "Prompt Template", "type": "text", @@ -342,14 +271,6 @@ "key": "auto_starts_with", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "prefix": { "label": "prefix", "type": "string", @@ -371,14 +292,6 @@ "key": "auto_ends_with", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -400,14 +313,6 @@ "key": "auto_contains", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the 
evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -429,14 +334,6 @@ "key": "auto_contains_any", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -458,14 +355,6 @@ "key": "auto_contains_all", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -487,14 +376,6 @@ "key": "auto_levenshtein_distance", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "threshold": { "label": "Threshold", "type": "number", @@ -517,6 +398,7 @@ "name": "RAG Faithfulness", "key": "rag_faithfulness", "direct_use": False, + "requires_llm_api_keys": True, "settings_template": rag_evaluator_settings_template, "description": "RAG Faithfulness evaluator assesses the accuracy and reliability of responses generated by Retrieval-Augmented Generation (RAG) models. It evaluates how faithfully the responses adhere to the retrieved documents or sources, ensuring that the generated text accurately reflects the information from the original sources.", }, @@ -524,6 +406,7 @@ "name": "RAG Context Relevancy", "key": "rag_context_relevancy", "direct_use": False, + "requires_llm_api_keys": True, "settings_template": rag_evaluator_settings_template, "description": "RAG Context Relevancy evaluator measures how relevant the retrieved documents or contexts are to the given question or prompt. 
It ensures that the selected documents provide the necessary information for generating accurate and meaningful responses, improving the overall quality of the RAG model's output.", }, diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 1731dfe64f..18951ad6f7 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -123,8 +123,12 @@ async def ensure_required_llm_keys_exist( evaluators_requiring_llm_keys = [ evaluator["key"] for evaluator in get_all_evaluators() - if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) - is True + if evaluator.get("requires_llm_api_keys", False) + or ( + evaluator.get("settings_template", {}) + .get("requires_llm_api_keys", {}) + .get("default", False) + ) ] evaluators_found = ( await db_manager.check_if_evaluators_exist_in_list_of_evaluators_configs( diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 4fec2e75d6..5356ad0e9c 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -227,8 +227,12 @@ def evaluators_requiring_llm_keys(): evaluators_requiring_llm_keys = [ evaluator["key"] for evaluator in get_all_evaluators() - if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) - is True + if evaluator.get("requires_llm_api_keys", False) + or ( + evaluator.get("settings_template", {}) + .get("requires_llm_api_keys", {}) + .get("default", False) + ) ] return evaluators_requiring_llm_keys From 7c28f6d14878d009a2eb38cd5924dafee76d745e Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 22 Aug 2024 01:05:45 +0100 Subject: [PATCH 040/149] chore (tests): add '@pytest.mark.asyncio' to test cases in test_user_profile --- .../variants_main_router/test_variant_evaluators_router.py | 1 - .../tests/variants_user_profile_router/test_user_profile.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index dcbea2d10d..a2067da77e 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -176,7 +176,6 @@ async def fetch_evaluation_results(evaluation_id): f"{BACKEND_API_HOST}/evaluations/{evaluation_id}/results/", timeout=timeout ) response_data = response.json() - print("Response Data: ", response_data) assert response.status_code == 200 assert response_data["evaluation_id"] == evaluation_id diff --git a/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py index 1fd8a4aec6..d7fd237994 100644 --- a/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py @@ -48,6 +48,7 @@ async def test_fetch_user_profile_without_user_id(): assert response.json()["username"] == user_db_dict["username"] +@pytest.mark.asyncio async def test_fetch_user_profile_with_valid_user_id(): async with db_engine.get_session() as session: 
result = await session.execute(select(UserDB).filter_by(uid="0"))
@@ -75,6 +76,7 @@ async def test_fetch_user_profile_with_valid_user_id():
     assert response.json()["username"] == user_db_dict["username"]
 
 
+@pytest.mark.asyncio
 async def test_fetch_user_profile_with_non_existent_user_id_error():
     user_non_existent_id = str(uuid4())
     response = await test_client.get(

From 3cad5dbeba185be7375a8a419cbef138d6fa4138 Mon Sep 17 00:00:00 2001
From: Juan Pablo Vega
Date: Fri, 23 Aug 2024 13:11:47 +0200
Subject: [PATCH 041/149] Enforce Union[str, Dict[str, Any]] in BaseResponse in
 SDK

---
 .../agenta/sdk/decorators/llm_entrypoint.py | 21 ++++++++++++-------
 agenta-cli/pyproject.toml                   |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/agenta-cli/agenta/sdk/decorators/llm_entrypoint.py b/agenta-cli/agenta/sdk/decorators/llm_entrypoint.py
index 025b55a9b0..e7f6cf9dd5 100644
--- a/agenta-cli/agenta/sdk/decorators/llm_entrypoint.py
+++ b/agenta-cli/agenta/sdk/decorators/llm_entrypoint.py
@@ -216,9 +216,11 @@ async def wrapper(*args, **kwargs) -> Any:
             {
                 "func": func.__name__,
                 "endpoint": route,
-                "params": {**config_params, **func_signature.parameters}
-                if not config
-                else func_signature.parameters,
+                "params": (
+                    {**config_params, **func_signature.parameters}
+                    if not config
+                    else func_signature.parameters
+                ),
                 "config": config,
             }
         )
@@ -229,9 +231,11 @@ async def wrapper(*args, **kwargs) -> Any:
             {
                 "func": func.__name__,
                 "endpoint": route,
-                "params": {**config_params, **func_signature.parameters}
-                if not config
-                else func_signature.parameters,
+                "params": (
+                    {**config_params, **func_signature.parameters}
+                    if not config
+                    else func_signature.parameters
+                ),
                 "config": config,
             }
         )
@@ -402,7 +406,7 @@ async def execute_function(
 
     # PATCH : if result is not a dict, make it a dict
     if not isinstance(result, dict):
-        data = result
+        data = str(result)
     else:
         # PATCH : if result is a legacy dict, clean it up
         if (
@@ -410,7 +414,8 @@ async def execute_function(
             and "cost" in result.keys()
             and "usage" in result.keys()
         ):
-            data = result["message"]
+            data = str(result["message"])
+
     # END OF PATCH
 
     if data is None:
diff --git a/agenta-cli/pyproject.toml b/agenta-cli/pyproject.toml
index d015e2923b..89acb8a67c 100644
--- a/agenta-cli/pyproject.toml
+++ b/agenta-cli/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "agenta"
-version = "0.24.0"
+version = "0.24.1a0"
 description = "The SDK for agenta is an open-source LLMOps platform."
readme = "README.md" authors = ["Mahmoud Mabrouk "] From 91d23d8f9e87cb4ec7eac6f6dcfffdbdf8c0522b Mon Sep 17 00:00:00 2001 From: Juan Pablo Vega Date: Fri, 23 Aug 2024 14:14:37 +0200 Subject: [PATCH 042/149] fix ai critique --- .../services/evaluators_service.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 392fee8b85..2328184ae6 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -337,13 +337,20 @@ async def auto_ai_critique( try: correct_answer = get_correct_answer(data_point, settings_values) inputs = { - "prompt_user": app_params.get("prompt_user", ""), + "prompt_user": app_params.get("prompt_user", "").format(**data_point), "prediction": output, "ground_truth": correct_answer, } + settings = { + "prompt_template": settings_values.get("prompt_template", ""), + } response = await ai_critique( input=EvaluatorInputInterface( - **{"inputs": inputs, "credentials": lm_providers_keys} + **{ + "inputs": inputs, + "settings": settings, + "credentials": lm_providers_keys, + } ) ) return Result(type="text", value=response["outputs"]["score"]) @@ -374,12 +381,14 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac for key, value in input.inputs.items(): chain_run_args[key] = value - prompt_template = input.settings["prompt_template"] + prompt_system = input.settings.get("prompt_system", "") messages = [ - {"role": "system", "content": prompt_template}, + {"role": "system", "content": prompt_system}, {"role": "user", "content": str(chain_run_args)}, ] + print(input) + client = AsyncOpenAI(api_key=openai_api_key) response = await client.chat.completions.create( model="gpt-3.5-turbo", messages=messages, temperature=0.8 From cd2546a7284e7a742ef3db4bfc7450f7b0b8b4be Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Fri, 23 Aug 2024 13:31:14 +0100 Subject: [PATCH 043/149] initial commit: setup configure evaluator modal --- agenta-web/src/components/Sidebar/config.tsx | 48 +--- .../autoEvaluation/AutoEvaluation.tsx | 241 ++++++++++++++++++ .../evaluators/ConfigureEvaluatorModal.tsx | 143 +++++++++++ .../evaluators/EvaluatorCard.tsx | 148 +++++++++++ .../evaluators/EvaluatorList.tsx | 128 ++++++++++ .../pages/apps/[app_id]/evaluations/index.tsx | 74 ++++++ 6 files changed, 738 insertions(+), 44 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorCard.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorList.tsx create mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx diff --git a/agenta-web/src/components/Sidebar/config.tsx b/agenta-web/src/components/Sidebar/config.tsx index 07c4bc06b5..bdf9a25a6e 100644 --- a/agenta-web/src/components/Sidebar/config.tsx +++ b/agenta-web/src/components/Sidebar/config.tsx @@ -93,51 +93,11 @@ export const useSidebarConfig = () => { isHidden: !appId && !recentlyVisitedAppId, }, { - key: "app-auto-evaluations-link", - title: "Automatic Evaluation", - icon: , + key: "app-evaluations-link", + title: "Evaluations", + link: `/apps/${appId || 
recentlyVisitedAppId}/evaluations`, isHidden: !appId && !recentlyVisitedAppId, - submenu: [ - { - key: "app-evaluators-link", - title: "Evaluators", - tooltip: - "Select and customize evaluators such as custom code or regex evaluators.", - link: `/apps/${appId || recentlyVisitedAppId}/evaluations/new-evaluator`, - icon: , - }, - { - key: "app-evaluations-results-link", - title: "Results", - tooltip: "Choose your variants and evaluators to start the evaluation process.", - link: `/apps/${appId || recentlyVisitedAppId}/evaluations/results`, - icon: , - }, - ], - }, - { - key: "app-human-evaluations-link", - title: "Human Evaluation", - icon: , - isHidden: !appId && !recentlyVisitedAppId, - submenu: [ - { - key: "app-human-ab-testing-link", - title: "A/B Evaluation", - tooltip: - "A/B tests allow you to compare the performance of two different variants manually.", - link: `/apps/${appId || recentlyVisitedAppId}/annotations/human_a_b_testing`, - icon: , - }, - { - key: "app-single-model-test-link", - title: "Single Model Eval.", - tooltip: - "Single model test allows you to score the performance of a single LLM app manually.", - link: `/apps/${appId || recentlyVisitedAppId}/annotations/single_model_test`, - icon: , - }, - ], + icon: , }, { key: "app-observability-link", diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx new file mode 100644 index 0000000000..8480f2d75b --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -0,0 +1,241 @@ +import {_Evaluation, JSSTheme} from "@/lib/Types" +import { + ArrowsLeftRight, + Columns, + Database, + Gauge, + GearSix, + Note, + Plus, + Rocket, + Trash, +} from "@phosphor-icons/react" +import {Button, Dropdown, Space, Spin, Table} from "antd" +import React, {useState} from "react" +import {createUseStyles} from "react-jss" +import ConfigureEvaluatorModal from "./evaluators/ConfigureEvaluatorModal" +import {ColumnsType} from "antd/es/table" +import {MoreOutlined} from "@ant-design/icons" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + button: { + display: "flex", + alignItems: "center", + }, +})) + +const AutoEvaluation = () => { + const classes = useStyles() + const [selectedRowKeys, setSelectedRowKeys] = useState([]) + const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useState(false) + + const columns: ColumnsType<_Evaluation> = [ + { + title: "Variant", + dataIndex: "variants", + key: "variants", + fixed: "left", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: "Test set", + dataIndex: "testsetName", + key: "testsetName", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: "Status", + dataIndex: "status", + key: "status", + onHeaderCell: () => ({ + style: {minWidth: 240}, + }), + }, + { + title: "Results", + children: [ + { + title: "Evaluator 1", + dataIndex: "aggregated_results", + key: "results", + onHeaderCell: () => ({ + style: {minWidth: 240}, + }), + }, + { + title: "Evaluator 2", + dataIndex: "aggregated_results", + key: "results", + onHeaderCell: () => ({ + style: {minWidth: 240}, + }), + }, + { + title: "Evaluator 3", + dataIndex: "aggregated_results", + key: "results", + onHeaderCell: () => ({ + style: {minWidth: 240}, + }), + }, + ], + }, + { + title: "Created on", + dataIndex: "created_at", + key: "createdAt", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: "Avg. 
Latency", + dataIndex: "average_latency", + key: "average_latency", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: "Total Cost", + dataIndex: "average_cost", + key: "average_cost", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: , + key: "key", + width: 56, + fixed: "right", + align: "center", + render: (_, record) => { + return ( + , + onClick: (e) => { + e.domEvent.stopPropagation() + }, + }, + { + key: "variant", + label: "View variant", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + }, + }, + { + key: "view_testset", + label: "View test set", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + }, + }, + {type: "divider"}, + { + key: "delete_eval", + label: "Delete", + icon: , + danger: true, + onClick: (e) => { + e.domEvent.stopPropagation() + }, + }, + ], + }} + > + + + + + + + + + + + + { + setSelectedRowKeys(selectedRowKeys) + }, + }} + className="ph-no-capture" + columns={columns} + rowKey={"id"} + dataSource={[]} + scroll={{x: true}} + bordered + pagination={false} + onRow={(record) => ({ + style: {cursor: "pointer"}, + onClick: () => {}, + })} + /> + + + setIsConfigEvaluatorModalOpen(false)} + /> + + ) +} + +export default AutoEvaluation diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx new file mode 100644 index 0000000000..1926ab3459 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx @@ -0,0 +1,143 @@ +import {useAppId} from "@/hooks/useAppId" +import {JSSTheme} from "@/lib/Types" +import {CloseOutlined, PlusOutlined} from "@ant-design/icons" +import {Cards, Table} from "@phosphor-icons/react" +import {Button, Divider, Input, Modal, Radio, Space, Typography} from "antd" +import React, {useEffect, useState} from "react" +import {createUseStyles} from "react-jss" +import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" +import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" +import {useAtom} from "jotai" +import EvaluatorCard from "./EvaluatorCard" +import EvaluatorList from "./EvaluatorList" + +type ConfigureEvaluatorModalProps = {} & React.ComponentProps + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + titleContainer: { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + "& h1": { + fontSize: theme.fontSizeLG, + fontWeight: theme.fontWeightStrong, + lineHeight: theme.lineHeightLG, + }, + }, + bodyContainer: { + padding: `${theme.padding}px 0`, + "& > div:nth-of-type(1)": { + backgroundColor: theme.colorBgContainer, + position: "sticky", + top: 0, + }, + "& > div:nth-of-type(2)": { + height: 800, + overflowY: "auto", + }, + }, + radioBtnContainer: { + display: "flex", + alignItems: "center", + gap: theme.marginXS, + "& .ant-radio-button-wrapper": { + borderRadius: theme.borderRadius, + borderInlineStartWidth: "initial", + "&:before": { + width: 0, + }, + "&:not(.ant-radio-button-wrapper-checked)": { + border: "none", + "&:hover": { + backgroundColor: theme.colorBgTextHover, + }, + }, + }, + }, +})) + +const ConfigureEvaluatorModal = ({...props}: ConfigureEvaluatorModalProps) => { + const classes = useStyles() + const appId = useAppId() + const setEvaluators = useAtom(evaluatorsAtom)[1] + const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) + const 
[evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") + const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + + useEffect(() => { + Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( + ([evaluators, configs]) => { + setEvaluators(evaluators) + setEvaluatorConfigs(configs) + }, + ) + }, [appId]) + + return ( + + Configure evaluators + + + + props.onCancel?.({} as any)} /> + + + } + {...props} + > +
+
+
+ setSelectedEvaluatorCategory(e.target.value)} + > + View all + + {["RAG", "Classifiers", "Similarity", "AI / LLM", "Functional"].map( + (val, idx) => ( + + {val} + + ), + )} + + + + setEvaluatorsDisplay(e.target.value)} + > + +
+ + + + + + + + + + +
+ {evaluatorsDisplay === "list" ? ( + + ) : ( + + )} +
+ + + ) +} + +export default ConfigureEvaluatorModal diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorCard.tsx new file mode 100644 index 0000000000..3533683ac2 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorCard.tsx @@ -0,0 +1,148 @@ +import {EvaluatorConfig, JSSTheme} from "@/lib/Types" +import {MoreOutlined} from "@ant-design/icons" +import {Copy, Note, Trash} from "@phosphor-icons/react" +import {Button, Card, Dropdown, Tag, Typography} from "antd" +import React from "react" +import {createUseStyles} from "react-jss" + +interface EvaluatorCardProps { + evaluatorConfigs: EvaluatorConfig[] +} + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + container: { + display: "flex", + flexDirection: "column", + gap: theme.paddingLG, + }, + cardTitle: { + fontSize: theme.fontSizeLG, + lineHeight: theme.lineHeightLG, + fontWeight: theme.fontWeightMedium, + }, + evaluatorCard: { + width: 276, + display: "flex", + flexDirection: "column", + transition: "all 0.025s ease-in", + cursor: "pointer", + "& > .ant-card-head": { + minHeight: 0, + padding: theme.paddingSM, + + "& .ant-card-head-title": { + fontSize: theme.fontSize, + fontWeight: theme.fontWeightMedium, + lineHeight: theme.lineHeight, + }, + }, + "& > .ant-card-body": { + padding: theme.paddingSM, + display: "flex", + flexDirection: "column", + gap: theme.marginXS, + "& div": { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + }, + }, + "&:hover": {}, + }, +})) + +const EvaluatorCard = ({evaluatorConfigs}: EvaluatorCardProps) => { + const classes = useStyles() + + const formatEvluatorConfigs = Object.entries( + evaluatorConfigs.reduce( + (acc, curr) => { + if (!acc[curr.evaluator_key]) { + acc[curr.evaluator_key] = [] + } + acc[curr.evaluator_key].push(curr) + return acc + }, + {} as Record, + ), + ).map(([title, items]) => ({ + title, + items, + })) + + return ( +
+ {formatEvluatorConfigs.map(({title, items}) => ( +
+ {title} +
+ {items.map((item) => ( + , + onClick: (e: any) => { + e.domEvent.stopPropagation() + }, + }, + { + key: "clone", + label: "Clone", + icon: , + onClick: (e: any) => { + e.domEvent.stopPropagation() + }, + }, + {type: "divider"}, + { + key: "delete_app", + label: "Delete", + icon: , + danger: true, + onClick: (e: any) => { + e.domEvent.stopPropagation() + }, + }, + ], + }} + > +
+
+ ))} +
+ ) +} + +export default EvaluatorCard diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorList.tsx new file mode 100644 index 0000000000..810f1e3f06 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorList.tsx @@ -0,0 +1,128 @@ +import {EvaluatorConfig} from "@/lib/Types" +import {MoreOutlined} from "@ant-design/icons" +import {Copy, GearSix, Note, Trash} from "@phosphor-icons/react" +import {Button, Dropdown, Table} from "antd" +import {ColumnsType} from "antd/es/table" +import React, {useState} from "react" + +interface EvaluatorListProps { + evaluatorConfigs: EvaluatorConfig[] +} + +const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { + const [selectedRowKeys, setSelectedRowKeys] = useState([]) + + const columns: ColumnsType = [ + { + title: "Version", + dataIndex: "version", + key: "version", + onHeaderCell: () => ({ + style: {minWidth: 80}, + }), + }, + { + title: "Name", + dataIndex: "name", + key: "name", + onHeaderCell: () => ({ + style: {minWidth: 400}, + }), + }, + { + title: "Type", + dataIndex: "type", + key: "type", + onHeaderCell: () => ({ + style: {minWidth: 200}, + }), + }, + { + title: "Tags", + dataIndex: "tags", + key: "tags", + onHeaderCell: () => ({ + style: {minWidth: 400}, + }), + }, + { + title: , + key: "key", + width: 56, + fixed: "right", + align: "center", + render: (_, record) => { + return ( + , + onClick: (e: any) => { + e.domEvent.stopPropagation() + }, + }, + { + key: "clone", + label: "Clone", + icon: , + onClick: (e: any) => { + e.domEvent.stopPropagation() + }, + }, + {type: "divider"}, + { + key: "delete_app", + label: "Delete", + icon: , + danger: true, + onClick: (e: any) => { + e.domEvent.stopPropagation() + }, + }, + ], + }} + > +
{ + setSelectedRowKeys(selectedRowKeys) + }, + fixed: "left", + }} + className="ph-no-capture" + columns={columns} + rowKey={"id"} + dataSource={evaluatorConfigs} + scroll={{x: true}} + bordered + pagination={false} + onRow={(record) => ({ + style: {cursor: "pointer"}, + onClick: () => {}, + })} + /> + ) +} + +export default EvaluatorList diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx new file mode 100644 index 0000000000..7edef5283f --- /dev/null +++ b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx @@ -0,0 +1,74 @@ +import AutoEvaluation from "@/components/pages/evaluations/autoEvaluation/AutoEvaluation" +import {useQueryParam} from "@/hooks/useQuery" +import {JSSTheme} from "@/lib/Types" +import {ChartDonut, ListChecks, TestTube} from "@phosphor-icons/react" +import {Tabs, TabsProps, Typography} from "antd" +import React from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + container: { + display: "flex", + flexDirection: "column", + gap: theme.marginLG, + }, + title: { + fontSize: theme.fontSizeLG, + fontWeight: theme.fontWeightMedium, + lineHeight: theme.lineHeightHeading4, + }, + evaluationTabContainer: { + "& .ant-tabs-nav": { + marginBottom: theme.marginLG, + }, + "& .ant-tabs-tab-btn": { + display: "flex", + alignItems: "center", + "& .ant-tabs-tab-icon": { + display: "flex", + }, + }, + }, +})) + +const EvaluationsPage = () => { + const classes = useStyles() + const [selectedEvaluation, setSelectedEvaluation] = useQueryParam( + "selectedEvaluation", + "auto_evaluation", + ) + + const items: TabsProps["items"] = [ + { + key: "auto_evaluation", + label: "Automatic Evaluation", + icon: , + children: , + }, + { + key: "ab_testing_evaluation", + label: "A/B Testing Evaluation", + icon: , + }, + { + key: "single_model_evaluation", + label: "Single Model Evaluation", + icon: , + }, + ] + + return ( +
+ Evaluations + + +
+ ) +} + +export default EvaluationsPage From 31d10e1f46765ee950d7ab215b4685e49c0a829d Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 23 Aug 2024 13:31:52 +0100 Subject: [PATCH 044/149] minor refactor (backend): include ai_critique evaluator settings_values to EvaluatorInputInterface --- agenta-backend/agenta_backend/services/evaluators_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index cd5b556638..1311fc0667 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -343,10 +343,10 @@ async def auto_ai_critique( } response = await ai_critique( input=EvaluatorInputInterface( - **{"inputs": inputs, "credentials": lm_providers_keys} + **{"inputs": inputs, "settings": settings_values, "credentials": lm_providers_keys} ) ) - return Result(type="text", value=response["outputs"]["score"]) + return Result(type="text", value=str(response["outputs"]["score"])) except Exception as e: # pylint: disable=broad-except return Result( type="error", From b224f1052fedee34e8c8e4d6128d33f5d1105d1a Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 23 Aug 2024 13:36:28 +0100 Subject: [PATCH 045/149] chore (style): format evaluators_service with black@23.12.0 --- .../agenta_backend/services/evaluators_service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 1311fc0667..8cf8ec6948 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -343,7 +343,11 @@ async def auto_ai_critique( } response = await ai_critique( input=EvaluatorInputInterface( - **{"inputs": inputs, "settings": settings_values, "credentials": lm_providers_keys} + **{ + "inputs": inputs, + "settings": settings_values, + "credentials": lm_providers_keys, + } ) ) return Result(type="text", value=str(response["outputs"]["score"])) From ca81cea88ace482dabaa832f0714483cfc26bb17 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 23 Aug 2024 14:32:10 +0100 Subject: [PATCH 046/149] minor refactor (backend): resolve ValueError when casting string to float for ai critique evaluator --- agenta-backend/agenta_backend/services/evaluators_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 2328184ae6..bfb5861589 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -394,7 +394,7 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac model="gpt-3.5-turbo", messages=messages, temperature=0.8 ) evaluation_output = response.choices[0].message.content.strip() - return {"outputs": {"score": float(evaluation_output)}} + return {"outputs": {"score": evaluation_output}} async def auto_starts_with( From 2402f94b120e0408ced3993ebbb2afea4be91132 Mon Sep 17 00:00:00 2001 From: Juan Pablo Vega Date: Fri, 23 Aug 2024 16:03:54 +0200 Subject: [PATCH 047/149] fix exception message and bump SDK out of pre-release --- agenta-backend/agenta_backend/services/evaluators_service.py | 2 +- agenta-cli/pyproject.toml | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 0f70a32a51..ce40e091ca 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -77,7 +77,7 @@ def validate_json_output( json.loads(output) except json.JSONDecodeError: raise Exception( - f"Evaluator {evaluator_key} requires the output to be a JSON string." + f"Evaluator {evaluator_key} requires the output to be a JSON string or object." ) if not isinstance( diff --git a/agenta-cli/pyproject.toml b/agenta-cli/pyproject.toml index 89acb8a67c..6c0546dac0 100644 --- a/agenta-cli/pyproject.toml +++ b/agenta-cli/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "agenta" -version = "0.24.1a0" +version = "0.24.1" description = "The SDK for agenta is an open-source LLMOps platform." readme = "README.md" authors = ["Mahmoud Mabrouk "] From c160b72ae38ec5f4e2b6c946e2b7e3ab3ce35887 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 24 Aug 2024 01:33:33 +0100 Subject: [PATCH 048/149] improved file structure(frontend) --- .../ConfigureEvaluators}/EvaluatorCard.tsx | 0 .../ConfigureEvaluators}/EvaluatorList.tsx | 0 .../ConfigureEvaluators/index.tsx} | 89 ++++++++----------- 3 files changed, 37 insertions(+), 52 deletions(-) rename agenta-web/src/components/pages/evaluations/autoEvaluation/{evaluators => EvaluatorsModal/ConfigureEvaluators}/EvaluatorCard.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/{evaluators => EvaluatorsModal/ConfigureEvaluators}/EvaluatorList.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/{evaluators/ConfigureEvaluatorModal.tsx => EvaluatorsModal/ConfigureEvaluators/index.tsx} (62%) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorCard.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/EvaluatorList.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx similarity index 62% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx index 1926ab3459..d15ce09e7d 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/evaluators/ConfigureEvaluatorModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx @@ -1,17 +1,19 @@ -import {useAppId} from "@/hooks/useAppId" -import 
{JSSTheme} from "@/lib/Types" +import {EvaluatorConfig, JSSTheme} from "@/lib/Types" import {CloseOutlined, PlusOutlined} from "@ant-design/icons" import {Cards, Table} from "@phosphor-icons/react" -import {Button, Divider, Input, Modal, Radio, Space, Typography} from "antd" -import React, {useEffect, useState} from "react" +import {Button, Divider, Input, Radio, Space, Typography} from "antd" +import React, {useState} from "react" import {createUseStyles} from "react-jss" -import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" -import {useAtom} from "jotai" import EvaluatorCard from "./EvaluatorCard" import EvaluatorList from "./EvaluatorList" -type ConfigureEvaluatorModalProps = {} & React.ComponentProps +type ConfigureEvaluatorModalProps = { + evaluatorConfigs: EvaluatorConfig[] + handleOnCancel: () => void + selectedEvaluatorCategory: string + setSelectedEvaluatorCategory: React.Dispatch> + setCurrent: React.Dispatch> +} const useStyles = createUseStyles((theme: JSSTheme) => ({ titleContainer: { @@ -24,17 +26,10 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ lineHeight: theme.lineHeightLG, }, }, - bodyContainer: { - padding: `${theme.padding}px 0`, - "& > div:nth-of-type(1)": { - backgroundColor: theme.colorBgContainer, - position: "sticky", - top: 0, - }, - "& > div:nth-of-type(2)": { - height: 800, - overflowY: "auto", - }, + header: { + display: "flex", + flexDirection: "column", + gap: theme.padding, }, radioBtnContainer: { display: "flex", @@ -56,43 +51,33 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const ConfigureEvaluatorModal = ({...props}: ConfigureEvaluatorModalProps) => { +const ConfigureEvaluatorModal = ({ + evaluatorConfigs, + handleOnCancel, + selectedEvaluatorCategory, + setSelectedEvaluatorCategory, + setCurrent, +}: ConfigureEvaluatorModalProps) => { const classes = useStyles() - const appId = useAppId() - const setEvaluators = useAtom(evaluatorsAtom)[1] - const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") - const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") - - useEffect(() => { - Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( - ([evaluators, configs]) => { - setEvaluators(evaluators) - setEvaluatorConfigs(configs) - }, - ) - }, [appId]) return ( - +
Configure evaluators - - props.onCancel?.({} as any)} /> +
- } - {...props} - > -
{
+
-
- {evaluatorsDisplay === "list" ? ( - - ) : ( - - )} -
+
+ {evaluatorsDisplay === "list" ? ( + + ) : ( + + )}
- +
) } From d5eb285011559581a550255f5fdbaf7ec70e9191 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 24 Aug 2024 01:36:09 +0100 Subject: [PATCH 049/149] design(frontend): added evaluator modal component steps --- .../autoEvaluation/AutoEvaluation.tsx | 4 +- .../ConfigureNewEvaluator/index.tsx | 16 ++++ .../CreateNewEvaluator/index.tsx | 17 +++++ .../EvaluatorsModal/EvaluatorsModal.tsx | 75 +++++++++++++++++++ 4 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 8480f2d75b..11fcea7585 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -13,9 +13,9 @@ import { import {Button, Dropdown, Space, Spin, Table} from "antd" import React, {useState} from "react" import {createUseStyles} from "react-jss" -import ConfigureEvaluatorModal from "./evaluators/ConfigureEvaluatorModal" import {ColumnsType} from "antd/es/table" import {MoreOutlined} from "@ant-design/icons" +import EvaluatorsModal from "./EvaluatorsModal/EvaluatorsModal" const useStyles = createUseStyles((theme: JSSTheme) => ({ button: { @@ -230,7 +230,7 @@ const AutoEvaluation = () => { /> - setIsConfigEvaluatorModalOpen(false)} /> diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx new file mode 100644 index 0000000000..81166922e6 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx @@ -0,0 +1,16 @@ +import {Button} from "antd" +import React from "react" + +type ConfigureNewEvaluatorProps = { + setCurrent: React.Dispatch> +} + +const ConfigureNewEvaluator = ({setCurrent}: ConfigureNewEvaluatorProps) => { + return ( +
+ ConfigureNewEvaluator +
+ ) +} + +export default ConfigureNewEvaluator diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx new file mode 100644 index 0000000000..7371cd6621 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx @@ -0,0 +1,17 @@ +import {Button} from "antd" +import React from "react" + +type CreateNewEvaluatorProps = { + setCurrent: React.Dispatch> +} + +const CreateNewEvaluator = ({setCurrent}: CreateNewEvaluatorProps) => { + return ( +
+ CreateNewEvaluator + +
+ ) +} + +export default CreateNewEvaluator diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx new file mode 100644 index 0000000000..8c417c02df --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -0,0 +1,75 @@ +import {useAppId} from "@/hooks/useAppId" +import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" +import {JSSTheme} from "@/lib/Types" +import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" +import {Modal} from "antd" +import {useAtom} from "jotai" +import React, {useEffect, useState} from "react" +import {createUseStyles} from "react-jss" +import ConfigureEvaluators from "@/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators" +import CreateNewEvaluator from "./CreateNewEvaluator" +import ConfigureNewEvaluator from "./ConfigureNewEvaluator" + +type EvaluatorsModalProps = {} & React.ComponentProps + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + modalWrapper: { + "& .ant-modal-content": { + height: 800, + overflowY: "auto", + }, + }, +})) + +const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { + const classes = useStyles() + const appId = useAppId() + const [current, setCurrent] = useState(0) + const [evaluators, setEvaluators] = useAtom(evaluatorsAtom) + const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) + const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + + useEffect(() => { + Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( + ([evaluators, configs]) => { + setEvaluators(evaluators) + setEvaluatorConfigs(configs) + }, + ) + }, [appId]) + + const steps = [ + { + content: ( + props.onCancel?.({} as any)} + selectedEvaluatorCategory={selectedEvaluatorCategory} + setSelectedEvaluatorCategory={setSelectedEvaluatorCategory} + setCurrent={setCurrent} + /> + ), + }, + { + content: , + }, + { + content: , + }, + ] + + return ( + + {steps[current].content} + + ) +} + +export default EvaluatorsModal From e8ee4114309d7ca6aa508abc890c0eba8b2f99d5 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 24 Aug 2024 23:02:40 +0100 Subject: [PATCH 050/149] fix(frontend): passed prop --- .../ConfigureEvaluators/EvaluatorList.tsx | 2 +- .../ConfigureEvaluators/index.tsx | 9 +- .../ConfigureNewEvaluator/index.tsx | 1 + .../CreateNewEvaluator/index.tsx | 98 ++++++++++++++++++- .../EvaluatorsModal/EvaluatorsModal.tsx | 21 ++-- 5 files changed, 112 insertions(+), 19 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx index 810f1e3f06..4fd72e4848 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx @@ -114,7 +114,7 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { columns={columns} rowKey={"id"} dataSource={evaluatorConfigs} - scroll={{x: true}} + scroll={{x: true, y: 600}} bordered pagination={false} onRow={(record) => ({ diff --git 
a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx index d15ce09e7d..cc090631a7 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx @@ -10,8 +10,6 @@ import EvaluatorList from "./EvaluatorList" type ConfigureEvaluatorModalProps = { evaluatorConfigs: EvaluatorConfig[] handleOnCancel: () => void - selectedEvaluatorCategory: string - setSelectedEvaluatorCategory: React.Dispatch> setCurrent: React.Dispatch> } @@ -54,12 +52,11 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ const ConfigureEvaluatorModal = ({ evaluatorConfigs, handleOnCancel, - selectedEvaluatorCategory, - setSelectedEvaluatorCategory, setCurrent, }: ConfigureEvaluatorModalProps) => { const classes = useStyles() const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") + const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") return (
@@ -75,7 +72,7 @@ const ConfigureEvaluatorModal = ({ > Create new evaluator - +
@@ -114,7 +111,7 @@ const ConfigureEvaluatorModal = ({
-
+
{evaluatorsDisplay === "list" ? ( ) : ( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx index 81166922e6..d49a7dd6ee 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx @@ -3,6 +3,7 @@ import React from "react" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> + handleOnCancel: () => void } const ConfigureNewEvaluator = ({setCurrent}: ConfigureNewEvaluatorProps) => { diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx index 7371cd6621..e8b732b55c 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx @@ -1,15 +1,103 @@ -import {Button} from "antd" -import React from "react" +import {Evaluator, JSSTheme} from "@/lib/Types" +import {CloseOutlined} from "@ant-design/icons" +import {ArrowLeft, Cards, Table} from "@phosphor-icons/react" +import {Button, Divider, Input, Radio, Space, Typography} from "antd" +import React, {useState} from "react" +import {createUseStyles} from "react-jss" type CreateNewEvaluatorProps = { setCurrent: React.Dispatch> + handleOnCancel: () => void + evaluators: Evaluator[] } -const CreateNewEvaluator = ({setCurrent}: CreateNewEvaluatorProps) => { +const useStyles = createUseStyles((theme: JSSTheme) => ({ + title: { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + "& h1": { + fontSize: theme.fontSizeHeading5, + fontWeight: theme.fontWeightStrong, + lineHeight: theme.lineHeightLG, + }, + }, + radioBtnContainer: { + display: "flex", + alignItems: "center", + gap: theme.marginXS, + "& .ant-radio-button-wrapper": { + borderRadius: theme.borderRadius, + borderInlineStartWidth: "initial", + "&:before": { + width: 0, + }, + "&:not(.ant-radio-button-wrapper-checked)": { + border: "none", + "&:hover": { + backgroundColor: theme.colorBgTextHover, + }, + }, + }, + }, +})) + +const CreateNewEvaluator = ({evaluators, setCurrent, handleOnCancel}: CreateNewEvaluatorProps) => { + const classes = useStyles() + const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") + const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + return (
- CreateNewEvaluator - +
+
+ +
+
+
+ setSelectedEvaluatorCategory(e.target.value)} + > + View all + + {["RAG", "Classifiers", "Similarity", "AI / LLM", "Functional"].map( + (val, idx) => ( + + {val} + + ), + )} + + + + setEvaluatorsDisplay(e.target.value)} + > + +
+ + + + + + + + + + +
body
) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 8c417c02df..1125033329 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -6,7 +6,7 @@ import {Modal} from "antd" import {useAtom} from "jotai" import React, {useEffect, useState} from "react" import {createUseStyles} from "react-jss" -import ConfigureEvaluators from "@/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators" +import ConfigureEvaluators from "./ConfigureEvaluators" import CreateNewEvaluator from "./CreateNewEvaluator" import ConfigureNewEvaluator from "./ConfigureNewEvaluator" @@ -16,7 +16,6 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ modalWrapper: { "& .ant-modal-content": { height: 800, - overflowY: "auto", }, }, })) @@ -27,7 +26,6 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [current, setCurrent] = useState(0) const [evaluators, setEvaluators] = useAtom(evaluatorsAtom) const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) - const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") useEffect(() => { Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( @@ -44,17 +42,26 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { props.onCancel?.({} as any)} - selectedEvaluatorCategory={selectedEvaluatorCategory} - setSelectedEvaluatorCategory={setSelectedEvaluatorCategory} setCurrent={setCurrent} /> ), }, { - content: , + content: ( + props.onCancel?.({} as any)} + /> + ), }, { - content: , + content: ( + props.onCancel?.({} as any)} + /> + ), }, ] From 4a32a95519ea19f198f5cb90b82626891f155c69 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sun, 25 Aug 2024 21:41:28 +0100 Subject: [PATCH 051/149] fix(frontend): added create new evaluator section --- .../ConfigureEvaluators/EvaluatorList.tsx | 2 +- .../ConfigureEvaluators/index.tsx | 2 +- .../CreateEvaluatorCard.tsx | 73 +++++++++++++++++ .../CreateEvaluatorList.tsx | 66 +++++++++++++++ .../CreateNewEvaluator/index.tsx | 81 +++++++++++++------ 5 files changed, 198 insertions(+), 26 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx index 4fd72e4848..cda67047dd 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx @@ -114,7 +114,7 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { columns={columns} rowKey={"id"} dataSource={evaluatorConfigs} - scroll={{x: true, y: 600}} + scroll={{x: true, y: 550}} bordered pagination={false} onRow={(record) => ({ diff --git 
a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx index cc090631a7..3400435af5 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx @@ -111,7 +111,7 @@ const ConfigureEvaluatorModal = ({ -
+
{evaluatorsDisplay === "list" ? ( ) : ( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx new file mode 100644 index 0000000000..8a26320f3b --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx @@ -0,0 +1,73 @@ +import {Evaluator, JSSTheme} from "@/lib/Types" +import {Card, Typography} from "antd" +import React from "react" +import {createUseStyles} from "react-jss" + +interface CreateEvaluatorCardProps { + evaluators: Evaluator[] +} + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + container: { + display: "flex", + flexDirection: "column", + gap: theme.paddingLG, + overflowY: "auto", + }, + cardTitle: { + fontSize: theme.fontSizeLG, + lineHeight: theme.lineHeightLG, + fontWeight: theme.fontWeightMedium, + }, + evaluatorCard: { + width: 276, + display: "flex", + flexDirection: "column", + transition: "all 0.025s ease-in", + cursor: "pointer", + "& > .ant-card-head": { + minHeight: 0, + padding: theme.paddingSM, + + "& .ant-card-head-title": { + fontSize: theme.fontSize, + fontWeight: theme.fontWeightMedium, + lineHeight: theme.lineHeight, + }, + }, + "& > .ant-card-body": { + height: 122, + overflowY: "auto", + padding: theme.paddingSM, + "& .ant-typography": { + color: theme.colorTextSecondary, + }, + }, + "&:hover": {}, + }, +})) + +const CreateEvaluatorCard = ({evaluators}: CreateEvaluatorCardProps) => { + const classes = useStyles() + + return ( +
+
+ Evaluator Title +
+ {evaluators.map((evaluator) => ( + + {evaluator.description} + + ))} +
+
+
+ ) +} + +export default CreateEvaluatorCard diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx new file mode 100644 index 0000000000..e3a344dcbe --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx @@ -0,0 +1,66 @@ +import {Evaluator, JSSTheme} from "@/lib/Types" +import {Space, Table, Tag, Typography} from "antd" +import {ColumnsType} from "antd/es/table" +import React from "react" +import {createUseStyles} from "react-jss" + +interface CreateEvaluatorListProps { + evaluators: Evaluator[] +} + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + textDescription: { + display: "flex", + flexDirection: "column", + "& .ant-typography:nth-of-type(1)": { + fontSize: theme.fontSize, + lineHeight: theme.lineHeight, + }, + }, +})) + +const CreateEvaluatorList = ({evaluators}: CreateEvaluatorListProps) => { + const classes = useStyles() + + const columns: ColumnsType = [ + { + title: "Category", + dataIndex: "key", + key: "key", + width: 160, + render: (_, record) => { + return ( +
+ {record.key} +
+ ) + }, + }, + { + title: "Type", + dataIndex: "description", + key: "description", + width: "100%", + render: (_, record) => { + return ( +
+ {record.name} + {record.description} +
+ ) + }, + }, + ] + return ( +
+ ) +} + +export default CreateEvaluatorList diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx index e8b732b55c..17ea1fb227 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx @@ -4,6 +4,8 @@ import {ArrowLeft, Cards, Table} from "@phosphor-icons/react" import {Button, Divider, Input, Radio, Space, Typography} from "antd" import React, {useState} from "react" import {createUseStyles} from "react-jss" +import CreateEvaluatorList from "./CreateEvaluatorList" +import CreateEvaluatorCard from "./CreateEvaluatorCard" type CreateNewEvaluatorProps = { setCurrent: React.Dispatch> @@ -17,11 +19,16 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ alignItems: "center", justifyContent: "space-between", "& h1": { - fontSize: theme.fontSizeHeading5, + fontSize: theme.fontSizeHeading4, fontWeight: theme.fontWeightStrong, lineHeight: theme.lineHeightLG, }, }, + subTitle: { + fontSize: theme.fontSizeLG, + lineHeight: theme.lineHeightLG, + fontWeight: theme.fontWeightMedium, + }, radioBtnContainer: { display: "flex", alignItems: "center", @@ -52,33 +59,52 @@ const CreateNewEvaluator = ({evaluators, setCurrent, handleOnCancel}: CreateNewE
-
- setSelectedEvaluatorCategory(e.target.value)} - > - View all - - {["RAG", "Classifiers", "Similarity", "AI / LLM", "Functional"].map( - (val, idx) => ( - - {val} - - ), - )} - + {evaluatorsDisplay === "list" ? ( + +
- + {evaluatorsDisplay !== "list" ? :
} +
+ +
+ {evaluatorsDisplay === "list" ? ( + + ) : ( + + )}
-
body
) } From cc33a662a4283c4f4738809645ddc1885dd6ff21 Mon Sep 17 00:00:00 2001 From: jp-agenta Date: Mon, 26 Aug 2024 15:13:28 +0200 Subject: [PATCH 052/149] Update evaluators_service.py --- agenta-backend/agenta_backend/services/evaluators_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 985e9e321b..d316db702d 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -442,9 +442,9 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac for key, value in input.inputs.items(): chain_run_args[key] = value - prompt_system = input.settings.get("prompt_system", "") + prompt_template = input.settings.get("prompt_template", "") messages = [ - {"role": "system", "content": prompt_system}, + {"role": "system", "content": prompt_template}, {"role": "user", "content": str(chain_run_args)}, ] From bfe4cdb21665dd38a75a0b4d470da8d0ccb21602 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 26 Aug 2024 14:36:36 +0100 Subject: [PATCH 053/149] fix(frontend): modified configure evaluator state to use query param, enable search feature and improved title style --- .../autoEvaluation/AutoEvaluation.tsx | 18 +++++--- .../ConfigureEvaluators/index.tsx | 29 ++++++++++--- .../CreateEvaluatorCard.tsx | 12 +++++- .../CreateEvaluatorList.tsx | 16 ++++++- .../CreateNewEvaluator/index.tsx | 43 +++++++++++++++---- .../EvaluatorsModal/EvaluatorsModal.tsx | 17 +++++--- 6 files changed, 106 insertions(+), 29 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 11fcea7585..92bf88f11e 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -16,6 +16,7 @@ import {createUseStyles} from "react-jss" import {ColumnsType} from "antd/es/table" import {MoreOutlined} from "@ant-design/icons" import EvaluatorsModal from "./EvaluatorsModal/EvaluatorsModal" +import {useQueryParam} from "@/hooks/useQuery" const useStyles = createUseStyles((theme: JSSTheme) => ({ button: { @@ -27,7 +28,10 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ const AutoEvaluation = () => { const classes = useStyles() const [selectedRowKeys, setSelectedRowKeys] = useState([]) - const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useState(false) + const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useQueryParam( + "configureEvaluatorModal", + "", + ) const columns: ColumnsType<_Evaluation> = [ { @@ -180,7 +184,7 @@ const AutoEvaluation = () => { @@ -230,10 +234,12 @@ const AutoEvaluation = () => { /> - setIsConfigEvaluatorModalOpen(false)} - /> + {isConfigEvaluatorModalOpen === "open" && ( + setIsConfigEvaluatorModalOpen("")} + /> + )} ) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx index 3400435af5..e35ad9ae6a 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx +++ 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx @@ -1,8 +1,8 @@ -import {EvaluatorConfig, JSSTheme} from "@/lib/Types" +import {Evaluator, EvaluatorConfig, JSSTheme} from "@/lib/Types" import {CloseOutlined, PlusOutlined} from "@ant-design/icons" import {Cards, Table} from "@phosphor-icons/react" import {Button, Divider, Input, Radio, Space, Typography} from "antd" -import React, {useState} from "react" +import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import EvaluatorCard from "./EvaluatorCard" import EvaluatorList from "./EvaluatorList" @@ -11,6 +11,7 @@ type ConfigureEvaluatorModalProps = { evaluatorConfigs: EvaluatorConfig[] handleOnCancel: () => void setCurrent: React.Dispatch> + setSelectedEvaluator: React.Dispatch> } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -18,7 +19,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ display: "flex", alignItems: "center", justifyContent: "space-between", - "& h1": { + "& .ant-typography": { fontSize: theme.fontSizeLG, fontWeight: theme.fontWeightStrong, lineHeight: theme.lineHeightLG, @@ -53,16 +54,25 @@ const ConfigureEvaluatorModal = ({ evaluatorConfigs, handleOnCancel, setCurrent, + setSelectedEvaluator, }: ConfigureEvaluatorModalProps) => { const classes = useStyles() + const [searchTerm, setSearchTerm] = useState("") const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + const filteredEvalConfigs = useMemo(() => { + if (!searchTerm) return evaluatorConfigs + return evaluatorConfigs.filter((item) => + item.name.toLowerCase().includes(searchTerm.toLowerCase()), + ) + }, [searchTerm, evaluatorConfigs]) + return (
- Configure evaluators + Configure evaluators
diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx index 8a26320f3b..2e544449ef 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx @@ -5,6 +5,8 @@ import {createUseStyles} from "react-jss" interface CreateEvaluatorCardProps { evaluators: Evaluator[] + setSelectedEvaluator: React.Dispatch> + setCurrent: (value: React.SetStateAction) => void } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -47,7 +49,11 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const CreateEvaluatorCard = ({evaluators}: CreateEvaluatorCardProps) => { +const CreateEvaluatorCard = ({ + evaluators, + setSelectedEvaluator, + setCurrent, +}: CreateEvaluatorCardProps) => { const classes = useStyles() return ( @@ -60,6 +66,10 @@ const CreateEvaluatorCard = ({evaluators}: CreateEvaluatorCardProps) => { key={evaluator.key} className={classes.evaluatorCard} title={evaluator.name} + onClick={() => { + setSelectedEvaluator(evaluator) + setCurrent(2) + }} > {evaluator.description} diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx index e3a344dcbe..e8e853d34d 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx @@ -1,11 +1,13 @@ import {Evaluator, JSSTheme} from "@/lib/Types" -import {Space, Table, Tag, Typography} from "antd" +import {Table, Tag, Typography} from "antd" import {ColumnsType} from "antd/es/table" import React from "react" import {createUseStyles} from "react-jss" interface CreateEvaluatorListProps { evaluators: Evaluator[] + setSelectedEvaluator: React.Dispatch> + setCurrent: (value: React.SetStateAction) => void } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -19,7 +21,11 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const CreateEvaluatorList = ({evaluators}: CreateEvaluatorListProps) => { +const CreateEvaluatorList = ({ + evaluators, + setSelectedEvaluator, + setCurrent, +}: CreateEvaluatorListProps) => { const classes = useStyles() const columns: ColumnsType = [ @@ -59,6 +65,12 @@ const CreateEvaluatorList = ({evaluators}: CreateEvaluatorListProps) => { rowKey={"key"} className="ph-no-capture" scroll={{x: true, y: 550}} + onRow={(record) => ({ + onClick: () => { + setSelectedEvaluator(record) + setCurrent(2) + }, + })} /> ) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx index 17ea1fb227..2b1f605b43 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx @@ -2,7 +2,7 @@ import {Evaluator, JSSTheme} from 
"@/lib/Types" import {CloseOutlined} from "@ant-design/icons" import {ArrowLeft, Cards, Table} from "@phosphor-icons/react" import {Button, Divider, Input, Radio, Space, Typography} from "antd" -import React, {useState} from "react" +import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import CreateEvaluatorList from "./CreateEvaluatorList" import CreateEvaluatorCard from "./CreateEvaluatorCard" @@ -11,6 +11,7 @@ type CreateNewEvaluatorProps = { setCurrent: React.Dispatch> handleOnCancel: () => void evaluators: Evaluator[] + setSelectedEvaluator: React.Dispatch> } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -18,7 +19,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ display: "flex", alignItems: "center", justifyContent: "space-between", - "& h1": { + "& .ant-typography": { fontSize: theme.fontSizeHeading4, fontWeight: theme.fontWeightStrong, lineHeight: theme.lineHeightLG, @@ -49,18 +50,31 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const CreateNewEvaluator = ({evaluators, setCurrent, handleOnCancel}: CreateNewEvaluatorProps) => { +const CreateNewEvaluator = ({ + evaluators, + setCurrent, + handleOnCancel, + setSelectedEvaluator, +}: CreateNewEvaluatorProps) => { const classes = useStyles() + const [searchTerm, setSearchTerm] = useState("") const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + const filteredEvaluators = useMemo(() => { + if (!searchTerm) return evaluators + return evaluators.filter((item) => + item.name.toLowerCase().includes(searchTerm.toLowerCase()), + ) + }, [searchTerm, evaluators]) + return (
{evaluatorsDisplay === "list" ? ( - Configure evaluators + Configure evaluators ) : ( <>
diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 1125033329..aecc9f3a18 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -1,6 +1,6 @@ import {useAppId} from "@/hooks/useAppId" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {JSSTheme} from "@/lib/Types" +import {Evaluator, JSSTheme} from "@/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" import {Modal} from "antd" import {useAtom} from "jotai" @@ -26,6 +26,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [current, setCurrent] = useState(0) const [evaluators, setEvaluators] = useAtom(evaluatorsAtom) const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) + const [selectedEvaluator, setSelectedEvaluator] = useState(null) useEffect(() => { Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( @@ -43,6 +44,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { evaluatorConfigs={evaluatorConfigs} handleOnCancel={() => props.onCancel?.({} as any)} setCurrent={setCurrent} + setSelectedEvaluator={setSelectedEvaluator} /> ), }, @@ -52,18 +54,23 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { evaluators={evaluators} setCurrent={setCurrent} handleOnCancel={() => props.onCancel?.({} as any)} + setSelectedEvaluator={setSelectedEvaluator} /> ), }, - { + ] + + if (selectedEvaluator) { + steps.push({ content: ( props.onCancel?.({} as any)} /> ), - }, - ] + }) + } return ( { className={classes.modalWrapper} {...props} > - {steps[current].content} + {steps[current]?.content} ) } From 8ba96f226886e491bb20a74cbe9e5eda741a0715 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 27 Aug 2024 01:04:55 +0100 Subject: [PATCH 054/149] fix(frontend): improved config evaluator modal --- .../AdvancedSettings.tsx | 94 +++++++++++++ .../DynamicFormField.tsx | 132 ++++++++++++++++++ .../EvaluatorsModal/EvaluatorsModal.tsx | 3 + 3 files changed, 229 insertions(+) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/AdvancedSettings.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/AdvancedSettings.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/AdvancedSettings.tsx new file mode 100644 index 0000000000..28c8f52451 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/AdvancedSettings.tsx @@ -0,0 +1,94 @@ +import React from "react" +import {Form, Input, InputNumber, Switch, Tooltip, Collapse, theme} from "antd" +import {CaretRightOutlined, InfoCircleOutlined} from "@ant-design/icons" +import {createUseStyles} from "react-jss" +import {Editor} from "@monaco-editor/react" +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" + +const useStyles = createUseStyles((theme: any) => ({ + label: { + display: "flex", + alignItems: "center", + gap: "0.5rem", + }, + editor: { + 
border: `1px solid ${theme.colorBorder}`, + borderRadius: theme.borderRadius, + overflow: "hidden", + }, +})) + +type AdvancedSettingsProps = { + settings: Record[] +} + +const AdvancedSettings: React.FC = ({settings}) => { + const classes = useStyles() + const {appTheme} = useAppTheme() + const {token} = theme.useToken() + + return ( + } + > + + {settings.map((field) => { + const rules = [ + {required: field.required ?? true, message: "This field is required"}, + ] + + return ( + + {field.label} + {field.description && ( + + + + )} +
+ } + initialValue={field.default} + rules={rules} + > + {field.type === "string" || field.type === "regex" ? ( + + ) : field.type === "number" ? ( + + ) : field.type === "boolean" || field.type === "bool" ? ( + + ) : field.type === "text" ? ( + + ) : field.type === "code" ? ( + + ) : field.type === "object" ? ( + + ) : null} + + ) + })} + + + ) +} + +export default AdvancedSettings diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx new file mode 100644 index 0000000000..3b7f2f6edc --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx @@ -0,0 +1,132 @@ +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" +import {isValidRegex} from "@/lib/helpers/validators" +import {EvaluationSettingsTemplate, JSSTheme} from "@/lib/Types" +import {InfoCircleOutlined} from "@ant-design/icons" +import {Editor} from "@monaco-editor/react" +import {theme, Form, Tooltip, InputNumber, Switch, Input} from "antd" +import {Rule} from "antd/es/form" +import Link from "next/link" +import {createUseStyles} from "react-jss" + +type DynamicFormFieldProps = EvaluationSettingsTemplate & { + name: string | string[] +} + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + editor: { + border: `1px solid ${theme.colorBorder}`, + borderRadius: theme.borderRadius, + overflow: "hidden", + "& .monaco-editor": { + width: "0 !important", + }, + }, + ExternalHelp: { + marginBottom: "20px", + display: "flex", + alignItems: "center", + gap: "0.3em", + }, + ExternalHelpLink: { + margin: "0px", + padding: "0px", + textDecoration: "underline", + color: theme.isDark ? "rgba(255, 255, 255, 0.85)" : "#000", + + "&:hover": { + color: theme.isDark ? "rgba(255, 255, 255, 0.85)" : "#000", + textDecoration: "underline", + }, + }, +})) + +export const DynamicFormField: React.FC = ({ + name, + label, + type, + default: defaultVal, + description, + min, + max, + required, +}) => { + const {appTheme} = useAppTheme() + const classes = useStyles() + const {token} = theme.useToken() + + const rules: Rule[] = [{required: required ?? true, message: "This field is required"}] + if (type === "regex") + rules.push({ + validator: (_, value) => + new Promise((res, rej) => + isValidRegex(value) ? res("") : rej("Regex pattern is not valid"), + ), + }) + + const ExternalHelpInfo = + name[1] === "webhook_url" ? ( +
+ Learn + + more + + about the evaluator +
+ ) : null + + return ( + <> + {label !== "Correct Answer" && ( + + {label} + {description && ( + + + + )} +
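Editorial note: DynamicFormField dispatches on the settings template's type field to pick an input widget. A stripped-down sketch of that dispatch follows, using the antd controls this file imports; the exact widget chosen per type is an illustrative assumption, and the code/object branches (handled with a Monaco editor in the component) are omitted.

```tsx
import React from "react"
import {Input, InputNumber, Switch} from "antd"

type SettingType = "string" | "regex" | "number" | "boolean" | "bool" | "text"

interface SettingField {
    type: SettingType
    min?: number
    max?: number
}

// Picks an antd control for a settings-template entry.
export function renderSettingInput(field: SettingField): React.ReactNode {
    switch (field.type) {
        case "string":
        case "regex":
            return <Input />
        case "number":
            return <InputNumber min={field.min} max={field.max} step={0.1} />
        case "boolean":
        case "bool":
            return <Switch />
        case "text":
            return <Input.TextArea rows={6} />
        default:
            return null
    }
}
```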
+ } + initialValue={defaultVal} + rules={rules} + > + {type === "string" || type === "regex" ? ( + + ) : type === "number" ? ( + + ) : type === "boolean" || type === "bool" ? ( + + ) : type === "text" ? ( + + ) : type === "code" ? ( + + ) : type === "object" ? ( + + ) : null} + + )} + + {ExternalHelpInfo} + + ) +} diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index aecc9f3a18..098344302c 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -16,6 +16,9 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ modalWrapper: { "& .ant-modal-content": { height: 800, + "& .ant-modal-body": { + height: "100%", + }, }, }, })) From b4ca6fc111a6363315125072e9d61af0ca00f33e Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 27 Aug 2024 03:49:34 +0100 Subject: [PATCH 055/149] fix(frontend): added 600px height to fixed overflow in config evaluator card views --- .../EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx | 2 ++ .../EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx | 1 + 2 files changed, 3 insertions(+) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx index 3533683ac2..3a553a570a 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx @@ -14,6 +14,8 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ display: "flex", flexDirection: "column", gap: theme.paddingLG, + height: 600, + overflowY: "auto", }, cardTitle: { fontSize: theme.fontSizeLG, diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx index 2e544449ef..7f5ab0895a 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx @@ -15,6 +15,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ flexDirection: "column", gap: theme.paddingLG, overflowY: "auto", + height: 600, }, cardTitle: { fontSize: theme.fontSizeLG, From ce455b19d6a79ca0678b3e8750f56edd92db3e41 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 27 Aug 2024 03:50:30 +0100 Subject: [PATCH 056/149] fix(frontend): added fetch variants state --- .../EvaluatorsModal/EvaluatorsModal.tsx | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 098344302c..fec6131b4e 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -1,6 +1,6 @@ import {useAppId} from "@/hooks/useAppId" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {Evaluator, JSSTheme} from "@/lib/Types" +import {Evaluator, JSSTheme, Variant} from "@/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" import {Modal} from "antd" import {useAtom} from "jotai" @@ -9,6 +9,7 @@ import {createUseStyles} from "react-jss" import ConfigureEvaluators from "./ConfigureEvaluators" import CreateNewEvaluator from "./CreateNewEvaluator" import ConfigureNewEvaluator from "./ConfigureNewEvaluator" +import {fetchVariants} from "@/services/api" type EvaluatorsModalProps = {} & React.ComponentProps @@ -30,14 +31,18 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [evaluators, setEvaluators] = useAtom(evaluatorsAtom) const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) const [selectedEvaluator, setSelectedEvaluator] = useState(null) + const [variants, setVariants] = useState(null) useEffect(() => { - Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( - ([evaluators, configs]) => { - setEvaluators(evaluators) - setEvaluatorConfigs(configs) - }, - ) + Promise.all([ + fetchAllEvaluators(), + fetchAllEvaluatorConfigs(appId), + fetchVariants(appId), + ]).then(([evaluators, configs, variants]) => { + setEvaluators(evaluators) + setEvaluatorConfigs(configs) + setVariants(variants) + }) }, [appId]) const steps = [ @@ -70,6 +75,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { selectedEvaluator={selectedEvaluator} setCurrent={setCurrent} handleOnCancel={() => props.onCancel?.({} as any)} + variants={variants} /> ), }) From 59d64f6f9f4df71e6ba5e6755a24303758092fbc Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 27 Aug 2024 03:52:02 +0100 Subject: [PATCH 057/149] design(frontend): added select variant modal in evaluator config modal --- .../EvaluatorVariantModal.tsx | 130 ++++++++++ .../ConfigureNewEvaluator/index.tsx | 239 +++++++++++++++++- 2 files changed, 364 insertions(+), 5 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx new file mode 100644 index 0000000000..6a29dd3d30 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx @@ -0,0 +1,130 @@ +import {formatVariantIdWithHash} from "@/lib/helpers/utils" +import {JSSTheme, Variant} from "@/lib/Types" +import {CloseOutlined} from "@ant-design/icons" +import {Badge, Button, Divider, Input, Modal, Table, Tag, theme, Typography} from "antd" +import React, {useMemo, useState} from "react" +import {createUseStyles} from "react-jss" + +type EvaluatorVariantModalProps = { + variants: Variant[] | null +} & React.ComponentProps + +const {useToken} = theme + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + title: { + fontSize: theme.fontSizeHeading4, + lineHeight: theme.lineHeightLG, + fontWeight: theme.fontWeightStrong, + }, + container: { + "& .ant-modal-content": { + paddingLeft: 0, + 
paddingRight: 0, + }, + "& .ant-modal-body": { + paddingLeft: 24, + paddingRight: 24, + height: 300, + overflowY: "auto", + }, + }, + table: { + "& .ant-table-thead > tr > th": { + height: 32, + padding: "0 16px", + }, + "& .ant-table-tbody > tr > td": { + height: 48, + padding: "0 16px", + }, + }, +})) + +const EvaluatorVariantModal = ({variants, ...props}: EvaluatorVariantModalProps) => { + const classes = useStyles() + const {token} = useToken() + const [searchTerm, setSearchTerm] = useState("") + const [selectedVariant, setSelectedVariant] = useState() + + const filtered = useMemo(() => { + if (!searchTerm) return variants + if (variants) { + return variants.filter((item) => + item.variantName.toLowerCase().includes(searchTerm.toLowerCase()), + ) + } + }, [searchTerm, variants]) + + return ( + +
+
+ + Select variant + +
+ + setSearchTerm(e.target.value)} + placeholder="Search" + allowClear + /> +
+ + + } + centered + footer={null} + {...props} + > +
{ + return ( +
+
{record.variantName}
+ + + + +
+ ) + }, + }, + ]} + onRow={(record) => ({ + onClick: () => { + setSelectedVariant(record) + }, + style: {cursor: "pointer"}, + })} + className={classes.table} + scroll={{y: 300}} + style={{height: 330}} + /> + + ) +} + +export default EvaluatorVariantModal diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx index d49a7dd6ee..ccd4453367 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx @@ -1,15 +1,244 @@ -import {Button} from "antd" -import React from "react" +import {Evaluator, JSSTheme, Variant} from "@/lib/Types" +import {CloseOutlined} from "@ant-design/icons" +import { + ArrowLeft, + CaretDoubleLeft, + CaretDoubleRight, + ClockClockwise, + Database, + Lightning, + Play, +} from "@phosphor-icons/react" +import {Button, Divider, Flex, Form, Input, Space, Tag, Typography} from "antd" +import React, {useMemo, useState} from "react" +import {createUseStyles} from "react-jss" +import AdvancedSettings from "./AdvancedSettings" +import {DynamicFormField} from "./DynamicFormField" +import EvaluatorVariantModal from "./EvaluatorVariantModal" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> handleOnCancel: () => void + selectedEvaluator: Evaluator + variants: Variant[] | null } -const ConfigureNewEvaluator = ({setCurrent}: ConfigureNewEvaluatorProps) => { +const useStyles = createUseStyles((theme: JSSTheme) => ({ + headerText: { + "& .ant-typography": { + lineHeight: theme.lineHeightLG, + fontSize: theme.fontSizeHeading4, + fontWeight: theme.fontWeightStrong, + }, + }, + title: { + fontSize: theme.fontSizeLG, + fontWeight: theme.fontWeightMedium, + lineHeight: theme.lineHeightLG, + }, + formContainer: { + display: "flex", + flexDirection: "column", + gap: theme.padding, + overflowY: "auto", + maxHeight: 580, + "& .ant-form-item": { + marginBottom: 0, + }, + }, +})) + +const ConfigureNewEvaluator = ({ + setCurrent, + selectedEvaluator, + handleOnCancel, + variants, +}: ConfigureNewEvaluatorProps) => { + const classes = useStyles() + const [form] = Form.useForm() + const [debugEvaluator, setDebugEvaluator] = useState(false) + const [openVariantModal, setOpenVariantModal] = useState(false) + + const evalFields = useMemo( + () => + Object.keys(selectedEvaluator?.settings_template || {}) + .filter((key) => !!selectedEvaluator?.settings_template[key]?.type) + .map((key) => ({ + key, + ...selectedEvaluator?.settings_template[key]!, + advanced: selectedEvaluator?.settings_template[key]?.advanced || false, + })), + [selectedEvaluator], + ) + + const advancedSettingsFields = evalFields.filter((field) => field.advanced) + const basicSettingsFields = evalFields.filter((field) => !field.advanced) + + const onSubmit = () => { + try { + } catch (error: any) {} + } + return ( -
- ConfigureNewEvaluator +
+
+ +
+ + +
+
+ + + {selectedEvaluator.name} + + + + + + + + {selectedEvaluator.description} + +
+ +
+
onSubmit} + layout="vertical" + className={classes.formContainer} + > + + + + + {basicSettingsFields.map((field) => ( + + ))} + + {advancedSettingsFields.length > 0 && ( + + )} + +
+ + + + + +
+ + {debugEvaluator && ( + <> + + +
+ + + Debug evaluator + + + Test your evaluator by generating a test data + + + + + + Generate test data + + + + + + + + +
+ JSON + +
+ +
+ + Output + + + + +
+
+ + )} +
+ + setOpenVariantModal(false)} + />
) } From a75d801ecda25bd477fcc5f0ab9446bb0182d37a Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 27 Aug 2024 11:07:35 +0100 Subject: [PATCH 058/149] fix(frontend): displayed evaluator table content --- .../ConfigureEvaluators/EvaluatorList.tsx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx index cda67047dd..a288b382c9 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx @@ -1,7 +1,7 @@ import {EvaluatorConfig} from "@/lib/Types" import {MoreOutlined} from "@ant-design/icons" import {Copy, GearSix, Note, Trash} from "@phosphor-icons/react" -import {Button, Dropdown, Table} from "antd" +import {Button, Dropdown, Table, Tag} from "antd" import {ColumnsType} from "antd/es/table" import React, {useState} from "react" @@ -28,6 +28,9 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { onHeaderCell: () => ({ style: {minWidth: 400}, }), + render: (_, record) => { + return
{record.name}
+ }, }, { title: "Type", @@ -36,6 +39,9 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { onHeaderCell: () => ({ style: {minWidth: 200}, }), + render: (_, record) => { + return {record.evaluator_key} + }, }, { title: "Tags", @@ -114,9 +120,8 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { columns={columns} rowKey={"id"} dataSource={evaluatorConfigs} - scroll={{x: true, y: 550}} + scroll={{x: true}} bordered - pagination={false} onRow={(record) => ({ style: {cursor: "pointer"}, onClick: () => {}, From ebb53012e7383bac1071ca7ad4157d51bb22545a Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 27 Aug 2024 13:54:55 +0100 Subject: [PATCH 059/149] design(frontend): ui improvements --- .../DynamicFormField.tsx | 2 +- .../ConfigureNewEvaluator/index.tsx | 87 +++++++++++++++---- 2 files changed, 70 insertions(+), 19 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx index 3b7f2f6edc..1bab9a6e8c 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx @@ -108,7 +108,7 @@ export const DynamicFormField: React.FC = ({ ) : type === "code" ? ( ({ "& .ant-form-item": { marginBottom: 0, }, + "& .ant-form-item-label": { + paddingBottom: theme.paddingXXS, + }, + }, + formTitleText: { + fontSize: theme.fontSize, + lineHeight: theme.lineHeight, + fontWeight: theme.fontWeightMedium, }, })) @@ -144,21 +152,60 @@ const ConfigureNewEvaluator = ({ layout="vertical" className={classes.formContainer} > - - - - - {basicSettingsFields.map((field) => ( - - ))} + + + Identifier + + +
+ + + + +
{ - setSelectedRowKeys(selectedRowKeys) - }, - }} - className="ph-no-capture" - columns={columns} - rowKey={"id"} - dataSource={[]} - scroll={{x: true}} - bordered - pagination={false} - onRow={(record) => ({ - style: {cursor: "pointer"}, - onClick: () => {}, - })} - /> - +
{ + setSelectedRowKeys(selectedRowKeys) + }, + }} + className="ph-no-capture" + columns={columns} + rowKey={"id"} + dataSource={evaluationList} + scroll={{x: true}} + bordered + pagination={false} + onRow={(record) => ({ + style: {cursor: "pointer"}, + onClick: () => {}, + })} + /> {isConfigEvaluatorModalOpen === "open" && ( ({ @@ -32,28 +41,72 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ })) const EvaluationsPage = () => { + const appId = useAppId() const classes = useStyles() + const [autoEvaluationList, setAutoEvaluationList] = useState<_Evaluation[]>([]) + const [singleModelEvalList, setSingleModelEvalList] = useState< + SingleModelEvaluationListTableDataType[] + >([]) + const [abTestingEvalList, setAbTestingEvalList] = useState( + [], + ) const [selectedEvaluation, setSelectedEvaluation] = useQueryParam( "selectedEvaluation", "auto_evaluation", ) + const [fetchingEvaluations, setFetchingEvaluations] = useState(false) + + useEffect(() => { + if (!appId) return + + setFetchingEvaluations(true) + Promise.all([ + fetchAllEvaluations(appId), + fetchSingleModelEvaluationResult(appId), + fetchAbTestingEvaluationResult(appId), + ]) + .then(([autoEvalResult, singleModelEvalResult, abTestingEvalResult]) => { + setAutoEvaluationList(autoEvalResult) + setSingleModelEvalList(singleModelEvalResult as any) + setAbTestingEvalList(abTestingEvalResult) + }) + .catch(console.error) + .finally(() => setFetchingEvaluations(false)) + }, [appId]) const items: TabsProps["items"] = [ { key: "auto_evaluation", label: "Automatic Evaluation", icon: , - children: , + children: ( + + ), }, { key: "ab_testing_evaluation", label: "A/B Testing Evaluation", icon: , + children: ( + + ), }, { key: "single_model_evaluation", label: "Single Model Evaluation", icon: , + children: ( + + ), }, ] From a3380534fc33dfb697af98172c0f32ecdf2c46dd Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 29 Aug 2024 13:38:48 +0100 Subject: [PATCH 063/149] fix(frontend): fixed config evaluator modal alignment and updated human eval table --- .../AbTestingEvaluation.tsx | 364 +++++++++++++++++- .../ConfigureEvaluators/index.tsx | 6 +- .../CreateNewEvaluator/index.tsx | 6 +- .../SingleModelEvaluation.tsx | 229 ++++++++++- 4 files changed, 595 insertions(+), 10 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx b/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx index c2cf16b993..358626d3d6 100644 --- a/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx @@ -1,13 +1,373 @@ import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEvaluationResult" -import React from "react" +import {getColorFromStr} from "@/lib/helpers/colors" +import {getVotesPercentage} from "@/lib/helpers/evaluate" +import {getInitials, isDemo} from "@/lib/helpers/utils" +import {variantNameWithRev} from "@/lib/helpers/variantHelper" +import {JSSTheme} from "@/lib/Types" +import {MoreOutlined} from "@ant-design/icons" +import { + ArrowsLeftRight, + Columns, + Database, + GearSix, + Note, + Plus, + Rocket, + Trash, +} from "@phosphor-icons/react" +import {Avatar, Button, Dropdown, Space, Statistic, Table, Typography} from "antd" +import {ColumnsType} from "antd/es/table" +import {useRouter} from "next/router" +import React, {useState} from "react" +import {createUseStyles} from "react-jss" interface 
AbTestingEvaluationProps { evaluationList: HumanEvaluationListTableDataType[] fetchingEvaluations: boolean } +const useStyles = createUseStyles((theme: JSSTheme) => ({ + button: { + display: "flex", + alignItems: "center", + }, + statFlag: { + lineHeight: theme.lineHeight, + "& .ant-statistic-content-value": { + fontSize: theme.fontSize, + color: theme.colorError, + }, + "& .ant-statistic-content-suffix": { + fontSize: theme.fontSize, + color: theme.colorError, + }, + }, + stat: { + lineHeight: theme.lineHeight, + "& .ant-statistic-content-value": { + fontSize: theme.fontSize, + color: theme.colorPrimary, + }, + "& .ant-statistic-content-suffix": { + fontSize: theme.fontSize, + color: theme.colorPrimary, + }, + }, + statGood: { + lineHeight: theme.lineHeight, + "& .ant-statistic-content-value": { + fontSize: theme.fontSize, + color: theme.colorSuccess, + }, + "& .ant-statistic-content-suffix": { + fontSize: theme.fontSize, + color: theme.colorSuccess, + }, + }, +})) + const AbTestingEvaluation = ({evaluationList, fetchingEvaluations}: AbTestingEvaluationProps) => { - return
AbTestingEvaluation
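Editorial note: the table built below defines its base columns up front and then appends environment-specific ones, for example a User column only when isDemo() returns true. A minimal sketch of that conditional-columns pattern, with an invented Row type for illustration:

```tsx
import {Tag} from "antd"
import type {ColumnsType} from "antd/es/table"

interface Row {
    testsetName: string
    username?: string
}

// Base columns are always present; extra columns are pushed conditionally,
// the way the table below adds a "User" column only in demo mode.
function buildColumns(isDemoMode: boolean): ColumnsType<Row> {
    const columns: ColumnsType<Row> = [
        {title: "Test set", dataIndex: "testsetName", key: "testsetName"},
    ]

    if (isDemoMode) {
        columns.push({
            title: "User",
            dataIndex: "username",
            key: "username",
            render: (value: string) => <Tag>{value}</Tag>,
        })
    }

    return columns
}
```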
+ const classes = useStyles() + const router = useRouter() + const appId = router.query.app_id as string + const [selectedRowKeys, setSelectedRowKeys] = useState([]) + + const handleNavigation = (variantName: string, revisionNum: string) => { + router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) + } + + const columns: ColumnsType = [ + { + title: "Variant A", + dataIndex: "variantNames", + key: "variant1", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (value, record) => { + return ( +
+ {variantNameWithRev({ + variant_name: value[0], + revision: record.revisions[0], + })} +
+ ) + }, + }, + { + title: "Variant B", + dataIndex: "variantNames", + key: "variant2", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (value, record) => { + return ( +
+ {variantNameWithRev({ + variant_name: value[1], + revision: record.revisions[1], + })} +
+ ) + }, + }, + { + title: "Test set", + dataIndex: "testsetName", + key: "testsetName", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (_, record: HumanEvaluationListTableDataType, index: number) => { + return {record.testset.name} + }, + }, + { + title: "Results", + key: "results", + onHeaderCell: () => ({ + style: {minWidth: 240}, + }), + render: (_, record: HumanEvaluationListTableDataType) => { + const stat1 = getVotesPercentage(record, 0) + const stat2 = getVotesPercentage(record, 1) + + return ( +
+ + | + +
+ ) + }, + }, + { + title: "Both are good", + dataIndex: "positive", + key: "positive", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (_, record: HumanEvaluationListTableDataType) => { + let percentage = record.votesData.positive_votes.percentage + return ( + + + + ) + }, + }, + { + title: "Flag", + dataIndex: "flag", + key: "flag", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (value: any, record: HumanEvaluationListTableDataType) => { + let percentage = record.votesData.flag_votes.percentage + return ( + + + + ) + }, + }, + ] + + if (isDemo()) { + columns.push({ + title: "User", + dataIndex: ["user", "username"], + key: "username", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (_, record: any) => { + return ( + + + {getInitials(record.user.username)} + + {record.user.username} + + ) + }, + }) + } + + columns.push( + ...([ + { + title: "Created on", + dataIndex: "createdAt", + key: "createdAt", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: , + key: "key", + width: 56, + fixed: "right", + align: "center", + render: (_: any, record: HumanEvaluationListTableDataType) => { + return ( + , + onClick: (e) => { + e.domEvent.stopPropagation() + router.push( + `/apps/${appId}/annotations/human_a_b_testing/${record.key}`, + ) + }, + }, + { + key: "variant1", + label: "View variant 1", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + handleNavigation( + record.variantNames[0], + record.revisions[0], + ) + }, + }, + { + key: "variant2", + label: "View variant 2", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + handleNavigation( + record.variantNames[1], + record.revisions[1], + ) + }, + }, + { + key: "view_testset", + label: "View test set", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + router.push( + `/apps/${appId}/testsets/${record.testset._id}`, + ) + }, + }, + {type: "divider"}, + { + key: "delete_eval", + label: "Delete", + icon: , + danger: true, + onClick: (e) => { + e.domEvent.stopPropagation() + }, + }, + ], + }} + > + + + + + + + + +
{ + setSelectedRowKeys(selectedRowKeys) + }, + }} + className="ph-no-capture" + columns={columns} + rowKey={"id"} + dataSource={evaluationList} + scroll={{x: true}} + bordered + pagination={false} + onRow={(record) => ({ + style: {cursor: "pointer"}, + onClick: () => {}, + })} + /> + + ) } export default AbTestingEvaluation diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx index e35ad9ae6a..8ee0459f0f 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx @@ -1,7 +1,7 @@ import {Evaluator, EvaluatorConfig, JSSTheme} from "@/lib/Types" import {CloseOutlined, PlusOutlined} from "@ant-design/icons" import {Cards, Table} from "@phosphor-icons/react" -import {Button, Divider, Input, Radio, Space, Typography} from "antd" +import {Button, Divider, Flex, Input, Radio, Space, Typography} from "antd" import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import EvaluatorCard from "./EvaluatorCard" @@ -102,7 +102,7 @@ const ConfigureEvaluatorModal = ({ ), )} - + - + diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx index 2b1f605b43..2b31a706ed 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx @@ -1,7 +1,7 @@ import {Evaluator, JSSTheme} from "@/lib/Types" import {CloseOutlined} from "@ant-design/icons" import {ArrowLeft, Cards, Table} from "@phosphor-icons/react" -import {Button, Divider, Input, Radio, Space, Typography} from "antd" +import {Button, Divider, Flex, Input, Radio, Space, Typography} from "antd" import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import CreateEvaluatorList from "./CreateEvaluatorList" @@ -119,7 +119,7 @@ const CreateNewEvaluator = ({ )} )} - + setSearchTerm(e.target.value)} @@ -137,7 +137,7 @@ const CreateNewEvaluator = ({ - + {evaluatorsDisplay !== "list" ? :
} diff --git a/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx b/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx index dc7c97a424..b2500c0892 100644 --- a/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx @@ -1,16 +1,241 @@ -import {SingleModelEvaluationListTableDataType} from "@/lib/Types" +import {EvaluationType} from "@/lib/enums" +import {calculateResultsDataAvg} from "@/lib/helpers/evaluate" +import {variantNameWithRev} from "@/lib/helpers/variantHelper" +import {JSSTheme, SingleModelEvaluationListTableDataType} from "@/lib/Types" +import {MoreOutlined} from "@ant-design/icons" +import {ArrowsLeftRight, Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" +import {Button, Dropdown, Space, Statistic, Table} from "antd" +import {ColumnsType} from "antd/es/table" +import {useRouter} from "next/router" import React from "react" +import {createUseStyles} from "react-jss" interface SingleModelEvaluationProps { evaluationList: SingleModelEvaluationListTableDataType[] fetchingEvaluations: boolean } +const useStyles = createUseStyles((theme: JSSTheme) => ({ + button: { + display: "flex", + alignItems: "center", + }, + stat: { + lineHeight: theme.lineHeight, + "& .ant-statistic-content-value": { + fontSize: theme.fontSize, + color: theme.colorPrimary, + }, + "& .ant-statistic-content-suffix": { + fontSize: theme.fontSize, + color: theme.colorPrimary, + }, + }, +})) + const SingleModelEvaluation = ({ evaluationList, fetchingEvaluations, }: SingleModelEvaluationProps) => { - return
SingleModelEvaluation
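Editorial note: the Average score column defined below collapses three possible result shapes into one percentage: explicit score buckets, a results map averaged with a multiplier, or a pre-computed avgScore. A condensed sketch of that precedence, with simplified types and the averaging function passed in rather than imported:

```ts
interface ScoreRow {
    scoresData?: {correct?: unknown[]; true?: unknown[]; nb_of_rows: number}
    resultsData?: Record<string, number>
    avgScore?: number
}

// Mirrors the precedence used below: score buckets first, then an averaged
// results map, then a pre-computed avgScore; always returns a 0-100 value.
function averageScorePercent(
    row: ScoreRow,
    resultsAvg: (data: Record<string, number>) => number,
): number {
    if (row.scoresData) {
        const correct = row.scoresData.correct?.length ?? row.scoresData.true?.length ?? 0
        return (correct / row.scoresData.nb_of_rows) * 100
    }
    if (row.resultsData) {
        const score = resultsAvg(row.resultsData)
        return Number.isNaN(score) ? 0 : score
    }
    if (typeof row.avgScore === "number") {
        return row.avgScore * 100
    }
    return 0
}
```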
+ const classes = useStyles() + const router = useRouter() + const appId = router.query.app_id as string + + const handleNavigation = (variantName: string, revisionNum: string) => { + router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) + } + + const columns: ColumnsType = [ + { + title: "Variant", + dataIndex: "variants", + key: "variants", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (value, record: SingleModelEvaluationListTableDataType) => { + return ( + + {variantNameWithRev({ + variant_name: value[0].variantName, + revision: record.revisions[0], + })} + + ) + }, + }, + { + title: "Test set", + dataIndex: "testsetName", + key: "testsetName", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (_, record) => { + return {record.testset.name} + }, + }, + { + title: "Average score", + dataIndex: "averageScore", + key: "averageScore", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + render: (_, record) => { + let score = 0 + if (record.scoresData) { + score = + ((record.scoresData.correct?.length || + record.scoresData.true?.length || + 0) / + record.scoresData.nb_of_rows) * + 100 + } else if (record.resultsData) { + const multiplier = { + [EvaluationType.auto_webhook_test]: 100, + [EvaluationType.single_model_test]: 1, + } + score = calculateResultsDataAvg( + record.resultsData, + multiplier[record.evaluationType as keyof typeof multiplier], + ) + score = isNaN(score) ? 0 : score + } else if (record.avgScore) { + score = record.avgScore * 100 + } + + return ( + + + + ) + }, + }, + { + title: "Created on", + dataIndex: "createdAt", + key: "createdAt", + onHeaderCell: () => ({ + style: {minWidth: 160}, + }), + }, + { + title: , + key: "key", + width: 56, + fixed: "right", + align: "center", + render: (_, record) => { + return ( + , + onClick: (e) => { + e.domEvent.stopPropagation() + router.push( + `/apps/${appId}/annotations/single_model_test/${record.key}`, + ) + }, + }, + { + key: "variant", + label: "View variant", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + handleNavigation( + record.variants[0].variantName, + record.revisions[0], + ) + }, + }, + { + key: "view_testset", + label: "View test set", + icon: , + onClick: (e) => { + e.domEvent.stopPropagation() + router.push(`/apps/${appId}/testsets/${record.testset._id}`) + }, + }, + {type: "divider"}, + { + key: "delete_eval", + label: "Delete", + icon: , + danger: true, + onClick: (e) => { + e.domEvent.stopPropagation() + }, + }, + ], + }} + > + + + + + + +
+ +
({ + style: {cursor: "pointer"}, + onClick: () => {}, + })} + /> + + ) } export default SingleModelEvaluation From 35d5eb3f984c716a0dfca489bcd117fb967c2483 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Fri, 30 Aug 2024 22:16:29 +0600 Subject: [PATCH 064/149] ui(frontend): implemented human evaluations --- .../AbTestingEvaluation.tsx | 3 +- .../SingleModelEvaluation.tsx | 3 +- .../AbTestingEvalOverview.tsx | 121 ++++++++++++++--- .../SingleModelEvalOverview.tsx | 122 +++++++++++++++--- .../pages/apps/[app_id]/overview/index.tsx | 4 +- 5 files changed, 213 insertions(+), 40 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx b/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx index c2cf16b993..4918145dff 100644 --- a/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx @@ -1,5 +1,6 @@ import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEvaluationResult" import React from "react" +import AbTestingEvalOverview from "@/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview" interface AbTestingEvaluationProps { evaluationList: HumanEvaluationListTableDataType[] @@ -7,7 +8,7 @@ interface AbTestingEvaluationProps { } const AbTestingEvaluation = ({evaluationList, fetchingEvaluations}: AbTestingEvaluationProps) => { - return
AbTestingEvaluation
+ return } export default AbTestingEvaluation diff --git a/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx b/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx index dc7c97a424..74b5362218 100644 --- a/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx @@ -1,5 +1,6 @@ import {SingleModelEvaluationListTableDataType} from "@/lib/Types" import React from "react" +import SingleModelEvalOverview from "@/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview" interface SingleModelEvaluationProps { evaluationList: SingleModelEvaluationListTableDataType[] @@ -10,7 +11,7 @@ const SingleModelEvaluation = ({ evaluationList, fetchingEvaluations, }: SingleModelEvaluationProps) => { - return
SingleModelEvaluation
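Editorial note: the overview components extended in this patch add bulk deletion driven by the table's row selection: collect the selected keys, call the delete service, then prune local state. A minimal sketch of that handler, with the service call and state setters passed as parameters instead of the components' real imports:

```ts
import type {Key} from "react"

// Deletes every selected evaluation, then removes them from the local list
// and clears the selection, as the overview tables below do.
async function deleteSelected(
    selectedRowKeys: Key[],
    deleteEvaluations: (ids: string[]) => Promise<void>,
    setList: (update: (prev: {key: string}[]) => {key: string}[]) => void,
    setSelectedRowKeys: (keys: Key[]) => void,
): Promise<void> {
    const ids = selectedRowKeys.map((key) => key.toString())
    await deleteEvaluations(ids)
    setList((prev) => prev.filter((evaluation) => !ids.includes(evaluation.key)))
    setSelectedRowKeys([])
}
```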
+ return } export default SingleModelEvaluation diff --git a/agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx b/agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx index efbcdadb58..397e035320 100644 --- a/agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx +++ b/agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx @@ -14,7 +14,7 @@ import { fetchEvaluationResults, } from "@/services/human-evaluations/api" import {MoreOutlined, PlusOutlined} from "@ant-design/icons" -import {Database, GearSix, Note, Rocket, Trash} from "@phosphor-icons/react" +import {ArrowsLeftRight, Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" import {Avatar, Button, Dropdown, message, Space, Spin, Statistic, Table, Typography} from "antd" import {ColumnsType} from "antd/es/table" import {useRouter} from "next/router" @@ -62,9 +62,13 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ color: theme.colorSuccess, }, }, + button: { + display: "flex", + alignItems: "center", + }, })) -const AbTestingEvalOverview = () => { +const AbTestingEvalOverview = ({viewType}: {viewType: "evaluation" | "overview"}) => { const classes = useStyles() const router = useRouter() const appId = router.query.app_id as string @@ -74,6 +78,8 @@ const AbTestingEvalOverview = () => { const [isEvalModalOpen, setIsEvalModalOpen] = useState(false) const [selectedEvalRecord, setSelectedEvalRecord] = useState() const [isDeleteEvalModalOpen, setIsDeleteEvalModalOpen] = useState(false) + const [isDeleteMultipleEvalModalOpen, setIsDeleteMultipleEvalModalOpen] = useState(false) + const [selectedRowKeys, setSelectedRowKeys] = useState([]) useEffect(() => { if (!appId) return @@ -102,9 +108,8 @@ const AbTestingEvalOverview = () => { new Date(b.createdAt || 0).getTime() - new Date(a.createdAt || 0).getTime(), ) - .slice(0, 5) - setEvaluationsList(results) + setEvaluationsList(viewType === "overview" ? results.slice(0, 5) : results) } catch (error) { console.error(error) } finally { @@ -119,6 +124,31 @@ const AbTestingEvalOverview = () => { router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) } + const rowSelection = { + onChange: (selectedRowKeys: React.Key[]) => { + setSelectedRowKeys(selectedRowKeys) + }, + } + + const handleDeleteMultipleEvaluations = async () => { + const evaluationsIds = selectedRowKeys.map((key) => key.toString()) + try { + setFetchingEvaluations(true) + await deleteEvaluations(evaluationsIds) + setEvaluationsList((prevEvaluationsList) => + prevEvaluationsList.filter( + (evaluation) => !evaluationsIds.includes(evaluation.key), + ), + ) + setSelectedRowKeys([]) + message.success("Evaluations Deleted") + } catch (error) { + console.error(error) + } finally { + setFetchingEvaluations(false) + } + } + const handleDeleteEvaluation = async (record: HumanEvaluationListTableDataType) => { try { setFetchingEvaluations(true) @@ -381,25 +411,68 @@ const AbTestingEvalOverview = () => { return (
-
- - A/B Testing Evaluations - + + + +
+ ) : ( +
+ - - -
+ + + + +
+ )}
{ evaluationType={"a/b testing evaluation"} /> )} + + {isDeleteMultipleEvalModalOpen && ( + setIsDeleteMultipleEvalModalOpen(false)} + onOk={async () => { + await handleDeleteMultipleEvaluations() + setIsDeleteMultipleEvalModalOpen(false) + }} + evaluationType={"a/b testing evaluation"} + /> + )} ) } diff --git a/agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx b/agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx index 02d4961289..6b585fcbde 100644 --- a/agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx +++ b/agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx @@ -14,7 +14,7 @@ import { fetchEvaluationResults, } from "@/services/human-evaluations/api" import {MoreOutlined, PlusOutlined} from "@ant-design/icons" -import {Database, GearSix, Note, Rocket, Trash} from "@phosphor-icons/react" +import {ArrowsLeftRight, Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" import {Button, Dropdown, message, Space, Spin, Statistic, Table, Typography} from "antd" import {ColumnsType} from "antd/es/table" import {useRouter} from "next/router" @@ -42,9 +42,13 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ color: theme.colorPrimary, }, }, + button: { + display: "flex", + alignItems: "center", + }, })) -const SingleModelEvalOverview = () => { +const SingleModelEvalOverview = ({viewType}: {viewType: "evaluation" | "overview"}) => { const classes = useStyles() const router = useRouter() const appId = router.query.app_id as string @@ -57,6 +61,8 @@ const SingleModelEvalOverview = () => { const [selectedEvalRecord, setSelectedEvalRecord] = useState() const [isDeleteEvalModalOpen, setIsDeleteEvalModalOpen] = useState(false) + const [isDeleteEvalMultipleModalOpen, setIsDeleteEvalMultipleModalOpen] = useState(false) + const [selectedRowKeys, setSelectedRowKeys] = useState([]) useEffect(() => { if (!appId) return @@ -88,9 +94,10 @@ const SingleModelEvalOverview = () => { new Date(b?.createdAt ?? 0).getTime() - new Date(a?.createdAt ?? 0).getTime(), ) - .slice(0, 5) - setEvaluationsList(newEvalResults as any) + setEvaluationsList( + viewType === "overview" ? newEvalResults.slice(0, 5) : (newEvalResults as any), + ) } catch (error) { console.error(error) } finally { @@ -101,6 +108,31 @@ const SingleModelEvalOverview = () => { fetchEvaluations() }, [appId]) + const rowSelection = { + onChange: (selectedRowKeys: React.Key[]) => { + setSelectedRowKeys(selectedRowKeys) + }, + } + + const handleDeleteMultipleEvaluations = async () => { + const evaluationsIds = selectedRowKeys.map((key) => key.toString()) + try { + setFetchingEvaluations(true) + await deleteEvaluations(evaluationsIds) + setEvaluationsList((prevEvaluationsList) => + prevEvaluationsList.filter( + (evaluation) => !evaluationsIds.includes(evaluation.key), + ), + ) + setSelectedRowKeys([]) + message.success("Evaluations Deleted") + } catch (error) { + console.error(error) + } finally { + setFetchingEvaluations(false) + } + } + const handleNavigation = (variantName: string, revisionNum: string) => { router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) } @@ -274,26 +306,69 @@ const SingleModelEvalOverview = () => { return (
-
- - Single Model Evaluations + {viewType === "overview" ? ( +
+ + Single Model Evaluations - + + + +
+ ) : ( +
+ - - -
+ + + + +
+ )}
{ evaluationType={"single model evaluation"} /> )} + {isDeleteEvalMultipleModalOpen && ( + setIsDeleteEvalMultipleModalOpen(false)} + onOk={async () => { + await handleDeleteMultipleEvaluations() + setIsDeleteEvalMultipleModalOpen(false) + }} + evaluationType={"single model evaluation"} + /> + )} ) } diff --git a/agenta-web/src/pages/apps/[app_id]/overview/index.tsx b/agenta-web/src/pages/apps/[app_id]/overview/index.tsx index 23434e9ed6..7150478950 100644 --- a/agenta-web/src/pages/apps/[app_id]/overview/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/overview/index.tsx @@ -165,9 +165,9 @@ export default function Overview() { - + - + {currentApp && ( Date: Fri, 30 Aug 2024 23:25:23 +0100 Subject: [PATCH 065/149] fix(frontend): added evaluator mapping endpoints --- agenta-web/src/lib/Types.ts | 9 +++++++++ agenta-web/src/services/evaluations/api/index.ts | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 127643d793..d71918b71f 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -357,6 +357,15 @@ export interface Evaluator { oss?: boolean } +export interface EvaluatorMappingInput { + inputs: Record + mapping: Record +} + +export interface EvaluatorMappingOutput { + outputs: Record +} + export interface EvaluatorConfig { id: string evaluator_key: string diff --git a/agenta-web/src/services/evaluations/api/index.ts b/agenta-web/src/services/evaluations/api/index.ts index c6cb31cd34..aefa0edeb0 100644 --- a/agenta-web/src/services/evaluations/api/index.ts +++ b/agenta-web/src/services/evaluations/api/index.ts @@ -6,6 +6,8 @@ import { EvaluationStatus, Evaluator, EvaluatorConfig, + EvaluatorMappingInput, + EvaluatorMappingOutput, KeyValuePair, LLMRunRateLimit, TestSet, @@ -65,6 +67,18 @@ export const fetchAllEvaluators = async () => { return evaluators } +export const createEvaluatorDataMapping = async ( + config: EvaluatorMappingInput, +): Promise => { + const response = await axios.post("/api/evaluators/map/", {...config}) + return response.data +} + +export const createEvaluatorRunExecution = async (evaluatorKey: string) => { + const response = await axios.post(`/api/evaluators/${evaluatorKey}/run/`) + return response.data +} + // Evaluator Configs export const fetchAllEvaluatorConfigs = async (appId: string) => { const response = await axios.get(`/api/evaluators/configs/`, {params: {app_id: appId}}) From 41745ff9aa821140a0da3f25ac8d87a6ded35540 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 31 Aug 2024 23:58:02 +0100 Subject: [PATCH 066/149] design(frontend): improved config evaluator ui --- .../ConfigureEvaluators/index.tsx | 8 ++- .../EvaluatorVariantModal.tsx | 57 +++++++++++++++-- .../ConfigureNewEvaluator/index.tsx | 63 ++++++++++++++++--- .../EvaluatorsModal/EvaluatorsModal.tsx | 51 +++++++++++---- .../TestcaseTab/TestcaseTab.tsx | 33 ++++++++++ 5 files changed, 182 insertions(+), 30 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx index 8ee0459f0f..840fa52520 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx +++ 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx
@@ -1,7 +1,7 @@
 import {Evaluator, EvaluatorConfig, JSSTheme} from "@/lib/Types"
 import {CloseOutlined, PlusOutlined} from "@ant-design/icons"
 import {Cards, Table} from "@phosphor-icons/react"
-import {Button, Divider, Flex, Input, Radio, Space, Typography} from "antd"
+import {Button, Divider, Flex, Input, Radio, Space, Spin, Typography} from "antd"
 import React, {useMemo, useState} from "react"
 import {createUseStyles} from "react-jss"
 import EvaluatorCard from "./EvaluatorCard"
@@ -12,6 +12,7 @@ type ConfigureEvaluatorModalProps = {
     handleOnCancel: () => void
     setCurrent: React.Dispatch>
     setSelectedEvaluator: React.Dispatch>
+    fetchingEvalConfigs: boolean
 }
 
 const useStyles = createUseStyles((theme: JSSTheme) => ({
@@ -55,6 +56,7 @@ const ConfigureEvaluatorModal = ({
     handleOnCancel,
     setCurrent,
     setSelectedEvaluator,
+    fetchingEvalConfigs,
 }: ConfigureEvaluatorModalProps) => {
     const classes = useStyles()
     const [searchTerm, setSearchTerm] = useState("")
@@ -126,13 +128,13 @@ const ConfigureEvaluatorModal = ({
-
+ {evaluatorsDisplay === "list" ? ( ) : ( )} -
+ ) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx index 6a29dd3d30..31aa00bf1d 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx @@ -1,12 +1,26 @@ import {formatVariantIdWithHash} from "@/lib/helpers/utils" import {JSSTheme, Variant} from "@/lib/Types" import {CloseOutlined} from "@ant-design/icons" -import {Badge, Button, Divider, Input, Modal, Table, Tag, theme, Typography} from "antd" +import { + Badge, + Button, + Divider, + Flex, + Input, + Modal, + Space, + Table, + Tag, + theme, + Typography, +} from "antd" import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" type EvaluatorVariantModalProps = { variants: Variant[] | null + setSelectedVariant: React.Dispatch> + selectedVariant: Variant | null } & React.ComponentProps const {useToken} = theme @@ -25,8 +39,6 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ "& .ant-modal-body": { paddingLeft: 24, paddingRight: 24, - height: 300, - overflowY: "auto", }, }, table: { @@ -41,11 +53,15 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const EvaluatorVariantModal = ({variants, ...props}: EvaluatorVariantModalProps) => { +const EvaluatorVariantModal = ({ + variants, + setSelectedVariant, + selectedVariant, + ...props +}: EvaluatorVariantModalProps) => { const classes = useStyles() const {token} = useToken() const [searchTerm, setSearchTerm] = useState("") - const [selectedVariant, setSelectedVariant] = useState() const filtered = useMemo(() => { if (!searchTerm) return variants @@ -100,7 +116,16 @@ const EvaluatorVariantModal = ({variants, ...props}: EvaluatorVariantModalProps) render: (_, record) => { return (
-
{record.variantName}
+ +
{record.variantName}
+ {selectedVariant?.variantId === record.variantId ? ( + + Selected + + ) : ( + "" + )} +
+ + + + + ) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx index cba1c6d5da..17f9855542 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx @@ -1,4 +1,4 @@ -import {Evaluator, JSSTheme, Variant} from "@/lib/Types" +import {Evaluator, JSSTheme, TestSet, Variant} from "@/lib/Types" import {CloseOutlined} from "@ant-design/icons" import { ArrowLeft, @@ -9,18 +9,28 @@ import { Lightning, Play, } from "@phosphor-icons/react" -import {Button, Divider, Flex, Form, Input, Select, Space, Tag, Typography} from "antd" +import {Button, Divider, Flex, Form, Input, message, Select, Space, Tag, Typography} from "antd" import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import AdvancedSettings from "./AdvancedSettings" import {DynamicFormField} from "./DynamicFormField" import EvaluatorVariantModal from "./EvaluatorVariantModal" +import { + CreateEvaluationConfigData, + createEvaluatorConfig, + updateEvaluatorConfig, +} from "@/services/evaluations/api" +import {useAppId} from "@/hooks/useAppId" +import {useVariant} from "@/lib/hooks/useVariant" +import {useLocalStorage} from "usehooks-ts" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> handleOnCancel: () => void + onSuccess: () => void selectedEvaluator: Evaluator variants: Variant[] | null + testsets: TestSet[] | null } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -61,11 +71,16 @@ const ConfigureNewEvaluator = ({ selectedEvaluator, handleOnCancel, variants, + testsets, + onSuccess, }: ConfigureNewEvaluatorProps) => { + const appId = useAppId() const classes = useStyles() const [form] = Form.useForm() - const [debugEvaluator, setDebugEvaluator] = useState(false) + const [debugEvaluator, setDebugEvaluator] = useLocalStorage("isDebugSelectionOpen", false) const [openVariantModal, setOpenVariantModal] = useState(false) + const [submitLoading, setSubmitLoading] = useState(false) + const [selectedVariant, setSelectedVariant] = useState(null) const evalFields = useMemo( () => @@ -82,9 +97,29 @@ const ConfigureNewEvaluator = ({ const advancedSettingsFields = evalFields.filter((field) => field.advanced) const basicSettingsFields = evalFields.filter((field) => !field.advanced) - const onSubmit = () => { + const onSubmit = (values: CreateEvaluationConfigData) => { try { - } catch (error: any) {} + setSubmitLoading(true) + if (!selectedEvaluator.key) throw new Error("No selected key") + const settingsValues = values.settings_values || {} + + const data = { + ...values, + evaluator_key: selectedEvaluator.key, + settings_values: settingsValues, + } + ;(false + ? updateEvaluatorConfig("initialValues?.id"!, data) + : createEvaluatorConfig(appId, data) + ) + .then(onSuccess) + .catch(console.error) + .finally(() => setSubmitLoading(false)) + } catch (error: any) { + setSubmitLoading(false) + console.error(error) + message.error(error.message) + } } return ( @@ -148,7 +183,7 @@ const ConfigureNewEvaluator = ({ requiredMark={false} form={form} name="new-evaluator" - onFinish={() => onSubmit} + onFinish={onSubmit} layout="vertical" className={classes.formContainer} > @@ -214,8 +249,12 @@ const ConfigureNewEvaluator = ({
- - + + @@ -238,7 +277,11 @@ const ConfigureNewEvaluator = ({ Generate test data - @@ -289,6 +332,8 @@ const ConfigureNewEvaluator = ({ variants={variants} open={openVariantModal} onCancel={() => setOpenVariantModal(false)} + setSelectedVariant={setSelectedVariant} + selectedVariant={selectedVariant} /> ) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index fec6131b4e..d3e8819c53 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -1,6 +1,6 @@ import {useAppId} from "@/hooks/useAppId" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {Evaluator, JSSTheme, Variant} from "@/lib/Types" +import {Evaluator, JSSTheme, TestSet, Variant} from "@/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" import {Modal} from "antd" import {useAtom} from "jotai" @@ -10,6 +10,8 @@ import ConfigureEvaluators from "./ConfigureEvaluators" import CreateNewEvaluator from "./CreateNewEvaluator" import ConfigureNewEvaluator from "./ConfigureNewEvaluator" import {fetchVariants} from "@/services/api" +import {fetchTestsets} from "@/services/testsets/api" +import TestcaseTab from "./TestcaseTab/TestcaseTab" type EvaluatorsModalProps = {} & React.ComponentProps @@ -32,16 +34,28 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) const [selectedEvaluator, setSelectedEvaluator] = useState(null) const [variants, setVariants] = useState(null) + const [testsets, setTestsets] = useState(null) + const [fetchingEvalConfigs, setFetchingEvalConfigs] = useState(false) + + const evalConfigFetcher = () => { + setFetchingEvalConfigs(true) + fetchAllEvaluatorConfigs(appId) + .then(setEvaluatorConfigs) + .catch(console.error) + .finally(() => setFetchingEvalConfigs(false)) + } useEffect(() => { Promise.all([ fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId), fetchVariants(appId), - ]).then(([evaluators, configs, variants]) => { + fetchTestsets(appId), + ]).then(([evaluators, configs, variants, testsets]) => { setEvaluators(evaluators) setEvaluatorConfigs(configs) setVariants(variants) + setTestsets(testsets) }) }, [appId]) @@ -53,6 +67,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { handleOnCancel={() => props.onCancel?.({} as any)} setCurrent={setCurrent} setSelectedEvaluator={setSelectedEvaluator} + fetchingEvalConfigs={fetchingEvalConfigs} /> ), }, @@ -69,16 +84,28 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { ] if (selectedEvaluator) { - steps.push({ - content: ( - props.onCancel?.({} as any)} - variants={variants} - /> - ), - }) + steps.push( + ...[ + { + content: ( + props.onCancel?.({} as any)} + variants={variants} + testsets={testsets} + onSuccess={() => { + evalConfigFetcher() + setCurrent(0) + }} + /> + ), + }, + { + content: setCurrent(2)} />, + }, + ], + ) } return ( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx new file mode 100644 index 0000000000..5f01ad05ff --- /dev/null +++ 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx
@@ -0,0 +1,33 @@
+import {JSSTheme} from "@/lib/Types"
+import {CloseOutlined} from "@ant-design/icons"
+import {Button, Typography} from "antd"
+import React from "react"
+import {createUseStyles} from "react-jss"
+
+interface TestcaseTabProps {
+    handleOnCancel: () => void
+}
+
+const useStyles = createUseStyles((theme: JSSTheme) => ({
+    header: {
+        display: "flex",
+        alignItems: "center",
+        justifyContent: "space-between",
+    },
+}))
+
+const TestcaseTab = ({handleOnCancel}: TestcaseTabProps) => {
+    const classes = useStyles()
+
+    return (
+
+
+ Select test case + +
+
+ ) +} + +export default TestcaseTab From 244e8bd20780f1d4a08979338d98aae512267d7e Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sun, 1 Sep 2024 23:18:28 +0100 Subject: [PATCH 067/149] fix(frontend): implemented select test case functionality --- .../EvaluatorsModal/EvaluatorsModal.tsx | 56 +++--- .../TestcaseTab/TestcaseTab.tsx | 159 +++++++++++++++++- 2 files changed, 186 insertions(+), 29 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index d3e8819c53..29d75ed7bc 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -1,6 +1,6 @@ import {useAppId} from "@/hooks/useAppId" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {Evaluator, JSSTheme, TestSet, Variant} from "@/lib/Types" +import {Evaluator, JSSTheme, testset, Variant} from "@/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" import {Modal} from "antd" import {useAtom} from "jotai" @@ -34,8 +34,9 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) const [selectedEvaluator, setSelectedEvaluator] = useState(null) const [variants, setVariants] = useState(null) - const [testsets, setTestsets] = useState(null) + const [testsets, setTestsets] = useState(null) const [fetchingEvalConfigs, setFetchingEvalConfigs] = useState(false) + const [selectedTestcase, setSelectedTestcase] = useState | null>(null) const evalConfigFetcher = () => { setFetchingEvalConfigs(true) @@ -84,28 +85,35 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { ] if (selectedEvaluator) { - steps.push( - ...[ - { - content: ( - props.onCancel?.({} as any)} - variants={variants} - testsets={testsets} - onSuccess={() => { - evalConfigFetcher() - setCurrent(0) - }} - /> - ), - }, - { - content: setCurrent(2)} />, - }, - ], - ) + steps.push({ + content: ( + props.onCancel?.({} as any)} + variants={variants} + testsets={testsets} + onSuccess={() => { + evalConfigFetcher() + setCurrent(0) + }} + selectedTestcase={selectedTestcase} + /> + ), + }) + + if (testsets && testsets.length) { + steps.push({ + content: ( + setCurrent(2)} + testsets={testsets} + setSelectedTestcase={setSelectedTestcase} + selectedTestcase={selectedTestcase} + /> + ), + }) + } } return ( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx index 5f01ad05ff..ab57765fb6 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx @@ -1,11 +1,16 @@ -import {JSSTheme} from "@/lib/Types" +import {JSSTheme, TestSet, testset} from "@/lib/Types" +import {fetchTestset} from "@/services/testsets/api" import {CloseOutlined} from "@ant-design/icons" -import {Button, Typography} from "antd" -import React from "react" +import {Button, Divider, Input, Menu, Table, Typography} from "antd" +import {ColumnsType} from "antd/es/table" +import React, {useEffect, useMemo, useState} 
from "react" import {createUseStyles} from "react-jss" interface TestcaseTabProps { handleOnCancel: () => void + setSelectedTestcase: React.Dispatch | null>> + testsets: testset[] + selectedTestcase: Record | null } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -13,19 +18,163 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ display: "flex", alignItems: "center", justifyContent: "space-between", + "& .ant-typography": { + fontSize: theme.fontSizeHeading4, + fontWeight: theme.fontWeightStrong, + lineHeight: theme.lineHeightLG, + }, + }, + title: { + fontSize: theme.fontSizeLG, + fontWeight: theme.fontWeightMedium, + lineHeight: theme.lineHeightLG, + }, + sidebar: { + display: "flex", + flexDirection: "column", + gap: theme.padding, + width: 213, + }, + menu: { + height: 550, + overflowY: "auto", + borderInlineEnd: `0px !important`, }, })) -const TestcaseTab = ({handleOnCancel}: TestcaseTabProps) => { +const TestcaseTab = ({ + handleOnCancel, + setSelectedTestcase, + testsets, + selectedTestcase, +}: TestcaseTabProps) => { const classes = useStyles() + const [selectedTestset, setSelectedTestset] = useState(testsets[0]._id) + const [isLoadingTestset, setIsLoadingTestset] = useState(false) + const [testsetCsvData, setTestsetCsvData] = useState([]) + + const [searchTerm, setSearchTerm] = useState("") + + const filteredTestset = useMemo(() => { + if (!searchTerm) return testsets + return testsets.filter((item) => item.name.toLowerCase().includes(searchTerm.toLowerCase())) + }, [searchTerm, testsets]) + + useEffect(() => { + const testsetFetcher = async () => { + try { + setIsLoadingTestset(true) + const data = await fetchTestset(selectedTestset) + setTestsetCsvData(data.csvdata) + } catch (error) { + console.error(error) + } finally { + setIsLoadingTestset(false) + } + } + + testsetFetcher() + }, [selectedTestset]) + + const columnDef = useMemo(() => { + const columns: ColumnsType = [] + + if (testsetCsvData.length > 0) { + const keys = Object.keys(testsetCsvData[0]) + + columns.push( + ...keys.map((key, index) => ({ + title: key, + dataIndex: key, + key: index, + render: (_: any, record: any) => { + return
{record[key]}
+ }, + })), + ) + } + + return columns + }, [testsetCsvData]) return ( -
+
Select test case
+
+
+
+ + Select test case + + + Lorem ipsum, dolor sit amet consectetur adipisicing elit. Itaque culpa + similique reiciendis + +
+ setSearchTerm(e.target.value)} + /> + + + + ({ + key: testset._id, + label: testset.name, + }))} + onSelect={({key}) => { + setSelectedTestset(key) + }} + defaultSelectedKeys={[selectedTestset]} + className={classes.menu} + /> +
+ +
+ Select test cases + +
{ + setSelectedTestcase(selectedRows[0]) + }, + }} + loading={isLoadingTestset} + dataSource={testsetCsvData.map((data, index) => ({...data, id: index}))} + columns={columnDef} + className="flex-1" + bordered + rowKey={"id"} + pagination={false} + scroll={{y: 550}} + /> + +
+ + +
+ + ) } From 54a0d496eb6109233cb049a36b22f57a4f6e6dcc Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 2 Sep 2024 00:57:05 +0100 Subject: [PATCH 068/149] fix(frontend): added conditional to select test case --- .../ConfigureNewEvaluator/index.tsx | 57 +++++++++++++++---- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx index 17f9855542..72db58927b 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx @@ -1,4 +1,4 @@ -import {Evaluator, JSSTheme, TestSet, Variant} from "@/lib/Types" +import {Evaluator, JSSTheme, testset, Variant} from "@/lib/Types" import {CloseOutlined} from "@ant-design/icons" import { ArrowLeft, @@ -9,8 +9,20 @@ import { Lightning, Play, } from "@phosphor-icons/react" -import {Button, Divider, Flex, Form, Input, message, Select, Space, Tag, Typography} from "antd" -import React, {useMemo, useState} from "react" +import { + Button, + Divider, + Flex, + Form, + Input, + message, + Select, + Space, + Tag, + Tooltip, + Typography, +} from "antd" +import React, {useEffect, useMemo, useState} from "react" import {createUseStyles} from "react-jss" import AdvancedSettings from "./AdvancedSettings" import {DynamicFormField} from "./DynamicFormField" @@ -23,6 +35,7 @@ import { import {useAppId} from "@/hooks/useAppId" import {useVariant} from "@/lib/hooks/useVariant" import {useLocalStorage} from "usehooks-ts" +import {getAllVariantParameters} from "@/lib/helpers/variantHelper" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> @@ -30,7 +43,8 @@ type ConfigureNewEvaluatorProps = { onSuccess: () => void selectedEvaluator: Evaluator variants: Variant[] | null - testsets: TestSet[] | null + testsets: testset[] | null + selectedTestcase: Record | null } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -73,6 +87,7 @@ const ConfigureNewEvaluator = ({ variants, testsets, onSuccess, + selectedTestcase, }: ConfigureNewEvaluatorProps) => { const appId = useAppId() const classes = useStyles() @@ -122,6 +137,20 @@ const ConfigureNewEvaluator = ({ } } + useEffect(() => { + if (!selectedVariant) return + + const fetchParameters = async () => { + try { + const {parameters, inputs} = await getAllVariantParameters(appId, selectedVariant) + } catch (error) { + console.error(error) + } + } + + fetchParameters() + }, [selectedVariant]) + return (
@@ -277,14 +306,20 @@ const ConfigureNewEvaluator = ({ Generate test data - + + @@ -393,7 +393,7 @@ const HumanEvaluationModal = ({
{selectedVariants[index]?.variantName || "Select a variant"} - +
@@ -430,6 +430,8 @@ const HumanEvaluationModal = ({ onClick={onStartEvaluation} type="primary" data-cy="start-new-evaluation-button" + icon={} + className="flex items-center" > Start diff --git a/agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx similarity index 97% rename from agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx rename to agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx index 397e035320..a7e71e13eb 100644 --- a/agenta-web/src/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview.tsx +++ b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx @@ -14,7 +14,7 @@ import { fetchEvaluationResults, } from "@/services/human-evaluations/api" import {MoreOutlined, PlusOutlined} from "@ant-design/icons" -import {ArrowsLeftRight, Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" +import {Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" import {Avatar, Button, Dropdown, message, Space, Spin, Statistic, Table, Typography} from "antd" import {ColumnsType} from "antd/es/table" import {useRouter} from "next/router" @@ -68,7 +68,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const AbTestingEvalOverview = ({viewType}: {viewType: "evaluation" | "overview"}) => { +const AbTestingEvaluation = ({viewType}: {viewType: "evaluation" | "overview"}) => { const classes = useStyles() const router = useRouter() const appId = router.query.app_id as string @@ -450,14 +450,6 @@ const AbTestingEvalOverview = ({viewType}: {viewType: "evaluation" | "overview"} > Delete -
)} @@ -522,4 +514,4 @@ const AbTestingEvalOverview = ({viewType}: {viewType: "evaluation" | "overview"} ) } -export default AbTestingEvalOverview +export default AbTestingEvaluation diff --git a/agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx similarity index 96% rename from agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx rename to agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx index 6b585fcbde..ee22eb4655 100644 --- a/agenta-web/src/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview.tsx +++ b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx @@ -14,7 +14,7 @@ import { fetchEvaluationResults, } from "@/services/human-evaluations/api" import {MoreOutlined, PlusOutlined} from "@ant-design/icons" -import {ArrowsLeftRight, Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" +import {Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" import {Button, Dropdown, message, Space, Spin, Statistic, Table, Typography} from "antd" import {ColumnsType} from "antd/es/table" import {useRouter} from "next/router" @@ -48,7 +48,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const SingleModelEvalOverview = ({viewType}: {viewType: "evaluation" | "overview"}) => { +const SingleModelEvaluation = ({viewType}: {viewType: "evaluation" | "overview"}) => { const classes = useStyles() const router = useRouter() const appId = router.query.app_id as string @@ -346,14 +346,6 @@ const SingleModelEvalOverview = ({viewType}: {viewType: "evaluation" | "overview > Delete -
)} @@ -417,4 +409,4 @@ const SingleModelEvalOverview = ({viewType}: {viewType: "evaluation" | "overview ) } -export default SingleModelEvalOverview +export default SingleModelEvaluation diff --git a/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx b/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx deleted file mode 100644 index 358626d3d6..0000000000 --- a/agenta-web/src/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation.tsx +++ /dev/null @@ -1,373 +0,0 @@ -import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEvaluationResult" -import {getColorFromStr} from "@/lib/helpers/colors" -import {getVotesPercentage} from "@/lib/helpers/evaluate" -import {getInitials, isDemo} from "@/lib/helpers/utils" -import {variantNameWithRev} from "@/lib/helpers/variantHelper" -import {JSSTheme} from "@/lib/Types" -import {MoreOutlined} from "@ant-design/icons" -import { - ArrowsLeftRight, - Columns, - Database, - GearSix, - Note, - Plus, - Rocket, - Trash, -} from "@phosphor-icons/react" -import {Avatar, Button, Dropdown, Space, Statistic, Table, Typography} from "antd" -import {ColumnsType} from "antd/es/table" -import {useRouter} from "next/router" -import React, {useState} from "react" -import {createUseStyles} from "react-jss" - -interface AbTestingEvaluationProps { - evaluationList: HumanEvaluationListTableDataType[] - fetchingEvaluations: boolean -} - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - button: { - display: "flex", - alignItems: "center", - }, - statFlag: { - lineHeight: theme.lineHeight, - "& .ant-statistic-content-value": { - fontSize: theme.fontSize, - color: theme.colorError, - }, - "& .ant-statistic-content-suffix": { - fontSize: theme.fontSize, - color: theme.colorError, - }, - }, - stat: { - lineHeight: theme.lineHeight, - "& .ant-statistic-content-value": { - fontSize: theme.fontSize, - color: theme.colorPrimary, - }, - "& .ant-statistic-content-suffix": { - fontSize: theme.fontSize, - color: theme.colorPrimary, - }, - }, - statGood: { - lineHeight: theme.lineHeight, - "& .ant-statistic-content-value": { - fontSize: theme.fontSize, - color: theme.colorSuccess, - }, - "& .ant-statistic-content-suffix": { - fontSize: theme.fontSize, - color: theme.colorSuccess, - }, - }, -})) - -const AbTestingEvaluation = ({evaluationList, fetchingEvaluations}: AbTestingEvaluationProps) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - const [selectedRowKeys, setSelectedRowKeys] = useState([]) - - const handleNavigation = (variantName: string, revisionNum: string) => { - router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) - } - - const columns: ColumnsType = [ - { - title: "Variant A", - dataIndex: "variantNames", - key: "variant1", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (value, record) => { - return ( -
- {variantNameWithRev({ - variant_name: value[0], - revision: record.revisions[0], - })} -
- ) - }, - }, - { - title: "Variant B", - dataIndex: "variantNames", - key: "variant2", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (value, record) => { - return ( -
- {variantNameWithRev({ - variant_name: value[1], - revision: record.revisions[1], - })} -
- ) - }, - }, - { - title: "Test set", - dataIndex: "testsetName", - key: "testsetName", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (_, record: HumanEvaluationListTableDataType, index: number) => { - return {record.testset.name} - }, - }, - { - title: "Results", - key: "results", - onHeaderCell: () => ({ - style: {minWidth: 240}, - }), - render: (_, record: HumanEvaluationListTableDataType) => { - const stat1 = getVotesPercentage(record, 0) - const stat2 = getVotesPercentage(record, 1) - - return ( -
- - | - -
- ) - }, - }, - { - title: "Both are good", - dataIndex: "positive", - key: "positive", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (_, record: HumanEvaluationListTableDataType) => { - let percentage = record.votesData.positive_votes.percentage - return ( - - - - ) - }, - }, - { - title: "Flag", - dataIndex: "flag", - key: "flag", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (value: any, record: HumanEvaluationListTableDataType) => { - let percentage = record.votesData.flag_votes.percentage - return ( - - - - ) - }, - }, - ] - - if (isDemo()) { - columns.push({ - title: "User", - dataIndex: ["user", "username"], - key: "username", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (_, record: any) => { - return ( - - - {getInitials(record.user.username)} - - {record.user.username} - - ) - }, - }) - } - - columns.push( - ...([ - { - title: "Created on", - dataIndex: "createdAt", - key: "createdAt", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - }, - { - title: , - key: "key", - width: 56, - fixed: "right", - align: "center", - render: (_: any, record: HumanEvaluationListTableDataType) => { - return ( - , - onClick: (e) => { - e.domEvent.stopPropagation() - router.push( - `/apps/${appId}/annotations/human_a_b_testing/${record.key}`, - ) - }, - }, - { - key: "variant1", - label: "View variant 1", - icon: , - onClick: (e) => { - e.domEvent.stopPropagation() - handleNavigation( - record.variantNames[0], - record.revisions[0], - ) - }, - }, - { - key: "variant2", - label: "View variant 2", - icon: , - onClick: (e) => { - e.domEvent.stopPropagation() - handleNavigation( - record.variantNames[1], - record.revisions[1], - ) - }, - }, - { - key: "view_testset", - label: "View test set", - icon: , - onClick: (e) => { - e.domEvent.stopPropagation() - router.push( - `/apps/${appId}/testsets/${record.testset._id}`, - ) - }, - }, - {type: "divider"}, - { - key: "delete_eval", - label: "Delete", - icon: , - danger: true, - onClick: (e) => { - e.domEvent.stopPropagation() - }, - }, - ], - }} - > - - - - - - - - -
{ - setSelectedRowKeys(selectedRowKeys) - }, - }} - className="ph-no-capture" - columns={columns} - rowKey={"id"} - dataSource={evaluationList} - scroll={{x: true}} - bordered - pagination={false} - onRow={(record) => ({ - style: {cursor: "pointer"}, - onClick: () => {}, - })} - /> - - ) -} - -export default AbTestingEvaluation diff --git a/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx b/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx deleted file mode 100644 index b2500c0892..0000000000 --- a/agenta-web/src/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation.tsx +++ /dev/null @@ -1,241 +0,0 @@ -import {EvaluationType} from "@/lib/enums" -import {calculateResultsDataAvg} from "@/lib/helpers/evaluate" -import {variantNameWithRev} from "@/lib/helpers/variantHelper" -import {JSSTheme, SingleModelEvaluationListTableDataType} from "@/lib/Types" -import {MoreOutlined} from "@ant-design/icons" -import {ArrowsLeftRight, Database, GearSix, Note, Plus, Rocket, Trash} from "@phosphor-icons/react" -import {Button, Dropdown, Space, Statistic, Table} from "antd" -import {ColumnsType} from "antd/es/table" -import {useRouter} from "next/router" -import React from "react" -import {createUseStyles} from "react-jss" - -interface SingleModelEvaluationProps { - evaluationList: SingleModelEvaluationListTableDataType[] - fetchingEvaluations: boolean -} - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - button: { - display: "flex", - alignItems: "center", - }, - stat: { - lineHeight: theme.lineHeight, - "& .ant-statistic-content-value": { - fontSize: theme.fontSize, - color: theme.colorPrimary, - }, - "& .ant-statistic-content-suffix": { - fontSize: theme.fontSize, - color: theme.colorPrimary, - }, - }, -})) - -const SingleModelEvaluation = ({ - evaluationList, - fetchingEvaluations, -}: SingleModelEvaluationProps) => { - const classes = useStyles() - const router = useRouter() - const appId = router.query.app_id as string - - const handleNavigation = (variantName: string, revisionNum: string) => { - router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) - } - - const columns: ColumnsType = [ - { - title: "Variant", - dataIndex: "variants", - key: "variants", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (value, record: SingleModelEvaluationListTableDataType) => { - return ( - - {variantNameWithRev({ - variant_name: value[0].variantName, - revision: record.revisions[0], - })} - - ) - }, - }, - { - title: "Test set", - dataIndex: "testsetName", - key: "testsetName", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (_, record) => { - return {record.testset.name} - }, - }, - { - title: "Average score", - dataIndex: "averageScore", - key: "averageScore", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - render: (_, record) => { - let score = 0 - if (record.scoresData) { - score = - ((record.scoresData.correct?.length || - record.scoresData.true?.length || - 0) / - record.scoresData.nb_of_rows) * - 100 - } else if (record.resultsData) { - const multiplier = { - [EvaluationType.auto_webhook_test]: 100, - [EvaluationType.single_model_test]: 1, - } - score = calculateResultsDataAvg( - record.resultsData, - multiplier[record.evaluationType as keyof typeof multiplier], - ) - score = isNaN(score) ? 
0 : score - } else if (record.avgScore) { - score = record.avgScore * 100 - } - - return ( - - - - ) - }, - }, - { - title: "Created on", - dataIndex: "createdAt", - key: "createdAt", - onHeaderCell: () => ({ - style: {minWidth: 160}, - }), - }, - { - title: , - key: "key", - width: 56, - fixed: "right", - align: "center", - render: (_, record) => { - return ( - , - onClick: (e) => { - e.domEvent.stopPropagation() - router.push( - `/apps/${appId}/annotations/single_model_test/${record.key}`, - ) - }, - }, - { - key: "variant", - label: "View variant", - icon: , - onClick: (e) => { - e.domEvent.stopPropagation() - handleNavigation( - record.variants[0].variantName, - record.revisions[0], - ) - }, - }, - { - key: "view_testset", - label: "View test set", - icon: , - onClick: (e) => { - e.domEvent.stopPropagation() - router.push(`/apps/${appId}/testsets/${record.testset._id}`) - }, - }, - {type: "divider"}, - { - key: "delete_eval", - label: "Delete", - icon: , - danger: true, - onClick: (e) => { - e.domEvent.stopPropagation() - }, - }, - ], - }} - > - - - - - - - - -
({ - style: {cursor: "pointer"}, - onClick: () => {}, - })} - /> - - ) -} - -export default SingleModelEvaluation diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx index b4d3bea986..a9507f3919 100644 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx @@ -1,15 +1,10 @@ -import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEvaluationResult" -import AbTestingEvaluation from "@/components/pages/evaluations/abTestingEvaluation/AbTestingEvaluation" +import AbTestingEvaluation from "@/components/HumanEvaluations/AbTestingEvaluation" import AutoEvaluation from "@/components/pages/evaluations/autoEvaluation/AutoEvaluation" -import SingleModelEvaluation from "@/components/pages/evaluations/singleModelEvaluation/SingleModelEvaluation" +import SingleModelEvaluation from "@/components/HumanEvaluations/SingleModelEvaluation" import {useAppId} from "@/hooks/useAppId" import {useQueryParam} from "@/hooks/useQuery" -import {_Evaluation, JSSTheme, SingleModelEvaluationListTableDataType} from "@/lib/Types" +import {_Evaluation, JSSTheme} from "@/lib/Types" import {fetchAllEvaluations} from "@/services/evaluations/api" -import { - fetchAbTestingEvaluationResult, - fetchSingleModelEvaluationResult, -} from "@/services/human-evaluations/api" import {ChartDonut, ListChecks, TestTube} from "@phosphor-icons/react" import {Tabs, TabsProps, Typography} from "antd" import React, {useEffect, useState} from "react" @@ -44,12 +39,6 @@ const EvaluationsPage = () => { const appId = useAppId() const classes = useStyles() const [autoEvaluationList, setAutoEvaluationList] = useState<_Evaluation[]>([]) - const [singleModelEvalList, setSingleModelEvalList] = useState< - SingleModelEvaluationListTableDataType[] - >([]) - const [abTestingEvalList, setAbTestingEvalList] = useState( - [], - ) const [selectedEvaluation, setSelectedEvaluation] = useQueryParam( "selectedEvaluation", "auto_evaluation", @@ -60,15 +49,10 @@ const EvaluationsPage = () => { if (!appId) return setFetchingEvaluations(true) - Promise.all([ - fetchAllEvaluations(appId), - fetchSingleModelEvaluationResult(appId), - fetchAbTestingEvaluationResult(appId), - ]) - .then(([autoEvalResult, singleModelEvalResult, abTestingEvalResult]) => { + + fetchAllEvaluations(appId) + .then((autoEvalResult) => { setAutoEvaluationList(autoEvalResult) - setSingleModelEvalList(singleModelEvalResult as any) - setAbTestingEvalList(abTestingEvalResult) }) .catch(console.error) .finally(() => setFetchingEvaluations(false)) @@ -90,23 +74,13 @@ const EvaluationsPage = () => { key: "ab_testing_evaluation", label: "A/B Testing Evaluation", icon: , - children: ( - - ), + children: , }, { key: "single_model_evaluation", label: "Single Model Evaluation", icon: , - children: ( - - ), + children: , }, ] diff --git a/agenta-web/src/pages/apps/[app_id]/overview/index.tsx b/agenta-web/src/pages/apps/[app_id]/overview/index.tsx index 7150478950..b4cb7a93f7 100644 --- a/agenta-web/src/pages/apps/[app_id]/overview/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/overview/index.tsx @@ -1,9 +1,9 @@ import DeleteAppModal from "@/components/AppSelector/modals/DeleteAppModal" import EditAppModal from "@/components/AppSelector/modals/EditAppModal" -import AbTestingEvalOverview from "@/components/pages/overview/abTestingEvaluation/AbTestingEvalOverview" +import AbTestingEvaluation from 
"@/components/HumanEvaluations/AbTestingEvaluation" import AutomaticEvalOverview from "@/components/pages/overview/automaticEvaluation/AutomaticEvalOverview" import DeploymentOverview from "@/components/pages/overview/deployments/DeploymentOverview" -import SingleModelEvalOverview from "@/components/pages/overview/singleModelEvaluation/SingleModelEvalOverview" +import SingleModelEvaluation from "@/components/HumanEvaluations/SingleModelEvaluation" import VariantsOverview from "@/components/pages/overview/variants/VariantsOverview" import {useAppsData} from "@/contexts/app.context" import {useAppId} from "@/hooks/useAppId" @@ -165,9 +165,9 @@ export default function Overview() { - + - + {currentApp && ( Date: Mon, 2 Sep 2024 19:08:49 +0100 Subject: [PATCH 070/149] fix(frontend): implemented run variant functionality --- .../ConfigureNewEvaluator/index.tsx | 104 ++++++++++++++++-- .../EvaluatorsModal/EvaluatorsModal.tsx | 4 + 2 files changed, 97 insertions(+), 11 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx index 72db58927b..7164929537 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx @@ -1,5 +1,5 @@ -import {Evaluator, JSSTheme, testset, Variant} from "@/lib/Types" -import {CloseOutlined} from "@ant-design/icons" +import {Evaluator, GenericObject, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" +import {CloseCircleOutlined, CloseOutlined} from "@ant-design/icons" import { ArrowLeft, CaretDoubleLeft, @@ -22,7 +22,7 @@ import { Tooltip, Typography, } from "antd" -import React, {useEffect, useMemo, useState} from "react" +import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import AdvancedSettings from "./AdvancedSettings" import {DynamicFormField} from "./DynamicFormField" @@ -33,9 +33,10 @@ import { updateEvaluatorConfig, } from "@/services/evaluations/api" import {useAppId} from "@/hooks/useAppId" -import {useVariant} from "@/lib/hooks/useVariant" import {useLocalStorage} from "usehooks-ts" import {getAllVariantParameters} from "@/lib/helpers/variantHelper" +import {randString, removeKeys} from "@/lib/helpers/utils" +import {callVariant} from "@/services/api" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> @@ -45,6 +46,9 @@ type ConfigureNewEvaluatorProps = { variants: Variant[] | null testsets: testset[] | null selectedTestcase: Record | null + setSelectedTestcase: React.Dispatch | null>> + setSelectedVariant: React.Dispatch> + selectedVariant: Variant | null } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -88,6 +92,9 @@ const ConfigureNewEvaluator = ({ testsets, onSuccess, selectedTestcase, + setSelectedTestcase, + selectedVariant, + setSelectedVariant, }: ConfigureNewEvaluatorProps) => { const appId = useAppId() const classes = useStyles() @@ -95,7 +102,11 @@ const ConfigureNewEvaluator = ({ const [debugEvaluator, setDebugEvaluator] = useLocalStorage("isDebugSelectionOpen", false) const [openVariantModal, setOpenVariantModal] = useState(false) const [submitLoading, setSubmitLoading] = useState(false) - const [selectedVariant, setSelectedVariant] = useState(null) + const [optInputs, setOptInputs] = useState(null) + const 
[optParams, setOptParams] = useState(null) + const [isChatVariant, setIsChatVariant] = useState(false) + const abortControllersRef = useRef(null) + const [isRunningVariant, setIsRunningVariant] = useState(false) const evalFields = useMemo( () => @@ -138,11 +149,35 @@ const ConfigureNewEvaluator = ({ } useEffect(() => { - if (!selectedVariant) return + if (optInputs && selectedTestcase) { + setSelectedTestcase(() => { + let result: GenericObject = {} + + optInputs.forEach((data) => { + if (selectedTestcase.hasOwnProperty(data.name)) { + result[data.name] = selectedTestcase[data.name] + } + }) + + result["id"] = randString(6) + + return result + }) + } + }, [optInputs]) + + useEffect(() => { + if (!selectedVariant || !selectedTestcase) return const fetchParameters = async () => { try { - const {parameters, inputs} = await getAllVariantParameters(appId, selectedVariant) + const {parameters, inputs, isChatVariant} = await getAllVariantParameters( + appId, + selectedVariant, + ) + setOptInputs(inputs) + setOptParams(parameters) + setIsChatVariant(isChatVariant) } catch (error) { console.error(error) } @@ -151,6 +186,31 @@ const ConfigureNewEvaluator = ({ fetchParameters() }, [selectedVariant]) + const handleRunVariant = async () => { + if (!selectedTestcase || !selectedVariant) return + const controller = new AbortController() + abortControllersRef.current = controller + + try { + setIsRunningVariant(true) + const data = await callVariant( + isChatVariant ? removeKeys(selectedTestcase, ["chat"]) : selectedTestcase, + optInputs || [], + optParams || [], + appId, + selectedVariant.baseId, + isChatVariant ? selectedTestcase.chat || [{}] : [], + controller.signal, + true, + ) + console.log(data) + } catch (error) { + console.error(error) + } finally { + setIsRunningVariant(false) + } + } + return (
@@ -328,10 +388,32 @@ const ConfigureNewEvaluator = ({ Select variant - + {isRunningVariant ? ( + + ) : ( + + )} diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 29d75ed7bc..bd0024fbdc 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -37,6 +37,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [testsets, setTestsets] = useState(null) const [fetchingEvalConfigs, setFetchingEvalConfigs] = useState(false) const [selectedTestcase, setSelectedTestcase] = useState | null>(null) + const [selectedVariant, setSelectedVariant] = useState(null) const evalConfigFetcher = () => { setFetchingEvalConfigs(true) @@ -98,6 +99,9 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setCurrent(0) }} selectedTestcase={selectedTestcase} + setSelectedTestcase={setSelectedTestcase} + selectedVariant={selectedVariant} + setSelectedVariant={setSelectedVariant} /> ), }) From 528d2235ad3b542386404af48d9dffebc4be49f0 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 2 Sep 2024 19:34:10 +0100 Subject: [PATCH 071/149] refactor(frontend): rename directory to improve clarity and context --- .../AdvancedSettings.tsx | 0 .../DynamicFormField.tsx | 0 .../EvaluatorVariantModal.tsx | 0 .../index.tsx | 0 .../EvaluatorCard.tsx | 0 .../EvaluatorList.tsx | 0 .../{ConfigureEvaluators => Evaluators}/index.tsx | 0 .../EvaluatorsModal/EvaluatorsModal.tsx | 12 ++++++------ .../NewEvaluatorCard.tsx} | 0 .../NewEvaluatorList.tsx} | 0 .../{CreateNewEvaluator => NewEvaluator}/index.tsx | 8 ++++---- 11 files changed, 10 insertions(+), 10 deletions(-) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureNewEvaluator => ConfigureEvaluator}/AdvancedSettings.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureNewEvaluator => ConfigureEvaluator}/DynamicFormField.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureNewEvaluator => ConfigureEvaluator}/EvaluatorVariantModal.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureNewEvaluator => ConfigureEvaluator}/index.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureEvaluators => Evaluators}/EvaluatorCard.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureEvaluators => Evaluators}/EvaluatorList.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{ConfigureEvaluators => Evaluators}/index.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{CreateNewEvaluator/CreateEvaluatorCard.tsx => NewEvaluator/NewEvaluatorCard.tsx} (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{CreateNewEvaluator/CreateEvaluatorList.tsx => NewEvaluator/NewEvaluatorList.tsx} (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/{CreateNewEvaluator => NewEvaluator}/index.tsx (97%) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/AdvancedSettings.tsx 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/AdvancedSettings.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/DynamicFormField.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/EvaluatorVariantModal.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/EvaluatorVariantModal.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/EvaluatorVariantModal.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureNewEvaluator/index.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorCard.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/EvaluatorList.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluators/index.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index bd0024fbdc..754b596a61 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -6,12 +6,12 @@ import {Modal} from "antd" import {useAtom} from "jotai" import React, {useEffect, useState} from "react" import {createUseStyles} from "react-jss" -import ConfigureEvaluators from "./ConfigureEvaluators" -import CreateNewEvaluator from "./CreateNewEvaluator" -import ConfigureNewEvaluator from "./ConfigureNewEvaluator" import {fetchVariants} from "@/services/api" import {fetchTestsets} from "@/services/testsets/api" import TestcaseTab from "./TestcaseTab/TestcaseTab" +import ConfigureEvaluator from "./ConfigureEvaluator" +import NewEvaluator from "./NewEvaluator" +import Evaluators from "./Evaluators" type EvaluatorsModalProps = {} & React.ComponentProps @@ -64,7 +64,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const steps = [ { content: ( - props.onCancel?.({} as any)} setCurrent={setCurrent} @@ -75,7 +75,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { }, { content: ( - props.onCancel?.({} as any)} @@ -88,7 +88,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { if (selectedEvaluator) { steps.push({ content: ( - props.onCancel?.({} as any)} diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorCard.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorCard.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorCard.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/CreateEvaluatorList.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx similarity index 97% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx index 2b31a706ed..621227b93d 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/CreateNewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx @@ -4,8 +4,8 @@ import {ArrowLeft, Cards, Table} from "@phosphor-icons/react" import {Button, Divider, Flex, Input, Radio, Space, Typography} from "antd" import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" -import CreateEvaluatorList from "./CreateEvaluatorList" -import CreateEvaluatorCard from 
"./CreateEvaluatorCard" +import NewEvaluatorList from "./NewEvaluatorList" +import NewEvaluatorCard from "./NewEvaluatorCard" type CreateNewEvaluatorProps = { setCurrent: React.Dispatch> @@ -145,13 +145,13 @@ const CreateNewEvaluator = ({
{evaluatorsDisplay === "list" ? ( - ) : ( - Date: Tue, 3 Sep 2024 12:37:26 +0600 Subject: [PATCH 072/149] refactor(frontend): removed unused codes --- .../Evaluations/AutomaticEvaluationResult.tsx | 286 -------------- .../Evaluations/HumanEvaluationResult.tsx | 373 ------------------ .../HumanEvaluationModal.tsx | 15 +- .../HumanEvaluations/AbTestingEvaluation.tsx | 4 +- .../SingleModelEvaluation.tsx | 1 + agenta-web/src/lib/Types.ts | 32 ++ agenta-web/src/lib/helpers/evaluate.ts | 2 +- .../annotations/human_a_b_testing.tsx | 21 - .../annotations/single_model_test.tsx | 21 - 9 files changed, 50 insertions(+), 705 deletions(-) delete mode 100644 agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx delete mode 100644 agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/human_a_b_testing.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/annotations/single_model_test.tsx diff --git a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx b/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx deleted file mode 100644 index 2b6ef8ef5c..0000000000 --- a/agenta-web/src/components/Evaluations/AutomaticEvaluationResult.tsx +++ /dev/null @@ -1,286 +0,0 @@ -import { - deleteEvaluations, - fetchEvaluationResults, - fetchAllLoadEvaluations, -} from "@/services/human-evaluations/api" -import {Button, Spin, Statistic, Table, Typography} from "antd" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {ColumnsType} from "antd/es/table" -import {Evaluation, SingleModelEvaluationListTableDataType, StyleProps} from "@/lib/Types" -import {DeleteOutlined} from "@ant-design/icons" -import {EvaluationFlow, EvaluationType} from "@/lib/enums" -import {createUseStyles} from "react-jss" -import {useAppTheme} from "../Layout/ThemeContextProvider" -import {calculateResultsDataAvg} from "@/lib/helpers/evaluate" -import { - fromEvaluationResponseToEvaluation, - singleModelTestEvaluationTransformer, -} from "@/lib/transformers" -import {variantNameWithRev} from "@/lib/helpers/variantHelper" - -const useStyles = createUseStyles({ - container: { - marginBottom: 20, - }, - collapse: ({themeMode}: StyleProps) => ({ - margin: "10px 0", - "& .ant-collapse-header": { - alignItems: "center !important", - padding: "0px 20px !important", - borderTopLeftRadius: "10px !important", - borderTopRightRadius: "10px !important", - background: themeMode === "dark" ? 
"#1d1d1d" : "#f8f8f8", - }, - }), - stat: { - "& .ant-statistic-content-value": { - fontSize: 20, - color: "#1677ff", - }, - "& .ant-statistic-content-suffix": { - fontSize: 20, - color: "#1677ff", - }, - }, - btnContainer: { - display: "flex", - alignItems: "center", - justifyContent: "flex-end", - margin: "20px 0", - gap: 10, - "& svg": { - color: "red", - }, - }, -}) - -const {Title} = Typography -interface AutomaticEvaluationResultProps { - setIsEvalModalOpen: React.Dispatch> -} -export default function AutomaticEvaluationResult({ - setIsEvalModalOpen, -}: AutomaticEvaluationResultProps) { - const router = useRouter() - const [evaluationsList, setEvaluationsList] = useState< - SingleModelEvaluationListTableDataType[] - >([]) - const [selectedRowKeys, setSelectedRowKeys] = useState([]) - const [selectionType] = useState<"checkbox" | "radio">("checkbox") - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) - const app_id = router.query.app_id?.toString() || "" - const [fetchingEvaluations, setFetchingEvaluations] = useState(false) - - useEffect(() => { - if (!app_id) { - return - } - - const fetchEvaluations = async () => { - try { - setFetchingEvaluations(true) - const evals: Evaluation[] = (await fetchAllLoadEvaluations(app_id)).map( - fromEvaluationResponseToEvaluation, - ) - const results = await Promise.all(evals.map((e) => fetchEvaluationResults(e.id))) - const newEvals = results.map((result, ix) => { - const item = evals[ix] - if ([EvaluationType.single_model_test].includes(item.evaluationType)) { - return singleModelTestEvaluationTransformer({item, result}) - } - }) - - setEvaluationsList( - newEvals - .filter((evaluation) => evaluation !== undefined) - .filter( - (item: any) => - item.resultsData !== undefined || - !(Object.keys(item.scoresData || {}).length === 0) || - item.avgScore !== undefined, - ) as any, - ) - } catch (error) { - console.error(error) - } finally { - setFetchingEvaluations(false) - } - } - - fetchEvaluations() - }, [app_id]) - - const handleNavigation = (variantName: string, revisionNum: string) => { - router.push(`/apps/${app_id}/playground?variant=${variantName}&revision=${revisionNum}`) - } - - const onCompleteEvaluation = (evaluation: any) => { - // TODO: improve type - const evaluationType = - EvaluationType[evaluation.evaluationType as keyof typeof EvaluationType] - - if (evaluationType === EvaluationType.single_model_test) { - router.push(`/apps/${app_id}/annotations/single_model_test/${evaluation.key}`) - } - } - - const columns: ColumnsType = [ - { - title: "Variant", - dataIndex: "variants", - key: "variants", - render: (value, record: SingleModelEvaluationListTableDataType) => { - return ( -
handleNavigation(value[0].variantName, record.revisions[0])} - style={{cursor: "pointer"}} - > - - {variantNameWithRev({ - variant_name: value[0].variantName, - revision: record.revisions[0], - })} - -
- ) - }, - }, - { - title: "Test set", - dataIndex: "testsetName", - key: "testsetName", - render: (value: any, record: SingleModelEvaluationListTableDataType, index: number) => { - return {record.testset.name} - }, - }, - { - title: "Average score", - dataIndex: "averageScore", - key: "averageScore", - render: (value: any, record: SingleModelEvaluationListTableDataType, index: number) => { - let score = 0 - if (record.scoresData) { - score = - ((record.scoresData.correct?.length || - record.scoresData.true?.length || - 0) / - record.scoresData.nb_of_rows) * - 100 - } else if (record.resultsData) { - const multiplier = { - [EvaluationType.auto_webhook_test]: 100, - [EvaluationType.single_model_test]: 1, - } - score = calculateResultsDataAvg( - record.resultsData, - multiplier[record.evaluationType as keyof typeof multiplier], - ) - score = isNaN(score) ? 0 : score - } else if (record.avgScore) { - score = record.avgScore * 100 - } - - return ( - - - - ) - }, - }, - { - title: "Created at", - dataIndex: "createdAt", - key: "createdAt", - width: "300", - }, - { - title: "Action", - dataIndex: "action", - key: "action", - render: (value: any, record: SingleModelEvaluationListTableDataType, index: number) => { - let actionText = "View evaluation" - if (record.status !== EvaluationFlow.EVALUATION_FINISHED) { - actionText = "Continue evaluation" - } - return ( -
- -
- ) - }, - }, - ] - - const rowSelection = { - onChange: ( - selectedRowKeys: React.Key[], - selectedRows: SingleModelEvaluationListTableDataType[], - ) => { - setSelectedRowKeys(selectedRowKeys) - }, - } - - const onDelete = async () => { - const evaluationsIds = selectedRowKeys.map((key) => key.toString()) - try { - await deleteEvaluations(evaluationsIds) - setEvaluationsList((prevEvaluationsList) => - prevEvaluationsList.filter( - (evaluation) => !evaluationsIds.includes(evaluation.key), - ), - ) - - setSelectedRowKeys([]) - } catch (error) { - console.error(error) - } - } - - return ( -
-
- - -
- -
- Single Model Test Results -
- - -
- - - ) -} diff --git a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx b/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx deleted file mode 100644 index 87720a4387..0000000000 --- a/agenta-web/src/components/Evaluations/HumanEvaluationResult.tsx +++ /dev/null @@ -1,373 +0,0 @@ -import { - deleteEvaluations, - fetchAllLoadEvaluations, - fetchEvaluationResults, -} from "@/services/human-evaluations/api" -import {Button, Spin, Statistic, Table, Typography} from "antd" -import {useRouter} from "next/router" -import {useEffect, useState} from "react" -import {ColumnsType} from "antd/es/table" -import {EvaluationResponseType, StyleProps} from "@/lib/Types" -import {DeleteOutlined} from "@ant-design/icons" -import {EvaluationFlow, EvaluationType} from "@/lib/enums" -import {createUseStyles} from "react-jss" -import {useAppTheme} from "../Layout/ThemeContextProvider" -import {getVotesPercentage} from "@/lib/helpers/evaluate" -import {isDemo} from "@/lib/helpers/utils" -import {variantNameWithRev} from "@/lib/helpers/variantHelper" -import {abTestingEvaluationTransformer} from "@/lib/transformers" - -interface VariantVotesData { - number_of_votes: number - percentage: number -} - -export interface HumanEvaluationListTableDataType { - key: string - variants: string[] - testset: { - _id: string - name: string - } - evaluationType: string - status: EvaluationFlow - votesData: { - nb_of_rows: number - variants: string[] - flag_votes: { - number_of_votes: number - percentage: number - } - positive_votes: { - number_of_votes: number - percentage: number - } - variants_votes_data: Record - } - createdAt: string - revisions: string[] - variant_revision_ids: string[] - variantNames: string[] -} - -const useStyles = createUseStyles({ - container: { - marginBottom: 20, - }, - collapse: ({themeMode}: StyleProps) => ({ - margin: "10px 0", - "& .ant-collapse-header": { - alignItems: "center !important", - padding: "0px 20px !important", - borderTopLeftRadius: "10px !important", - borderTopRightRadius: "10px !important", - background: themeMode === "dark" ? 
"#1d1d1d" : "#f8f8f8", - }, - }), - statFlag: { - "& .ant-statistic-content-value": { - fontSize: 20, - color: "#cf1322", - }, - "& .ant-statistic-content-suffix": { - fontSize: 20, - color: "#cf1322", - }, - }, - stat: { - "& .ant-statistic-content-value": { - fontSize: 20, - color: "#1677ff", - }, - "& .ant-statistic-content-suffix": { - fontSize: 20, - color: "#1677ff", - }, - }, - statGood: { - "& .ant-statistic-content-value": { - fontSize: 20, - color: "#3f8600", - }, - "& .ant-statistic-content-suffix": { - fontSize: 20, - color: "#3f8600", - }, - }, - btnContainer: { - display: "flex", - alignItems: "center", - justifyContent: "flex-end", - margin: "20px 0", - gap: 10, - "& svg": { - color: "red", - }, - }, -}) - -const {Title} = Typography - -interface HumanEvaluationResultProps { - setIsEvalModalOpen: React.Dispatch> -} - -export default function HumanEvaluationResult({setIsEvalModalOpen}: HumanEvaluationResultProps) { - const router = useRouter() - const [evaluationsList, setEvaluationsList] = useState([]) - const [selectedRowKeys, setSelectedRowKeys] = useState([]) - const [selectionType] = useState<"checkbox" | "radio">("checkbox") - const {appTheme} = useAppTheme() - const classes = useStyles({themeMode: appTheme} as StyleProps) - const app_id = router.query.app_id?.toString() || "" - const [fetchingEvaluations, setFetchingEvaluations] = useState(false) - - useEffect(() => { - if (!app_id) { - return - } - const fetchEvaluations = async () => { - try { - setFetchingEvaluations(true) - fetchAllLoadEvaluations(app_id) - .then((response) => { - const fetchPromises = response.map((item: EvaluationResponseType) => { - return fetchEvaluationResults(item.id) - .then((results) => { - if (item.evaluation_type === EvaluationType.human_a_b_testing) { - if (Object.keys(results.votes_data).length > 0) { - return abTestingEvaluationTransformer({item, results}) - } - } - }) - .catch((err) => console.error(err)) - }) - Promise.all(fetchPromises) - .then((evaluations) => { - const validEvaluations = evaluations.filter( - (evaluation) => evaluation !== undefined, - ) - setEvaluationsList(validEvaluations) - }) - .catch((err) => console.error(err)) - }) - .catch((err) => console.error(err)) - .finally(() => setFetchingEvaluations(false)) - } catch (error) { - console.error(error) - } - } - - fetchEvaluations() - }, [app_id]) - - const onCompleteEvaluation = (evaluation: any) => { - // TODO: improve type - const evaluationType = - EvaluationType[evaluation.evaluationType as keyof typeof EvaluationType] - - if (evaluationType === EvaluationType.human_a_b_testing) { - router.push(`/apps/${app_id}/annotations/human_a_b_testing/${evaluation.key}`) - } - } - - const handleNavigation = (variantName: string, revisionNum: string) => { - router.push(`/apps/${app_id}/playground?variant=${variantName}&revision=${revisionNum}`) - } - - const columns: ColumnsType = [ - { - title: "Test set", - dataIndex: "testsetName", - key: "testsetName", - render: (_, record: HumanEvaluationListTableDataType, index: number) => { - return {record.testset.name} - }, - }, - { - title: "Variant 1", - dataIndex: "variantNames", - key: "variant1", - render: (value, record) => { - const percentage = getVotesPercentage(record, 0) - return ( -
- -
handleNavigation(value[0], record.revisions[0])} - > - ( - {variantNameWithRev({ - variant_name: value[0], - revision: record.revisions[0], - })} - ) -
-
- ) - }, - }, - { - title: "Variant 2", - dataIndex: "variantNames", - key: "variant2", - render: (value, record) => { - const percentage = getVotesPercentage(record, 1) - return ( -
- -
handleNavigation(value[1], record.revisions[1])} - > - ( - {variantNameWithRev({ - variant_name: value[1], - revision: record.revisions[1], - })} - ) -
-
- ) - }, - }, - { - title: "Both are good", - dataIndex: "positive", - key: "positive", - render: (value: any, record: HumanEvaluationListTableDataType) => { - let percentage = record.votesData.positive_votes.percentage - return ( - - - - ) - }, - }, - { - title: "Flag", - dataIndex: "flag", - key: "flag", - render: (value: any, record: HumanEvaluationListTableDataType) => { - let percentage = record.votesData.flag_votes.percentage - return ( - - - - ) - }, - }, - ] - - if (isDemo()) { - columns.push({ - title: "User", - dataIndex: ["user", "username"], - key: "username", - }) - } - - columns.push( - ...[ - { - title: "Created at", - dataIndex: "createdAt", - key: "createdAt", - width: "300", - }, - { - title: "Action", - dataIndex: "action", - key: "action", - render: (value: any, record: HumanEvaluationListTableDataType, index: number) => { - let actionText = "View evaluation" - if (record.status !== EvaluationFlow.EVALUATION_FINISHED) { - actionText = "Continue evaluation" - } - return ( -
- -
- ) - }, - }, - ], - ) - - const rowSelection = { - onChange: (selectedRowKeys: React.Key[]) => { - setSelectedRowKeys(selectedRowKeys) - }, - } - - const onDelete = async () => { - const evaluationsIds = selectedRowKeys.map((key) => key.toString()) - try { - await deleteEvaluations(evaluationsIds) - setEvaluationsList((prevEvaluationsList) => - prevEvaluationsList.filter( - (evaluation) => !evaluationsIds.includes(evaluation.key), - ), - ) - - setSelectedRowKeys([]) - } catch {} - } - - return ( -
-
- - -
- -
- A/B Test Results -
- - -
- - - ) -} diff --git a/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx b/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx index 814a9713d0..5e8f6a0667 100644 --- a/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx +++ b/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx @@ -111,6 +111,11 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ alignItems: "center", justifyContent: "space-between", }, + dropdownItemLabels: { + fontSize: theme.fontSizeSM, + lineHeight: theme.lineHeightSM, + color: theme.colorTextDescription, + }, })) interface HumanEvaluationModalProps { @@ -278,7 +283,15 @@ const HumanEvaluationModal = ({ filteredVariants.push({ label: ( <> -
{variant.variantName}
+
+ {variant.variantName} + + #{variant.variantId.split("-")[0]} + +
), key: label, diff --git a/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx index a7e71e13eb..b774a55c64 100644 --- a/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx +++ b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx @@ -1,5 +1,5 @@ import DeleteEvaluationModal from "@/components/DeleteEvaluationModal/DeleteEvaluationModal" -import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEvaluationResult" +import {HumanEvaluationListTableDataType, JSSTheme} from "@/lib/Types" import HumanEvaluationModal from "@/components/HumanEvaluationModal/HumanEvaluationModal" import {EvaluationType} from "@/lib/enums" import {getColorFromStr} from "@/lib/helpers/colors" @@ -7,7 +7,6 @@ import {getVotesPercentage} from "@/lib/helpers/evaluate" import {getInitials, isDemo} from "@/lib/helpers/utils" import {variantNameWithRev} from "@/lib/helpers/variantHelper" import {abTestingEvaluationTransformer} from "@/lib/transformers" -import {JSSTheme} from "@/lib/Types" import { deleteEvaluations, fetchAllLoadEvaluations, @@ -435,6 +434,7 @@ const AbTestingEvaluation = ({viewType}: {viewType: "evaluation" | "overview"}) icon={} className={classes.button} onClick={() => setIsEvalModalOpen(true)} + data-cy="new-human-eval-modal-button" > Start new evaluation diff --git a/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx index ee22eb4655..9f9a4bbaf4 100644 --- a/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx +++ b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx @@ -331,6 +331,7 @@ const SingleModelEvaluation = ({viewType}: {viewType: "evaluation" | "overview"} icon={} className={classes.button} onClick={() => setIsEvalModalOpen(true)} + data-cy="new-human-eval-modal-button" > Start new evaluation diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 127643d793..e67dc7c920 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -621,3 +621,35 @@ export interface TraceSpanTreeNode { key: string children?: TraceSpanTreeNode[] } + +interface VariantVotesData { + number_of_votes: number + percentage: number +} +export interface HumanEvaluationListTableDataType { + key: string + variants: string[] + testset: { + _id: string + name: string + } + evaluationType: string + status: EvaluationFlow + votesData: { + nb_of_rows: number + variants: string[] + flag_votes: { + number_of_votes: number + percentage: number + } + positive_votes: { + number_of_votes: number + percentage: number + } + variants_votes_data: Record + } + createdAt: string + revisions: string[] + variant_revision_ids: string[] + variantNames: string[] +} diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index 87b5b48f4b..2a1af7ee59 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -1,4 +1,4 @@ -import {HumanEvaluationListTableDataType} from "@/components/Evaluations/HumanEvaluationResult" +import {HumanEvaluationListTableDataType} from "@/lib/Types" import { Evaluation, GenericObject, diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/human_a_b_testing.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/human_a_b_testing.tsx deleted file mode 100644 index ea60e0a9af..0000000000 --- 
a/agenta-web/src/pages/apps/[app_id]/annotations/human_a_b_testing.tsx +++ /dev/null @@ -1,21 +0,0 @@ -import HumanEvaluationResult from "@/components/Evaluations/HumanEvaluationResult" -import HumanEvaluationModal from "@/components/HumanEvaluationModal/HumanEvaluationModal" -import React, {useState} from "react" - -const HumanABTestingEvaluation = () => { - const [isEvalModalOpen, setIsEvalModalOpen] = useState(false) - - return ( - <> - - - - - ) -} - -export default HumanABTestingEvaluation diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/single_model_test.tsx b/agenta-web/src/pages/apps/[app_id]/annotations/single_model_test.tsx deleted file mode 100644 index 9487da1bc3..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/annotations/single_model_test.tsx +++ /dev/null @@ -1,21 +0,0 @@ -import React, {useState} from "react" -import AutomaticEvaluationResult from "@/components/Evaluations/AutomaticEvaluationResult" -import HumanEvaluationModal from "@/components/HumanEvaluationModal/HumanEvaluationModal" - -const SingleModelTestEvaluation = () => { - const [isEvalModalOpen, setIsEvalModalOpen] = useState(false) - - return ( - <> - - - - - ) -} - -export default SingleModelTestEvaluation From 0a02a48c12ba8c80ad30ab7291295dac937f8b42 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Tue, 3 Sep 2024 12:38:24 +0600 Subject: [PATCH 073/149] fix(frontend): failing cypress tests --- .../cypress/e2e/ab-testing-evaluation.cy.ts | 6 ++--- agenta-web/cypress/e2e/app-navigation.cy.ts | 26 +++---------------- .../e2e/single-model-test-evaluation.cy.ts | 12 ++++----- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts b/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts index a46112b76e..c166affcf9 100644 --- a/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts +++ b/agenta-web/cypress/e2e/ab-testing-evaluation.cy.ts @@ -43,9 +43,9 @@ describe("A/B Testing Evaluation workflow", () => { context("When executing the evaluation", () => { it("Should successfully execute the evaluation process", () => { - cy.visit(`/apps/${app_id}/annotations/human_a_b_testing`) - cy.url().should("include", "/annotations/human_a_b_testing") - cy.clickLinkAndWait('[data-cy="new-annotation-modal-button"]') + cy.visit(`/apps/${app_id}/evaluations?selectedEvaluation=ab_testing_evaluation`) + cy.url().should("include", "/evaluations?selectedEvaluation=ab_testing_evaluation") + cy.clickLinkAndWait('[data-cy="new-human-eval-modal-button"]') cy.get(".ant-modal-content").should("exist") cy.get('[data-cy="variants-dropdown-0"]').trigger("mouseover") diff --git a/agenta-web/cypress/e2e/app-navigation.cy.ts b/agenta-web/cypress/e2e/app-navigation.cy.ts index a5d3127501..288918fa75 100644 --- a/agenta-web/cypress/e2e/app-navigation.cy.ts +++ b/agenta-web/cypress/e2e/app-navigation.cy.ts @@ -28,28 +28,10 @@ describe("App Navigation without errors", () => { cy.get('[data-cy="app-testset-list"]').should("exist") }) - it("should navigate successfully to Automatic Evaluation results evaluators page", () => { - cy.clickLinkAndWait('[data-cy="app-auto-evaluations-link"]') - cy.clickLinkAndWait('[data-cy="app-evaluators-link"]') - cy.url().should("include", "/evaluations/new-evaluator") - }) - - it("should navigate successfully to Automatic Evaluation results page", () => { - cy.clickLinkAndWait('[data-cy="app-auto-evaluations-link"]') - cy.clickLinkAndWait('[data-cy="app-evaluations-results-link"]') - cy.url().should("include", "/evaluations/results") - }) - - 
it("should navigate successfully to A/B Test page", () => { - cy.clickLinkAndWait('[data-cy="app-human-evaluations-link"]') - cy.clickLinkAndWait('[data-cy="app-human-ab-testing-link"]') - cy.location("pathname").should("include", "/annotations/human_a_b_testing") - }) - - it("should navigate successfully to Single Model Test page", () => { - cy.clickLinkAndWait('[data-cy="app-human-evaluations-link"]') - cy.clickLinkAndWait('[data-cy="app-single-model-test-link"]') - cy.location("pathname").should("include", "/annotations/single_model_test") + it("should navigate successfully to Evaluations page", () => { + cy.clickLinkAndWait('[data-cy="app-evaluations-link"]') + cy.url().should("include", "/evaluations") + cy.contains(/evaluations/i) }) if (isDemo()) { diff --git a/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts b/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts index f6a9a6070a..37af94e625 100644 --- a/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts +++ b/agenta-web/cypress/e2e/single-model-test-evaluation.cy.ts @@ -16,9 +16,9 @@ describe("Single Model Test workflow", () => { context("When executing the evaluation", () => { it("Should successfully execute the evaluation process", () => { - cy.visit(`/apps/${app_id}/annotations/single_model_test`) - cy.url().should("include", "/annotations/single_model_test") - cy.clickLinkAndWait('[data-cy="new-annotation-modal-button"]') + cy.visit(`/apps/${app_id}/evaluations?selectedEvaluation=single_model_evaluation`) + cy.url().should("include", "/evaluations?selectedEvaluation=single_model_evaluation") + cy.clickLinkAndWait('[data-cy="new-human-eval-modal-button"]') cy.get(".ant-modal-content").should("exist") @@ -49,10 +49,10 @@ describe("Single Model Test workflow", () => { }) it("Should modify the evaluation vote scores", () => { - cy.visit(`/apps/${app_id}/annotations/single_model_test`) - cy.url().should("include", "/annotations/single_model_test") + cy.visit(`/apps/${app_id}/evaluations?selectedEvaluation=single_model_evaluation`) + cy.url().should("include", "/evaluations?selectedEvaluation=single_model_evaluation") cy.wait(1000) - cy.clickLinkAndWait('[data-cy="single-model-view-evaluation-button"]') + cy.clickLinkAndWait(".ant-table-row").eq(0) cy.get('[data-cy="evalInstructionsShown-ok-btn"]').click() cy.get('[data-cy="evaluation-vote-panel-numeric-vote-input"]').clear() cy.get('[data-cy="evaluation-vote-panel-numeric-vote-input"]').type("85") From e91e37bc70a81ad1b4c59882e4a26cf5dc5d0a0e Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 3 Sep 2024 09:50:45 +0100 Subject: [PATCH 074/149] fix: invalid import in evaluation router and improved editor --- .../routers/evaluation_router.py | 3 - .../ConfigureEvaluator/index.tsx | 76 ++++++++++++++++--- 2 files changed, 65 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index efe59e84e4..f25c5d1818 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -15,9 +15,6 @@ NewEvaluation, DeleteEvaluation, ) -from agenta_backend.services.evaluator_manager import ( - check_ai_critique_inputs, -) from agenta_backend.services import evaluation_service, db_manager, app_manager if isCloudEE(): diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 7164929537..9ec346e29f 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -35,8 +35,12 @@ import { import {useAppId} from "@/hooks/useAppId" import {useLocalStorage} from "usehooks-ts" import {getAllVariantParameters} from "@/lib/helpers/variantHelper" -import {randString, removeKeys} from "@/lib/helpers/utils" +import {getStringOrJson, randString, removeKeys} from "@/lib/helpers/utils" import {callVariant} from "@/services/api" +import {Editor} from "@monaco-editor/react" +import {useAppTheme} from "@/components/Layout/ThemeContextProvider" +import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" +import {formatCurrency, formatLatency} from "@/lib/helpers/formatters" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> @@ -82,6 +86,11 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ lineHeight: theme.lineHeight, fontWeight: theme.fontWeightMedium, }, + editor: { + border: `1px solid ${theme.colorBorder}`, + borderRadius: theme.borderRadius, + overflow: "hidden", + }, })) const ConfigureNewEvaluator = ({ @@ -98,6 +107,7 @@ const ConfigureNewEvaluator = ({ }: ConfigureNewEvaluatorProps) => { const appId = useAppId() const classes = useStyles() + const {appTheme} = useAppTheme() const [form] = Form.useForm() const [debugEvaluator, setDebugEvaluator] = useLocalStorage("isDebugSelectionOpen", false) const [openVariantModal, setOpenVariantModal] = useState(false) @@ -107,6 +117,7 @@ const ConfigureNewEvaluator = ({ const [isChatVariant, setIsChatVariant] = useState(false) const abortControllersRef = useRef(null) const [isRunningVariant, setIsRunningVariant] = useState(false) + const [variantResult, setVariantResult] = useState("") const evalFields = useMemo( () => @@ -193,7 +204,7 @@ const ConfigureNewEvaluator = ({ try { setIsRunningVariant(true) - const data = await callVariant( + const result = await callVariant( isChatVariant ? removeKeys(selectedTestcase, ["chat"]) : selectedTestcase, optInputs || [], optParams || [], @@ -203,9 +214,30 @@ const ConfigureNewEvaluator = ({ controller.signal, true, ) - console.log(data) - } catch (error) { - console.error(error) + + if (typeof result === "string") { + setVariantResult(getStringOrJson({data: result})) + } else if (isFuncResponse(result)) { + setVariantResult(getStringOrJson(result)) + } else if (isBaseResponse(result)) { + const {trace, data} = result + setVariantResult( + getStringOrJson({ + ...(typeof data === "string" ? {message: data} : data), + cost: formatCurrency(trace?.cost), + usage: trace?.usage, + latency: formatLatency(trace?.latency), + }), + ) + } else { + console.error("Unknown response type:", result) + } + } catch (error: any) { + if (!controller.signal.aborted) { + console.error(error) + message.error(error.message) + setVariantResult("") + } } finally { setIsRunningVariant(false) } @@ -421,10 +453,30 @@ const ConfigureNewEvaluator = ({ JSON - + + + +
+ + App Output + +
-
+
Output @@ -434,10 +486,12 @@ const ConfigureNewEvaluator = ({ -
From 959e7735d277db720a170596472a3942c8337557 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Tue, 3 Sep 2024 17:31:26 +0600 Subject: [PATCH 075/149] test(frontend): tests for eval tabs --- agenta-web/cypress/e2e/app-navigation.cy.ts | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/agenta-web/cypress/e2e/app-navigation.cy.ts b/agenta-web/cypress/e2e/app-navigation.cy.ts index 288918fa75..18f17c7eb4 100644 --- a/agenta-web/cypress/e2e/app-navigation.cy.ts +++ b/agenta-web/cypress/e2e/app-navigation.cy.ts @@ -32,6 +32,15 @@ describe("App Navigation without errors", () => { cy.clickLinkAndWait('[data-cy="app-evaluations-link"]') cy.url().should("include", "/evaluations") cy.contains(/evaluations/i) + + cy.get(".ant-tabs-tab").eq(1).click() + cy.url().should("include", "/evaluations?selectedEvaluation=ab_testing_evaluation") + + cy.get(".ant-tabs-tab").eq(2).click() + cy.url().should("include", "/evaluations?selectedEvaluation=single_model_evaluation") + + cy.get(".ant-tabs-tab").eq(0).click() + cy.url().should("include", "/evaluations?selectedEvaluation=auto_evaluation") }) if (isDemo()) { From 538b852b164841988101309215be12b0c3ad4a58 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Tue, 3 Sep 2024 17:51:59 +0600 Subject: [PATCH 076/149] fix(frontend): prettier error --- agenta-web/cypress/e2e/app-navigation.cy.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-web/cypress/e2e/app-navigation.cy.ts b/agenta-web/cypress/e2e/app-navigation.cy.ts index 18f17c7eb4..b72066acf5 100644 --- a/agenta-web/cypress/e2e/app-navigation.cy.ts +++ b/agenta-web/cypress/e2e/app-navigation.cy.ts @@ -35,7 +35,7 @@ describe("App Navigation without errors", () => { cy.get(".ant-tabs-tab").eq(1).click() cy.url().should("include", "/evaluations?selectedEvaluation=ab_testing_evaluation") - + cy.get(".ant-tabs-tab").eq(2).click() cy.url().should("include", "/evaluations?selectedEvaluation=single_model_evaluation") From f9c2c394bda008beb4a7eb181abfaa8257c40d83 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 3 Sep 2024 13:41:44 +0100 Subject: [PATCH 077/149] refactor(frontend): moved annotations to evaluations dir and cleanup --- .../HumanEvaluationModal.tsx | 4 +- .../HumanEvaluations/AbTestingEvaluation.tsx | 6 +- .../SingleModelEvaluation.tsx | 6 +- .../[evaluation_id]/index.tsx | 0 .../[app_id]/evaluations/new-evaluator.tsx | 25 -------- .../apps/[app_id]/evaluations/results.tsx | 25 -------- .../[evaluation_id]/index.tsx | 2 +- .../pages/apps/[app_id]/testsets/index.tsx | 2 +- .../src/services/evaluations/api/index.ts | 57 ------------------- 9 files changed, 10 insertions(+), 117 deletions(-) rename agenta-web/src/pages/apps/[app_id]/{annotations => evaluations}/human_a_b_testing/[evaluation_id]/index.tsx (100%) delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/new-evaluator.tsx delete mode 100644 agenta-web/src/pages/apps/[app_id]/evaluations/results.tsx rename agenta-web/src/pages/apps/[app_id]/{annotations => evaluations}/single_model_test/[evaluation_id]/index.tsx (97%) diff --git a/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx b/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx index 5e8f6a0667..084bf22fbc 100644 --- a/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx +++ b/agenta-web/src/components/HumanEvaluationModal/HumanEvaluationModal.tsx @@ -353,9 +353,9 @@ const HumanEvaluationModal = ({ setVariants(selectedVariants) if (evaluationType === 
EvaluationType.human_a_b_testing) { - router.push(`/apps/${appId}/annotations/human_a_b_testing/${evaluationTableId}`) + router.push(`/apps/${appId}/evaluations/human_a_b_testing/${evaluationTableId}`) } else if (evaluationType === EvaluationType.single_model_test) { - router.push(`/apps/${appId}/annotations/single_model_test/${evaluationTableId}`) + router.push(`/apps/${appId}/evaluations/single_model_test/${evaluationTableId}`) } } diff --git a/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx index b774a55c64..2c8b6b4d4f 100644 --- a/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx +++ b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx @@ -341,7 +341,7 @@ const AbTestingEvaluation = ({viewType}: {viewType: "evaluation" | "overview"}) onClick: (e) => { e.domEvent.stopPropagation() router.push( - `/apps/${appId}/annotations/human_a_b_testing/${record.key}`, + `/apps/${appId}/evaluations/human_a_b_testing/${record.key}`, ) }, }, @@ -414,7 +414,7 @@ const AbTestingEvaluation = ({viewType}: {viewType: "evaluation" | "overview"})
A/B Testing Evaluations - @@ -475,7 +475,7 @@ const AbTestingEvaluation = ({viewType}: {viewType: "evaluation" | "overview"}) style: {cursor: "pointer"}, onClick: () => router.push( - `/apps/${appId}/annotations/human_a_b_testing/${record.key}`, + `/apps/${appId}/evaluations/human_a_b_testing/${record.key}`, ), })} /> diff --git a/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx index 9f9a4bbaf4..b2a2ec26b5 100644 --- a/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx +++ b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx @@ -252,7 +252,7 @@ const SingleModelEvaluation = ({viewType}: {viewType: "evaluation" | "overview"} onClick: (e) => { e.domEvent.stopPropagation() router.push( - `/apps/${appId}/annotations/single_model_test/${record.key}`, + `/apps/${appId}/evaluations/single_model_test/${record.key}`, ) }, }, @@ -311,7 +311,7 @@ const SingleModelEvaluation = ({viewType}: {viewType: "evaluation" | "overview"} Single Model Evaluations - @@ -372,7 +372,7 @@ const SingleModelEvaluation = ({viewType}: {viewType: "evaluation" | "overview"} style: {cursor: "pointer"}, onClick: () => router.push( - `/apps/${appId}/annotations/single_model_test/${record.key}`, + `/apps/${appId}/evaluations/single_model_test/${record.key}`, ), })} /> diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/human_a_b_testing/[evaluation_id]/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/human_a_b_testing/[evaluation_id]/index.tsx similarity index 100% rename from agenta-web/src/pages/apps/[app_id]/annotations/human_a_b_testing/[evaluation_id]/index.tsx rename to agenta-web/src/pages/apps/[app_id]/evaluations/human_a_b_testing/[evaluation_id]/index.tsx diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/new-evaluator.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/new-evaluator.tsx deleted file mode 100644 index 5637267752..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/new-evaluator.tsx +++ /dev/null @@ -1,25 +0,0 @@ -import Evaluators from "@/components/pages/evaluations/evaluators/Evaluators" -import {useAppId} from "@/hooks/useAppId" -import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" -import {useAtom} from "jotai" -import React, {useEffect} from "react" - -const NewEvaluator = () => { - const appId = useAppId() - const setEvaluators = useAtom(evaluatorsAtom)[1] - const setEvaluatorConfigs = useAtom(evaluatorConfigsAtom)[1] - - useEffect(() => { - Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( - ([evaluators, configs]) => { - setEvaluators(evaluators) - setEvaluatorConfigs(configs) - }, - ) - }, [appId]) - - return -} - -export default NewEvaluator diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/results.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/results.tsx deleted file mode 100644 index ae10ff2b50..0000000000 --- a/agenta-web/src/pages/apps/[app_id]/evaluations/results.tsx +++ /dev/null @@ -1,25 +0,0 @@ -import EvaluationResults from "@/components/pages/evaluations/evaluationResults/EvaluationResults" -import {useAppId} from "@/hooks/useAppId" -import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" -import {useAtom} from "jotai" -import React, {useEffect} from "react" 
- -const EvalResults = () => { - const appId = useAppId() - const setEvaluators = useAtom(evaluatorsAtom)[1] - const setEvaluatorConfigs = useAtom(evaluatorConfigsAtom)[1] - - useEffect(() => { - Promise.all([fetchAllEvaluators(), fetchAllEvaluatorConfigs(appId)]).then( - ([evaluators, configs]) => { - setEvaluators(evaluators) - setEvaluatorConfigs(configs) - }, - ) - }, [appId]) - - return -} - -export default EvalResults diff --git a/agenta-web/src/pages/apps/[app_id]/annotations/single_model_test/[evaluation_id]/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/single_model_test/[evaluation_id]/index.tsx similarity index 97% rename from agenta-web/src/pages/apps/[app_id]/annotations/single_model_test/[evaluation_id]/index.tsx rename to agenta-web/src/pages/apps/[app_id]/evaluations/single_model_test/[evaluation_id]/index.tsx index c367c77136..b33bf9a4e5 100644 --- a/agenta-web/src/pages/apps/[app_id]/annotations/single_model_test/[evaluation_id]/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/evaluations/single_model_test/[evaluation_id]/index.tsx @@ -1,4 +1,4 @@ -import {Evaluation, EvaluationScenario, GenericObject} from "@/lib/Types" +import type {Evaluation, EvaluationScenario, GenericObject} from "@/lib/Types" import { fetchLoadEvaluation, fetchAllLoadEvaluationsScenarios, diff --git a/agenta-web/src/pages/apps/[app_id]/testsets/index.tsx b/agenta-web/src/pages/apps/[app_id]/testsets/index.tsx index 71b4adcc36..455829ed73 100644 --- a/agenta-web/src/pages/apps/[app_id]/testsets/index.tsx +++ b/agenta-web/src/pages/apps/[app_id]/testsets/index.tsx @@ -122,7 +122,7 @@ export default function Testsets() { - + diff --git a/agenta-web/src/services/evaluations/api/index.ts b/agenta-web/src/services/evaluations/api/index.ts index aefa0edeb0..e45bd4cc23 100644 --- a/agenta-web/src/services/evaluations/api/index.ts +++ b/agenta-web/src/services/evaluations/api/index.ts @@ -1,9 +1,6 @@ import axios from "@/lib//helpers/axiosConfig" import { - Annotation, - AnnotationScenario, ComparisonResultRow, - EvaluationStatus, Evaluator, EvaluatorConfig, EvaluatorMappingInput, @@ -176,60 +173,6 @@ export const fetchAllEvaluationScenarios = async (evaluationId: string) => { return evaluationScenarios as _EvaluationScenario[] } -//annotations -export const fetchAllAnnotations = async (appId: string) => { - const response = await axios.get(`/api/annotations/`, {params: {app_id: appId}}) - return response.data.map(evaluationTransformer) as Annotation[] -} - -export const fetchAnnotation = async (annotationId: string) => { - const response = await axios.get(`/api/annotations/${annotationId}/`) - return evaluationTransformer(response.data) as unknown as Annotation -} - -export const fetchAnnotationStatus = async (annotationId: string) => { - const response = await axios.get(`/api/annotations/${annotationId}/status/`) - return response.data as {status: EvaluationStatus} -} - -export const createAnnotation = async ( - appId: string, - annotation: Omit & - Pick, -) => { - return axios.post(`/api/annotations/`, {...annotation, app_id: appId}) -} - -export const deleteAnnotations = async (annotationsIds: string[]) => { - return axios.delete(`/api/annotations/`, {data: {annotations_ids: annotationsIds}}) -} - -// Annotation Scenarios -export const fetchAllAnnotationScenarios = async (appId: string, annotationId: string) => { - const [{data: annotationScenarios}, annotation] = await Promise.all([ - axios.get(`/api/annotations/${annotationId}/annotation_scenarios/`, { - params: {app_id: appId}, - }), - 
fetchAnnotation(annotationId), - ]) - - annotationScenarios.forEach((scenario: AnnotationScenario) => { - scenario.annotation = annotation - }) - return annotationScenarios as AnnotationScenario[] -} - -export const updateAnnotationScenario = async ( - annotationId: string, - annotationScenarioId: string, - data: Pick, -) => { - return axios.put( - `/api/annotations/${annotationId}/annotation_scenarios/${annotationScenarioId}`, - data, - ) -} - // Comparison export const fetchAllComparisonResults = async (evaluationIds: string[]) => { const scenarioGroups = await Promise.all(evaluationIds.map(fetchAllEvaluationScenarios)) From 0a7e551fc5bc828a7b44de787883082c24e7ff9b Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Tue, 3 Sep 2024 20:41:06 +0600 Subject: [PATCH 078/149] ui(frontend): automatic eval funcational table --- .../autoEvaluation/AutoEvaluation.tsx | 279 +++++++++++++++++- .../autoEvaluation/EditColumns.tsx | 87 ++++++ .../autoEvaluation/NewEvaluationModel.tsx | 244 +++++++++++++++ .../autoEvaluation/SearchFilter.tsx | 74 +++++ .../pages/apps/[app_id]/evaluations/index.tsx | 1 + 5 files changed, 673 insertions(+), 12 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EditColumns.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index ac586cf58e..31516ee6fb 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -1,7 +1,6 @@ -import {_Evaluation, JSSTheme} from "@/lib/Types" +import {_Evaluation, EvaluationStatus, JSSTheme} from "@/lib/Types" import { ArrowsLeftRight, - Columns, Database, Gauge, GearSix, @@ -10,20 +9,40 @@ import { Rocket, Trash, } from "@phosphor-icons/react" -import {Button, Dropdown, Space, Table} from "antd" -import React, {useState} from "react" +import {Button, Dropdown, DropdownProps, message, Space, Table} from "antd" +import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import {ColumnsType} from "antd/es/table" import {MoreOutlined} from "@ant-design/icons" import EvaluatorsModal from "./EvaluatorsModal/EvaluatorsModal" import {useQueryParam} from "@/hooks/useQuery" import {formatDay} from "@/lib/helpers/dateTimeHelper" -import {getTypedValue} from "@/lib/helpers/evaluate" +import {calcEvalDuration, getTypedValue} from "@/lib/helpers/evaluate" import {variantNameWithRev} from "@/lib/helpers/variantHelper" +import NewEvaluationModal from "./NewEvaluationModel" +import { + deleteEvaluations, + fetchAllEvaluations, + fetchAllEvaluatorConfigs, + fetchAllEvaluators, + fetchEvaluationStatus, +} from "@/services/evaluations/api" +import {useAppId} from "@/hooks/useAppId" +import {useAtom} from "jotai" +import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" +import DeleteEvaluationModal from "@/components/DeleteEvaluationModal/DeleteEvaluationModal" +import {useRouter} from "next/router" +import EditColumns, {generateEditItems} from "./EditColumns" +import StatusRenderer from "../../overview/automaticEvaluation/StatusRenderer" +import {runningStatuses} from "../../evaluations/cellRenderers/cellRenderers" +import 
{useUpdateEffect} from "usehooks-ts" +import {shortPoll} from "@/lib/helpers/utils" +import {getFilterParams} from "./SearchFilter" interface AutoEvaluationProps { evaluationList: _Evaluation[] fetchingEvaluations: boolean + setEvaluationList: React.Dispatch> } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -33,13 +52,119 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationProps) => { +const AutoEvaluation = ({ + evaluationList, + fetchingEvaluations, + setEvaluationList, +}: AutoEvaluationProps) => { const classes = useStyles() + const appId = useAppId() + const router = useRouter() + const [selectedRowKeys, setSelectedRowKeys] = useState([]) const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useQueryParam( "configureEvaluatorModal", "", ) + // create new evaluation + const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) + const [isEvalLoading, setIsEvalLoading] = useState(false) + const [evaluators, setEvaluators] = useAtom(evaluatorsAtom) + const setEvaluatorConfigs = useAtom(evaluatorConfigsAtom)[1] + // delete evaluation + const [selectedEvalRecord, setSelectedEvalRecord] = useState<_Evaluation>() + const [isDeleteEvalModalOpen, setIsDeleteEvalModalOpen] = useState(false) + const [isDeleteEvalMultipleModalOpen, setIsDeleteEvalMultipleModalOpen] = useState(false) + //edit columns + const [editColumns, setEditColumns] = useState([]) + const [isFilterColsDropdownOpen, setIsFilterColsDropdownOpen] = useState(false) + // + const stoppers = useRef() + + const fetchEvaluations = async () => { + try { + setIsEvalLoading(true) + const [allEvaluations, allEvaluators, allEvaluatorConfigs] = await Promise.all([ + fetchAllEvaluations(appId), + fetchAllEvaluators(), + fetchAllEvaluatorConfigs(appId), + ]) + const result = allEvaluations.sort( + (a, b) => + new Date(b.created_at || 0).getTime() - new Date(a.created_at || 0).getTime(), + ) + setEvaluationList(result) + setEvaluators(allEvaluators) + setEvaluatorConfigs(allEvaluatorConfigs) + } catch (error) { + console.error(error) + } finally { + setIsEvalLoading(false) + } + } + + const handleDeleteMultipleEvaluations = async () => { + const evaluationsIds = selectedRowKeys.map((key) => key.toString()) + try { + setIsEvalLoading(true) + await deleteEvaluations(evaluationsIds) + setEvaluationList((prevEvaluationsList) => + prevEvaluationsList.filter((evaluation) => !evaluationsIds.includes(evaluation.id)), + ) + setSelectedRowKeys([]) + message.success("Evaluations Deleted") + } catch (error) { + console.error(error) + } finally { + setIsEvalLoading(false) + } + } + + const handleDeleteEvaluation = async (record: _Evaluation) => { + try { + setIsEvalLoading(true) + await deleteEvaluations([record.id]) + setEvaluationList((prevEvaluationsList) => + prevEvaluationsList.filter((evaluation) => ![record.id].includes(evaluation.id)), + ) + message.success("Evaluation Deleted") + } catch (error) { + console.error(error) + } finally { + setIsEvalLoading(false) + } + } + + const compareDisabled = useMemo(() => { + const evalList = evaluationList.filter((e) => selectedRowKeys.includes(e.id)) + return ( + evalList.length < 2 || + evalList.some( + (item) => + item.status.value === EvaluationStatus.STARTED || + item.status.value === EvaluationStatus.INITIALIZED || + item.testset.id !== evalList[0].testset.id, + ) + ) + }, [selectedRowKeys]) + + const onToggleEvaluatorVisibility = (evalConfigId: string) => { + if 
(!editColumns.includes(evalConfigId)) { + setEditColumns([...editColumns, evalConfigId]) + } else { + setEditColumns(editColumns.filter((item) => item !== evalConfigId)) + } + } + + const handleOpenChangeEditCols: DropdownProps["onOpenChange"] = (nextOpen, info) => { + if (info.source === "trigger" || nextOpen) { + setIsFilterColsDropdownOpen(nextOpen) + } + } + + const handleNavigation = (variantName: string, revisionNum: string) => { + router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) + } const columns: ColumnsType<_Evaluation> = [ { @@ -60,6 +185,7 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro ) }, + ...getFilterParams("variants", "text"), }, { title: "Test set", @@ -71,6 +197,7 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro render: (_, record) => { return {record.testset.name} }, + ...getFilterParams("testset", "text"), }, { title: "Status", @@ -79,9 +206,14 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro onHeaderCell: () => ({ style: {minWidth: 240}, }), + render: (_, record) => { + return + }, + ...getFilterParams("status", "text"), }, { title: "Results", + key: "results", children: [ { title: "Evaluator 1", @@ -119,6 +251,7 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro render: (_, record) => { return formatDay(record.created_at) }, + ...getFilterParams("created_at", "date"), }, { title: "Avg. Latency", @@ -130,6 +263,7 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro render: (_, record) => { return getTypedValue(record.average_latency) }, + ...getFilterParams("average_latency", "number"), }, { title: "Total Cost", @@ -141,6 +275,7 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro render: (_, record) => { return getTypedValue(record.average_cost) }, + ...getFilterParams("total_cost", "number"), }, { title: , @@ -161,6 +296,9 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro icon: , onClick: (e) => { e.domEvent.stopPropagation() + router.push( + `/apps/${appId}/evaluations/results/${record.id}`, + ) }, }, { @@ -169,6 +307,10 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro icon: , onClick: (e) => { e.domEvent.stopPropagation() + handleNavigation( + record.variants[0].variantName, + record.revisions[0], + ) }, }, { @@ -177,6 +319,7 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro icon: , onClick: (e) => { e.domEvent.stopPropagation() + router.push(`/apps/${appId}/testsets/${record.testset.id}`) }, }, {type: "divider"}, @@ -187,6 +330,8 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro danger: true, onClick: (e) => { e.domEvent.stopPropagation() + setSelectedEvalRecord(record) + setIsDeleteEvalModalOpen(true) }, }, ], @@ -204,11 +349,68 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro }, ] + const runningEvaluationIds = useMemo( + () => + evaluationList + .filter((item) => runningStatuses.includes(item.status.value)) + .map((item) => item.id), + [evaluationList], + ) + + useUpdateEffect(() => { + stoppers.current?.() + + if (runningEvaluationIds.length) { + stoppers.current = shortPoll( + () => + Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(id))) + .then((res) => { + setEvaluationList((prev) => { + const newEvals = [...prev] + 
runningEvaluationIds.forEach((id, ix) => { + const index = newEvals.findIndex((e) => e.id === id) + if (index !== -1) { + newEvals[index].status = res[ix].status + newEvals[index].duration = calcEvalDuration(newEvals[index]) + } + }) + if ( + res.some((item) => !runningStatuses.includes(item.status.value)) + ) + fetchEvaluations() + return newEvals + }) + }) + .catch(console.error), + {delayMs: 2000, timeoutMs: Infinity}, + ).stopper + } + + return () => { + stoppers.current?.() + } + }, [JSON.stringify(runningEvaluationIds)]) + + useEffect(() => { + const defaultColumnNames = columns.map((item) => item.key as string) + setEditColumns(defaultColumnNames) + }, []) + + const editedColumns = columns.map((item) => ({ + ...item, + hidden: !editColumns?.includes(item.key as string), + })) + return (
- @@ -232,17 +436,30 @@ const AutoEvaluation = ({evaluationList, fetchingEvaluations}: AutoEvaluationPro type="text" icon={} className={classes.button} + disabled={compareDisabled} + onClick={() => + router.push( + `/apps/${appId}/evaluations/results/compare?evaluations=${selectedRowKeys.join(",")}`, + ) + } > Compare - + { + onToggleEvaluatorVisibility(key) + setIsFilterColsDropdownOpen(true) + }} + />
+ { + setIsConfigEvaluatorModalOpen("open") + setNewEvalModalOpen(false) + }} + onCancel={() => { + setNewEvalModalOpen(false) + }} + onSuccess={() => { + setNewEvalModalOpen(false) + fetchEvaluations() + }} + /> + {isConfigEvaluatorModalOpen === "open" && ( setIsConfigEvaluatorModalOpen("")} /> )} + + {selectedEvalRecord && ( + setIsDeleteEvalModalOpen(false)} + onOk={async () => { + await handleDeleteEvaluation(selectedEvalRecord) + setIsDeleteEvalModalOpen(false) + }} + evaluationType={"automatic evaluation"} + /> + )} + {isDeleteEvalMultipleModalOpen && ( + setIsDeleteEvalMultipleModalOpen(false)} + onOk={async () => { + await handleDeleteMultipleEvaluations() + setIsDeleteEvalMultipleModalOpen(false) + }} + evaluationType={"single model evaluation"} + /> + )} ) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EditColumns.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EditColumns.tsx new file mode 100644 index 0000000000..979b519b02 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EditColumns.tsx @@ -0,0 +1,87 @@ +import {_Evaluation, JSSTheme} from "@/lib/Types" +import {Button, Dropdown, Space, Checkbox} from "antd" +import React from "react" +import {createUseStyles} from "react-jss" +import {Columns} from "@phosphor-icons/react" +import {ColumnsType} from "antd/es/table" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + dropdownMenu: { + "&>.ant-dropdown-menu-item": { + "& .anticon-check": { + display: "none", + }, + }, + "&>.ant-dropdown-menu-item-selected": { + "&:not(:hover)": { + backgroundColor: "transparent !important", + }, + "& .anticon-check": { + display: "inline-flex !important", + }, + }, + }, + button: { + display: "flex", + alignItems: "center", + }, +})) + +export const generateEditItems = (columns: ColumnsType, editColumns: string[]) => { + return columns + .filter((col) => col.key !== "key") + .map((col) => ({ + key: col.key, + label: ( + + + <>{col.title} + + ), + })) +} + +interface EditColumnsProps { + isOpen: boolean + handleOpenChange: ( + open: boolean, + info: { + source: "trigger" | "menu" + }, + ) => void + shownCols: string[] + items: any + onClick: ({key}: {key: string}) => void + buttonText?: string +} + +const EditColumns = ({ + isOpen, + handleOpenChange, + shownCols, + items, + onClick, + buttonText, +}: EditColumnsProps) => { + const classes = useStyles() + + return ( + + + + ) +} + +export default EditColumns diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx new file mode 100644 index 0000000000..8092744ae5 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx @@ -0,0 +1,244 @@ +import React, {useEffect, useState} from "react" +import {useAppId} from "@/hooks/useAppId" +import {JSSTheme, Variant, testset} from "@/lib/Types" +import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" +import {apiKeyObject, redirectIfNoLLMKeys} from "@/lib/helpers/utils" +import {fetchVariants} from "@/services/api" +import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations/api" +import {fetchTestsets} from "@/services/testsets/api" +import {Button, Divider, Form, Modal, Select, Spin} from "antd" +import {useAtom} from "jotai" +import {createUseStyles} from "react-jss" +import {ChartDonut, Plus} from "@phosphor-icons/react" + +const useStyles = createUseStyles((theme: 
JSSTheme) => ({ + spinContainer: { + display: "grid", + placeItems: "center", + height: "100%", + }, + selector: { + width: 300, + }, + configRow: { + display: "flex", + alignItems: "center", + justifyContent: "space-between", + }, + divider: { + margin: "1rem -1.5rem", + width: "unset", + }, + container: { + "& .ant-modal-footer": { + display: "flex", + alignItems: "center", + justifyContent: "flex-end", + }, + }, + modalContainer: { + display: "flex", + alignItems: "center", + }, + selectItemLabels: { + fontSize: theme.fontSizeSM, + lineHeight: theme.lineHeightSM, + color: theme.colorTextDescription, + margin: "0px 5px", + }, +})) + +type Props = { + onSuccess?: () => void + onOpenEvaluatorModal: () => void +} & React.ComponentProps + +const NewEvaluationModal: React.FC = ({onSuccess, onOpenEvaluatorModal, ...props}) => { + const classes = useStyles() + const appId = useAppId() + const [fetching, setFetching] = useState(false) + const [testSets, setTestSets] = useState([]) + const [variants, setVariants] = useState([]) + const [evaluatorConfigs] = useAtom(evaluatorConfigsAtom) + const [evaluators] = useAtom(evaluatorsAtom) + const [submitLoading, setSubmitLoading] = useState(false) + const [form] = Form.useForm() + + useEffect(() => { + setFetching(true) + form.resetFields() + Promise.all([fetchTestsets(appId), fetchVariants(appId)]) + .then(([testSets, variants]) => { + setTestSets(testSets) + setVariants(variants) + }) + .catch(console.error) + .finally(() => setFetching(false)) + }, [props.open, appId]) + + const rateLimitValues = { + batch_size: 10, + max_retries: 3, + retry_delay: 3, + delay_between_batches: 5, + } + const correctAnswerColumn = "correct_answer" + + const onSubmit = (values: CreateEvaluationData) => { + // redirect if no llm keys and an AI Critique config is selected + if ( + values.evaluators_configs.some( + (id) => + evaluatorConfigs.find((config) => config.id === id)?.evaluator_key === + "auto_ai_critique", + ) && + redirectIfNoLLMKeys() + ) + return + setSubmitLoading(true) + createEvalutaiton(appId, { + testset_id: values.testset_id, + variant_ids: values.variant_ids, + evaluators_configs: values.evaluators_configs, + rate_limit: rateLimitValues, + lm_providers_keys: apiKeyObject(), + correct_answer_column: correctAnswerColumn, + }) + .then(onSuccess) + .catch(console.error) + .finally(() => setSubmitLoading(false)) + } + + return ( + , + loading: submitLoading, + className: classes.modalContainer, + }} + className={classes.container} + {...props} + > + + +
+ + + + + + + + + + + + +
+
+ ) +} + +export default NewEvaluationModal diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx new file mode 100644 index 0000000000..cfe24569c7 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx @@ -0,0 +1,74 @@ +import {_Evaluation} from "@/lib/Types" +import {Input, InputRef, TableColumnType, DatePicker} from "antd" +import {FilterDropdownProps} from "antd/es/table/interface" +import React, {useRef} from "react" +import dayjs from "dayjs" + +type DataIndex = keyof _Evaluation + +type CellDataType = "number" | "text" | "date" + +export function getFilterParams(dataIndex: DataIndex, type: CellDataType): TableColumnType<_Evaluation> { + const searchInput = useRef(null) + + const filterDropdown = ({setSelectedKeys, selectedKeys, confirm}: FilterDropdownProps) => { + return ( +
e.stopPropagation()}> + {type === "date" ? ( + { + setSelectedKeys(dateString ? [dateString] : []) + confirm() + }} + /> + ) : ( + { + setSelectedKeys(e.target.value ? [e.target.value] : []) + confirm({closeDropdown: false}) + }} + style={{display: "block"}} + /> + )} +
+ ) }
+
+ const onFilter = (value: any, record: any) => {
+ try {
+ const cellValue = record[dataIndex]
+
+ if (type === "date") {
+ return dayjs(cellValue).isSame(dayjs(value), "day")
+ }
+ if (typeof cellValue === "object" && cellValue !== null) {
+ if (Array.isArray(cellValue)) {
+ return cellValue.some((item) =>
+ item.variantName?.toLowerCase().includes(value.toLowerCase()),
+ )
+ } else if (cellValue.hasOwnProperty("name")) {
+ return cellValue.name.toLowerCase().includes(value.toLowerCase())
+ } else if (cellValue.hasOwnProperty("value")) {
+ return cellValue.value.toLowerCase().includes(value.toLowerCase())
+ }
+ }
+ return cellValue?.toString().toLowerCase().includes(value.toLowerCase())
+ } catch (error) {
+ console.error(error)
+ }
+ }
+
+ return {
+ filterDropdown,
+ onFilter,
+ onFilterDropdownOpenChange: (visible) => {
+ if (visible) {
+ setTimeout(() => searchInput.current?.select(), 100)
+ }
+ },
+ }
+}
diff --git a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx
index b4d3bea986..eb233705d7 100644
--- a/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx
+++ b/agenta-web/src/pages/apps/[app_id]/evaluations/index.tsx
@@ -83,6 +83,7 @@ const EvaluationsPage = () => {
 ),
 },

From 1fe2babd6bf75cc9e840154dd0c4c1a55fcb2600 Mon Sep 17 00:00:00 2001
From: ashrafchowdury
Date: Tue, 3 Sep 2024 21:27:30 +0600
Subject: [PATCH 079/149] fix(frontend): search issue with numbers

---
 .../pages/evaluations/autoEvaluation/SearchFilter.tsx | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx
index cfe24569c7..45d9e51435 100644
--- a/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx
+++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx
@@ -8,7 +8,10 @@ type DataIndex = keyof _Evaluation
 type CellDataType = "number" | "text" | "date"
-export function getFilterParams(dataIndex: DataIndex, type: CellDataType): TableColumnType<_Evaluation> {
+export function getFilterParams(
+ dataIndex: DataIndex,
+ type: CellDataType,
+): TableColumnType<_Evaluation> {
 const searchInput = useRef(null)
 const filterDropdown = ({setSelectedKeys, selectedKeys, confirm}: FilterDropdownProps) => {
@@ -32,6 +35,7 @@ export function getFilterParams(dataIndex: DataIndex, type: CellDataType): Table
 confirm({closeDropdown: false})
 }}
 style={{display: "block"}}
+ type={type}
 />
 )}
@@ -51,9 +55,9 @@ export function getFilterParams(dataIndex: DataIndex, type: CellDataType): Table
 item.variantName?.toLowerCase().includes(value.toLowerCase()),
 )
 } else if (cellValue.hasOwnProperty("name")) {
- return cellValue.name.toLowerCase().includes(value.toLowerCase())
+ return cellValue.name.toString().toLowerCase().includes(value.toLowerCase())
 } else if (cellValue.hasOwnProperty("value")) {
- return cellValue.value.toLowerCase().includes(value.toLowerCase())
+ return cellValue.value.toString().toLowerCase().includes(value.toLowerCase())
 }
 }
 return cellValue?.toString().toLowerCase().includes(value.toLowerCase())

From 4fc0fbfb59a237003ac8621ddd8677ad1379ca90 Mon Sep 17 00:00:00 2001
From: Kaosiso Ezealigo
Date: Tue, 3 Sep 2024 20:38:06 +0100
Subject: [PATCH 080/149] feat(frontend): added helper function to transform trace tree to json

---
 .../ConfigureEvaluator/index.tsx | 13 +++++++
 agenta-web/src/lib/transformers.ts | 29 
+++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 9ec346e29f..e852c35ce8 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -41,6 +41,7 @@ import {Editor} from "@monaco-editor/react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" import {formatCurrency, formatLatency} from "@/lib/helpers/formatters" +import {fromBaseResponseToTraceSpanType, transformTraceTreeToJson} from "@/lib/transformers" type ConfigureNewEvaluatorProps = { setCurrent: React.Dispatch> @@ -118,6 +119,7 @@ const ConfigureNewEvaluator = ({ const abortControllersRef = useRef(null) const [isRunningVariant, setIsRunningVariant] = useState(false) const [variantResult, setVariantResult] = useState("") + const [traceTree, setTraceTree] = useState>({}) const evalFields = useMemo( () => @@ -229,6 +231,13 @@ const ConfigureNewEvaluator = ({ latency: formatLatency(trace?.latency), }), ) + if (trace?.spans) { + setTraceTree( + transformTraceTreeToJson( + fromBaseResponseToTraceSpanType(trace.spans, trace.trace_id)[0], + ), + ) + } } else { console.error("Unknown response type:", result) } @@ -458,6 +467,10 @@ const ConfigureNewEvaluator = ({ width="100%" language="json" theme={`vs-${appTheme}`} + value={getStringOrJson(traceTree)} + // onChange={(value) => { + // console.log(value) + // }} options={{wordWrap: "on"}} /> diff --git a/agenta-web/src/lib/transformers.ts b/agenta-web/src/lib/transformers.ts index 63ab6fb2d0..637e9e4ccf 100644 --- a/agenta-web/src/lib/transformers.ts +++ b/agenta-web/src/lib/transformers.ts @@ -209,3 +209,32 @@ export const fromBaseResponseToTraceSpanType = ( return [top_level_spans, spans_dict] } + +export const transformTraceTreeToJson = (tree: TraceSpan[]) => { + const nodeMap: Record = {} + + function addNode(item: TraceSpan) { + if (item.name) { + if (!nodeMap[item.name]) { + nodeMap[item.name] = { + ...item.content, + ...(item.children ? transformTraceTreeToJson(item.children) : null), + } + } else { + if (!Array.isArray(nodeMap[item.name])) { + nodeMap[item.name] = [nodeMap[item.name]] + } + nodeMap[item.name].push({ + ...item.content, + ...(item.children ? 
transformTraceTreeToJson(item.children) : null), + }) + } + } + } + + tree.forEach((item) => { + addNode(item) + }) + + return nodeMap +} From 0ca0bc111a9f7827398d7ac463884e33f3703465 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 3 Sep 2024 20:43:03 +0100 Subject: [PATCH 081/149] minor naming improvement --- agenta-web/src/lib/transformers.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-web/src/lib/transformers.ts b/agenta-web/src/lib/transformers.ts index 637e9e4ccf..786c22c2c7 100644 --- a/agenta-web/src/lib/transformers.ts +++ b/agenta-web/src/lib/transformers.ts @@ -213,7 +213,7 @@ export const fromBaseResponseToTraceSpanType = ( export const transformTraceTreeToJson = (tree: TraceSpan[]) => { const nodeMap: Record = {} - function addNode(item: TraceSpan) { + function addTree(item: TraceSpan) { if (item.name) { if (!nodeMap[item.name]) { nodeMap[item.name] = { @@ -233,7 +233,7 @@ export const transformTraceTreeToJson = (tree: TraceSpan[]) => { } tree.forEach((item) => { - addNode(item) + addTree(item) }) return nodeMap From cacad5769a156c1e40e6968a4069d2d4e011ecd8 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Wed, 4 Sep 2024 17:11:47 +0600 Subject: [PATCH 082/149] ui(frontend): added table results column --- .../autoEvaluation/AutoEvaluation.tsx | 219 ++++++++++++++---- .../pages/apps/[app_id]/evaluations/index.tsx | 27 +-- 2 files changed, 173 insertions(+), 73 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 31516ee6fb..6982203078 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -1,4 +1,4 @@ -import {_Evaluation, EvaluationStatus, JSSTheme} from "@/lib/Types" +import {_Evaluation, EvaluationStatus, EvaluatorConfig, JSSTheme} from "@/lib/Types" import { ArrowsLeftRight, Database, @@ -9,11 +9,11 @@ import { Rocket, Trash, } from "@phosphor-icons/react" -import {Button, Dropdown, DropdownProps, message, Space, Table} from "antd" +import {Button, Dropdown, DropdownProps, message, Popover, Space, Table, Tag} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import {ColumnsType} from "antd/es/table" -import {MoreOutlined} from "@ant-design/icons" +import {EditOutlined, InfoCircleOutlined, MoreOutlined} from "@ant-design/icons" import EvaluatorsModal from "./EvaluatorsModal/EvaluatorsModal" import {useQueryParam} from "@/hooks/useQuery" import {formatDay} from "@/lib/helpers/dateTimeHelper" @@ -38,47 +38,57 @@ import {runningStatuses} from "../../evaluations/cellRenderers/cellRenderers" import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" import {getFilterParams} from "./SearchFilter" - -interface AutoEvaluationProps { - evaluationList: _Evaluation[] - fetchingEvaluations: boolean - setEvaluationList: React.Dispatch> -} +import {uniqBy} from "lodash" +import NewEvaluatorModal from "../evaluators/NewEvaluatorModal" const useStyles = createUseStyles((theme: JSSTheme) => ({ + resultTag: { + minWidth: 150, + display: "flex", + cursor: "pointer", + alignItems: "stretch", + borderRadius: theme.borderRadiusSM, + border: `1px solid ${theme.colorBorder}`, + textAlign: "center", + "& > div:nth-child(1)": { + backgroundColor: "rgba(0, 0, 0, 0.02)", + lineHeight: theme.lineHeight, + 
flex: 1, + borderRight: `1px solid ${theme.colorBorder}`, + padding: "0 7px", + }, + "& > div:nth-child(2)": { + padding: "0 7px", + }, + }, button: { display: "flex", alignItems: "center", }, })) -const AutoEvaluation = ({ - evaluationList, - fetchingEvaluations, - setEvaluationList, -}: AutoEvaluationProps) => { +const AutoEvaluation = () => { const classes = useStyles() const appId = useAppId() const router = useRouter() const [selectedRowKeys, setSelectedRowKeys] = useState([]) - const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useQueryParam( - "configureEvaluatorModal", - "", - ) - // create new evaluation + const [evaluationList, setEvaluationList] = useState<_Evaluation[]>([]) const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) const [isEvalLoading, setIsEvalLoading] = useState(false) const [evaluators, setEvaluators] = useAtom(evaluatorsAtom) const setEvaluatorConfigs = useAtom(evaluatorConfigsAtom)[1] - // delete evaluation const [selectedEvalRecord, setSelectedEvalRecord] = useState<_Evaluation>() const [isDeleteEvalModalOpen, setIsDeleteEvalModalOpen] = useState(false) const [isDeleteEvalMultipleModalOpen, setIsDeleteEvalMultipleModalOpen] = useState(false) - //edit columns const [editColumns, setEditColumns] = useState([]) const [isFilterColsDropdownOpen, setIsFilterColsDropdownOpen] = useState(false) - // + const [selectedConfigEdit, setSelectedConfigEdit] = useState() + const [isEditEvalConfigOpen, setIsEditEvalConfigOpen] = useState(false) + const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useQueryParam( + "configureEvaluatorModal", + "", + ) const stoppers = useRef() const fetchEvaluations = async () => { @@ -166,6 +176,24 @@ const AutoEvaluation = ({ router.push(`/apps/${appId}/playground?variant=${variantName}&revision=${revisionNum}`) } + const evaluatorConfigs = useMemo( + () => + uniqBy( + evaluationList + .map((item) => + item.aggregated_results.map((item) => ({ + ...item.evaluator_config, + evaluator: evaluators.find( + (e) => e.key === item.evaluator_config.evaluator_key, + ), + })), + ) + .flat(), + "id", + ), + [evaluationList], + ) + const columns: ColumnsType<_Evaluation> = [ { title: "Variant", @@ -214,32 +242,110 @@ const AutoEvaluation = ({ { title: "Results", key: "results", - children: [ - { - title: "Evaluator 1", - // dataIndex: "aggregated_results", - key: "results", - onHeaderCell: () => ({ - style: {minWidth: 240}, - }), - }, - { - title: "Evaluator 2", - // dataIndex: "aggregated_results", - key: "results", - onHeaderCell: () => ({ - style: {minWidth: 240}, - }), + onHeaderCell: () => ({style: {minWidth: 240}}), + children: evaluatorConfigs.map((evaluator, idx) => ({ + title: evaluator.name, + key: `results-${idx}`, + onHeaderCell: () => ({style: {minWidth: 240}}), + showSorterTooltip: false, + sorter: { + compare: (a, b) => { + const getSortValue = (item: _Evaluation) => { + if (item.aggregated_results && item.aggregated_results.length > 0) { + const result = item.aggregated_results[0].result + if (result && typeof result.value === "number") { + return result.value + } + } + return 0 + } + return getSortValue(a) - getSortValue(b) + }, }, - { - title: "Evaluator 3", - // dataIndex: "aggregated_results", - key: "results", - onHeaderCell: () => ({ - style: {minWidth: 240}, - }), + render: (_, record) => { + if (!evaluators?.length) return + + const matchingResults = record.aggregated_results.filter( + (result) => result.evaluator_config.id === evaluator.id, + ) + + return ( + + 
{matchingResults.map((result, index) => + result.result.error ? ( + + {result.result.error?.stacktrace} + + } + title={result.result.error?.message} + > + + + ) : ( + e.stopPropagation()} + > +
+ {result.evaluator_config.name} +
+
{getTypedValue(result.result)}
+ + } + title={ +
e.stopPropagation()} + > + + {evaluator?.name} + +
+ } + > +
e.stopPropagation()} + className={classes.resultTag} + > +
{result.evaluator_config.name}
+
{getTypedValue(result.result)}
+
+
+ ), + )} +
+ ) }, - ], + })), }, { title: "Created on", @@ -357,6 +463,12 @@ const AutoEvaluation = ({ [evaluationList], ) + useEffect(() => { + if (!appId) return + + fetchEvaluations() + }, [appId]) + useUpdateEffect(() => { stoppers.current?.() @@ -459,7 +571,7 @@ const AutoEvaluation = ({
)} + { + setIsEditEvalConfigOpen(false) + fetchEvaluations() + }} + newEvalModalConfigOpen={isEditEvalConfigOpen} + setNewEvalModalConfigOpen={setIsEditEvalConfigOpen} + setNewEvalModalOpen={() => {}} + editMode={true} + initialValues={selectedConfigEdit} + /> + {selectedEvalRecord && ( ({ @@ -36,40 +33,18 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ })) const EvaluationsPage = () => { - const appId = useAppId() const classes = useStyles() - const [autoEvaluationList, setAutoEvaluationList] = useState<_Evaluation[]>([]) const [selectedEvaluation, setSelectedEvaluation] = useQueryParam( "selectedEvaluation", "auto_evaluation", ) - const [fetchingEvaluations, setFetchingEvaluations] = useState(false) - - useEffect(() => { - if (!appId) return - - setFetchingEvaluations(true) - - fetchAllEvaluations(appId) - .then((autoEvalResult) => { - setAutoEvaluationList(autoEvalResult) - }) - .catch(console.error) - .finally(() => setFetchingEvaluations(false)) - }, [appId]) const items: TabsProps["items"] = [ { key: "auto_evaluation", label: "Automatic Evaluation", icon: , - children: ( - - ), + children: , }, { key: "ab_testing_evaluation", From a1cbbd6b1260ac81122d609d280800e5c73ac147 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Wed, 4 Sep 2024 19:20:42 +0600 Subject: [PATCH 083/149] refactor(frontend): removed unsed code --- .../NewEvaluationModal.tsx | 0 .../autoEvaluation/AutoEvaluation.tsx | 6 +- .../autoEvaluation/NewEvaluationModel.tsx | 244 -------- .../evaluationResults/EmptyEvaluations.tsx | 89 --- .../evaluationResults/EvaluationResults.tsx | 540 ------------------ .../AutomaticEvalOverview.tsx | 2 +- 6 files changed, 2 insertions(+), 879 deletions(-) rename agenta-web/src/components/pages/evaluations/{evaluationResults => NewEvaluation}/NewEvaluationModal.tsx (100%) delete mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx delete mode 100644 agenta-web/src/components/pages/evaluations/evaluationResults/EmptyEvaluations.tsx delete mode 100644 agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx b/agenta-web/src/components/pages/evaluations/NewEvaluation/NewEvaluationModal.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/evaluationResults/NewEvaluationModal.tsx rename to agenta-web/src/components/pages/evaluations/NewEvaluation/NewEvaluationModal.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 6982203078..58386d3faa 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -19,7 +19,7 @@ import {useQueryParam} from "@/hooks/useQuery" import {formatDay} from "@/lib/helpers/dateTimeHelper" import {calcEvalDuration, getTypedValue} from "@/lib/helpers/evaluate" import {variantNameWithRev} from "@/lib/helpers/variantHelper" -import NewEvaluationModal from "./NewEvaluationModel" +import NewEvaluationModal from "@/components/pages/evaluations/NewEvaluation/NewEvaluationModal" import { deleteEvaluations, fetchAllEvaluations, @@ -594,10 +594,6 @@ const AutoEvaluation = () => { { - setIsConfigEvaluatorModalOpen("open") - setNewEvalModalOpen(false) - }} onCancel={() => { setNewEvalModalOpen(false) }} diff --git 
a/agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx deleted file mode 100644 index 8092744ae5..0000000000 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/NewEvaluationModel.tsx +++ /dev/null @@ -1,244 +0,0 @@ -import React, {useEffect, useState} from "react" -import {useAppId} from "@/hooks/useAppId" -import {JSSTheme, Variant, testset} from "@/lib/Types" -import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {apiKeyObject, redirectIfNoLLMKeys} from "@/lib/helpers/utils" -import {fetchVariants} from "@/services/api" -import {CreateEvaluationData, createEvalutaiton} from "@/services/evaluations/api" -import {fetchTestsets} from "@/services/testsets/api" -import {Button, Divider, Form, Modal, Select, Spin} from "antd" -import {useAtom} from "jotai" -import {createUseStyles} from "react-jss" -import {ChartDonut, Plus} from "@phosphor-icons/react" - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - spinContainer: { - display: "grid", - placeItems: "center", - height: "100%", - }, - selector: { - width: 300, - }, - configRow: { - display: "flex", - alignItems: "center", - justifyContent: "space-between", - }, - divider: { - margin: "1rem -1.5rem", - width: "unset", - }, - container: { - "& .ant-modal-footer": { - display: "flex", - alignItems: "center", - justifyContent: "flex-end", - }, - }, - modalContainer: { - display: "flex", - alignItems: "center", - }, - selectItemLabels: { - fontSize: theme.fontSizeSM, - lineHeight: theme.lineHeightSM, - color: theme.colorTextDescription, - margin: "0px 5px", - }, -})) - -type Props = { - onSuccess?: () => void - onOpenEvaluatorModal: () => void -} & React.ComponentProps - -const NewEvaluationModal: React.FC = ({onSuccess, onOpenEvaluatorModal, ...props}) => { - const classes = useStyles() - const appId = useAppId() - const [fetching, setFetching] = useState(false) - const [testSets, setTestSets] = useState([]) - const [variants, setVariants] = useState([]) - const [evaluatorConfigs] = useAtom(evaluatorConfigsAtom) - const [evaluators] = useAtom(evaluatorsAtom) - const [submitLoading, setSubmitLoading] = useState(false) - const [form] = Form.useForm() - - useEffect(() => { - setFetching(true) - form.resetFields() - Promise.all([fetchTestsets(appId), fetchVariants(appId)]) - .then(([testSets, variants]) => { - setTestSets(testSets) - setVariants(variants) - }) - .catch(console.error) - .finally(() => setFetching(false)) - }, [props.open, appId]) - - const rateLimitValues = { - batch_size: 10, - max_retries: 3, - retry_delay: 3, - delay_between_batches: 5, - } - const correctAnswerColumn = "correct_answer" - - const onSubmit = (values: CreateEvaluationData) => { - // redirect if no llm keys and an AI Critique config is selected - if ( - values.evaluators_configs.some( - (id) => - evaluatorConfigs.find((config) => config.id === id)?.evaluator_key === - "auto_ai_critique", - ) && - redirectIfNoLLMKeys() - ) - return - setSubmitLoading(true) - createEvalutaiton(appId, { - testset_id: values.testset_id, - variant_ids: values.variant_ids, - evaluators_configs: values.evaluators_configs, - rate_limit: rateLimitValues, - lm_providers_keys: apiKeyObject(), - correct_answer_column: correctAnswerColumn, - }) - .then(onSuccess) - .catch(console.error) - .finally(() => setSubmitLoading(false)) - } - - return ( - , - loading: submitLoading, - className: classes.modalContainer, - }} - 
className={classes.container} - {...props} - > - - -
- - - - - - - - - - - - -
-
- ) -} - -export default NewEvaluationModal diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EmptyEvaluations.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EmptyEvaluations.tsx deleted file mode 100644 index 64de7a3f51..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EmptyEvaluations.tsx +++ /dev/null @@ -1,89 +0,0 @@ -import {JSSTheme} from "@/lib/Types" -import {PlusCircleOutlined, SlidersOutlined} from "@ant-design/icons" -import {Button, Empty, Space, Tooltip, Typography} from "antd" -import Image from "next/image" -import React from "react" -import {createUseStyles} from "react-jss" -import evaluationIllustration from "@/media/eval-illustration.png" - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - emptyRoot: { - height: "calc(100vh - 260px)", - display: "grid", - placeItems: "center", - }, - empty: { - "& .ant-empty-description": { - fontSize: 18, - marginTop: "0.75rem", - marginBottom: "1.5rem", - }, - }, - emptyImg: { - width: 120, - height: 120, - objectFit: "contain", - filter: theme.isDark ? "invert(1)" : "none", - opacity: 0.85, - }, -})) - -interface Props { - onConfigureEvaluators?: () => void - onBeginEvaluation?: () => void -} - -const EmptyEvaluations: React.FC = ({onConfigureEvaluators, onBeginEvaluation}) => { - const classes = useStyles() - - return ( -
- - Get Started with Your First Evaluation -
- - } - image={ - - } - > - - - - - Or - - - - -
-
- ) -} - -export default EmptyEvaluations diff --git a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx b/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx deleted file mode 100644 index 97cae7ee85..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluationResults/EvaluationResults.tsx +++ /dev/null @@ -1,540 +0,0 @@ -import React, {useEffect, useMemo, useRef, useState} from "react" -import {AgGridReact} from "ag-grid-react" -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -import {ColDef, ValueGetterParams} from "ag-grid-community" -import {createUseStyles} from "react-jss" -import {Button, DropdownProps, Space, Spin, Tag, Tooltip, Typography, theme} from "antd" -import { - DeleteOutlined, - DownloadOutlined, - PlusCircleOutlined, - SlidersOutlined, - SwapOutlined, -} from "@ant-design/icons" -import {EvaluationStatus, GenericObject, JSSTheme, _Evaluation} from "@/lib/Types" -import {uniqBy} from "lodash" -import dayjs from "dayjs" -import relativeTime from "dayjs/plugin/relativeTime" -import duration from "dayjs/plugin/duration" -import NewEvaluationModal from "./NewEvaluationModal" -import {useAppId} from "@/hooks/useAppId" -import { - deleteEvaluations, - fetchAllEvaluations, - fetchEvaluationStatus, -} from "@/services/evaluations/api" -import {useUpdateEffect} from "usehooks-ts" -import {shortPoll} from "@/lib/helpers/utils" -import AlertPopup from "@/components/AlertPopup/AlertPopup" -import { - DateFromNowRenderer, - LinkCellRenderer, - StatusRenderer, - runningStatuses, - statusMapper, -} from "../cellRenderers/cellRenderers" -import {useAtom} from "jotai" -import {evaluatorsAtom} from "@/lib/atoms/evaluation" -import AgCustomHeader from "@/components/AgCustomHeader/AgCustomHeader" -import {useRouter} from "next/router" -import EmptyEvaluations from "./EmptyEvaluations" -import {calcEvalDuration, getFilterParams, getTypedValue} from "@/lib/helpers/evaluate" -import Link from "next/link" -import FilterColumns, {generateFilterItems} from "../FilterColumns/FilterColumns" -import {variantNameWithRev} from "@/lib/helpers/variantHelper" -import {getAppValues} from "@/contexts/app.context" -import {convertToCsv, downloadCsv} from "@/lib/helpers/fileManipulations" -import {formatDate24} from "@/lib/helpers/dateTimeHelper" -import {useQueryParam} from "@/hooks/useQuery" - -dayjs.extend(relativeTime) -dayjs.extend(duration) - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - root: { - display: "flex", - flexDirection: "column", - gap: "1rem", - }, - table: { - height: "calc(100vh - 260px)", - }, - buttonsGroup: { - marginTop: "1rem", - alignSelf: "flex-end", - }, - dropdownMenu: { - "&>.ant-dropdown-menu-item": { - "& .anticon-check": { - display: "none", - }, - }, - "&>.ant-dropdown-menu-item-selected": { - "&:not(:hover)": { - backgroundColor: "transparent !important", - }, - "& .anticon-check": { - display: "inline-flex !important", - }, - }, - }, -})) - -interface Props {} - -const EvaluationResults: React.FC = () => { - const {appTheme} = useAppTheme() - const classes = useStyles() - const appId = useAppId() - const [evaluations, setEvaluations] = useState<_Evaluation[]>([]) - const [evaluators] = useAtom(evaluatorsAtom) - const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) - const [queryNewEvalModalOpen, setQueryNewEvalModalOpen] = - useQueryParam("openNewEvaluationModal") - const [fetching, setFetching] = useState(false) - const [selected, setSelected] = 
useState<_Evaluation[]>([]) - const stoppers = useRef() - const router = useRouter() - const {token} = theme.useToken() - const gridRef = useRef() - const [hiddenCols, setHiddenCols] = useState([]) - const [isFilterColsDropdownOpen, setIsFilterColsDropdownOpen] = useState(false) - - const runningEvaluationIds = useMemo( - () => - evaluations - .filter((item) => runningStatuses.includes(item.status.value)) - .map((item) => item.id), - [evaluations], - ) - - const onDelete = () => { - AlertPopup({ - title: "Delete Evaluations", - message: `Are you sure you want to delete all ${selected.length} selected evaluations?`, - onOk: () => - deleteEvaluations(selected.map((item) => item.id)) - .catch(console.error) - .then(fetcher), - }) - } - - const fetcher = () => { - setFetching(true) - fetchAllEvaluations(appId) - .then(setEvaluations) - .catch(console.error) - .finally(() => setFetching(false)) - } - - useEffect(() => { - fetcher() - }, [appId]) - - //update status of running evaluations through short polling - useUpdateEffect(() => { - stoppers.current?.() - - if (runningEvaluationIds.length) { - stoppers.current = shortPoll( - () => - Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(id))) - .then((res) => { - setEvaluations((prev) => { - const newEvals = [...prev] - runningEvaluationIds.forEach((id, ix) => { - const index = newEvals.findIndex((e) => e.id === id) - if (index !== -1) { - newEvals[index].status = res[ix].status - newEvals[index].duration = calcEvalDuration(newEvals[index]) - } - }) - if ( - res.some((item) => !runningStatuses.includes(item.status.value)) - ) - fetcher() - return newEvals - }) - }) - .catch(console.error), - {delayMs: 2000, timeoutMs: Infinity}, - ).stopper - } - - return () => { - stoppers.current?.() - } - }, [JSON.stringify(runningEvaluationIds)]) - - const evaluatorConfigs = useMemo( - () => - uniqBy( - evaluations - .map((item) => - item.aggregated_results.map((item) => ({ - ...item.evaluator_config, - evaluator: evaluators.find( - (e) => e.key === item.evaluator_config.evaluator_key, - ), - })), - ) - .flat(), - "id", - ), - [evaluations], - ) - - const compareDisabled = useMemo( - () => - selected.length < 2 || - selected.some( - (item) => - item.status.value === EvaluationStatus.STARTED || - item.status.value === EvaluationStatus.INITIALIZED || - item.testset.id !== selected[0].testset.id, - ), - [selected], - ) - - const colDefs = useMemo(() => { - const colDefs: ColDef<_Evaluation>[] = [ - { - field: "variants", - flex: 1, - minWidth: 160, - pinned: "left", - headerCheckboxSelection: true, - hide: hiddenCols.includes("Variant"), - checkboxSelection: true, - showDisabledCheckboxes: true, - cellRenderer: (params: any) => { - const {revisions, variants} = params.data - return ( - - {params.value} - - ) - }, - onCellClicked(params: any) { - const {revisions, variants} = params.data - router.push( - `/apps/${appId}/playground?variant=${variants[0].variantName}&revision=${revisions[0]}`, - ) - }, - valueGetter: (params) => - variantNameWithRev({ - variant_name: params.data?.variants[0].variantName ?? 
"", - revision: params.data?.revisions[0], - }), - headerName: "Variant", - tooltipValueGetter: (params) => params.data?.variants[0].variantName, - ...getFilterParams("text"), - }, - { - field: "testset.name", - hide: hiddenCols.includes("Testset"), - headerName: "Testset", - cellRenderer: (params: any) => ( - - ), - flex: 1, - minWidth: 160, - tooltipValueGetter: (params) => params.value, - ...getFilterParams("text"), - onCellClicked(params) { - router.push(`/apps/${appId}/testsets/${params.data?.testset.id}`) - }, - }, - ...evaluatorConfigs.map( - (config) => - ({ - flex: 1, - minWidth: 190, - hide: hiddenCols.includes(config.name), - field: "aggregated_results", - headerName: config.name, - headerComponent: (props: any) => ( - - - - - {config.name} - - - {config.evaluator?.name} - - - - ), - autoHeaderHeight: true, - ...getFilterParams("number"), - cellRenderer: (params: ValueGetterParams<_Evaluation, any>) => { - const result = params.data?.aggregated_results.find( - (item) => item.evaluator_config.id === config.id, - )?.result - - return result?.error ? ( - - Error - - ) : ( - {getTypedValue(result)} - ) - }, - valueGetter: (params) => - getTypedValue( - params.data?.aggregated_results.find( - (item) => item.evaluator_config.id === config.id, - )?.result, - ), - tooltipValueGetter: (params) => - params.data?.aggregated_results - .find((item) => item.evaluator_config.id === config.id) - ?.result?.value?.toString() || "", - }) as ColDef<_Evaluation>, - ), - { - flex: 1, - headerName: "Status", - hide: hiddenCols.includes("Status"), - field: "status", - minWidth: 185, - pinned: "right", - ...getFilterParams("text"), - filterValueGetter: (params) => - statusMapper(token)(params.data?.status.value as EvaluationStatus).label, - cellRenderer: StatusRenderer, - valueGetter: (params) => - statusMapper(token)(params.data?.status.value as EvaluationStatus).label, - }, - { - flex: 1, - field: "average_latency", - headerName: "Avg. 
Latency", - hide: hiddenCols.includes("Latency"), - minWidth: 120, - ...getFilterParams("number"), - valueGetter: (params) => getTypedValue(params?.data?.average_latency), - }, - { - flex: 1, - field: "total_cost", - headerName: "Total Cost", - hide: hiddenCols.includes("Cost"), - minWidth: 120, - ...getFilterParams("number"), - valueGetter: (params) => getTypedValue(params?.data?.total_cost), - }, - { - flex: 1, - field: "created_at", - headerName: "Created", - hide: hiddenCols.includes("Created"), - minWidth: 160, - ...getFilterParams("date"), - cellRenderer: DateFromNowRenderer, - sort: "desc", - valueFormatter: (params) => formatDate24(params.value), - }, - ] - return colDefs - }, [evaluatorConfigs, hiddenCols]) - - const compareBtnNode = ( - - ) - const onToggleEvaluatorVisibility = (evalConfigId: string) => { - if (!hiddenCols.includes(evalConfigId)) { - setHiddenCols([...hiddenCols, evalConfigId]) - } else { - setHiddenCols(hiddenCols.filter((item) => item !== evalConfigId)) - } - } - - const shownCols = useMemo( - () => - colDefs - .map((item) => item.headerName) - .filter((item) => item !== undefined && !hiddenCols.includes(item)) as string[], - [colDefs], - ) - - const handleOpenChangeFilterCols: DropdownProps["onOpenChange"] = (nextOpen, info) => { - if (info.source === "trigger" || nextOpen) { - setIsFilterColsDropdownOpen(nextOpen) - } - } - - const onExport = () => { - if (!gridRef.current) return - const {currentApp} = getAppValues() - const filename = `${currentApp?.app_name}_evaluation_scenarios.csv` - if (!!selected.length) { - const csvData = convertToCsv( - selected.map((item) => ({ - Variant: variantNameWithRev({ - variant_name: item.variants[0].variantName ?? "", - revision: item.revisions[0], - }), - Testset: item.testset.name, - ...item.aggregated_results.reduce((acc, curr) => { - if (!acc[curr.evaluator_config.name]) { - acc[curr.evaluator_config.name] = getTypedValue(curr.result) - } - return acc - }, {} as GenericObject), - "Avg. Latency": getTypedValue(item.average_latency), - "Total Cost": getTypedValue(item.average_cost), - Created: formatDate24(item.created_at), - Status: statusMapper(token)(item.status.value as EvaluationStatus).label, - })), - colDefs.map((col) => col.headerName!), - ) - downloadCsv(csvData, filename) - } else { - gridRef.current.api.exportDataAsCsv({ - fileName: filename, - }) - } - } - return ( - <> - {!fetching && !evaluations.length ? ( - - router.push(`/apps/${appId}/evaluations/new-evaluator`) - } - onBeginEvaluation={() => { - setNewEvalModalOpen(true) - }} - /> - ) : ( -
- - - {compareDisabled ? ( - - {compareBtnNode} - - ) : ( - compareBtnNode - )} - - - - - { - onToggleEvaluatorVisibility(key) - setIsFilterColsDropdownOpen(true) - }} - /> - - - - -
- - ref={gridRef as any} - rowData={evaluations} - columnDefs={colDefs} - rowStyle={{ - cursor: "pointer", - }} - getRowId={(params) => params.data.id} - onRowClicked={(params) => { - // ignore clicks on the checkbox col - if ( - params.eventPath?.find( - (item: any) => item.ariaColIndex === "1", - ) - ) - return - ;(EvaluationStatus.FINISHED === params.data?.status.value || - EvaluationStatus.FINISHED_WITH_ERRORS === - params.data?.status.value || - EvaluationStatus.AGGREGATION_FAILED === - params.data?.status.value) && - router.push( - `/apps/${appId}/evaluations/results/${params.data?.id}`, - ) - }} - rowSelection="multiple" - suppressRowClickSelection - onSelectionChanged={(event) => - setSelected(event.api.getSelectedRows()) - } - tooltipShowDelay={0} - /> -
-
-
- )}
- {
- setNewEvalModalOpen(false)
- setQueryNewEvalModalOpen("")
- }}
- onSuccess={() => {
- setNewEvalModalOpen(false)
- setQueryNewEvalModalOpen("")
- fetcher()
- }}
- />
-
- )
-}
-
-export default EvaluationResults
diff --git a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx
index 0c52b085d5..acd74fc2fd 100644
--- a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx
+++ b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx
@@ -23,7 +23,7 @@ import {useRouter} from "next/router"
 import React, {useEffect, useMemo, useRef, useState} from "react"
 import {createUseStyles} from "react-jss"
 import StatusRenderer from "./StatusRenderer"
-import NewEvaluationModal from "../../evaluations/evaluationResults/NewEvaluationModal"
+import NewEvaluationModal from "../../evaluations/NewEvaluation/NewEvaluationModal"
 import {useAtom} from "jotai"
 import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation"
 import {runningStatuses} from "../../evaluations/cellRenderers/cellRenderers"

From f3c541bf72193f002a8c81db2f767865056848ff Mon Sep 17 00:00:00 2001
From: Kaosiso Ezealigo
Date: Wed, 4 Sep 2024 15:31:32 +0100
Subject: [PATCH 084/149] refactor(frontend): improved component name for clarity

---
 .../autoEvaluation/EvaluatorsModal/Evaluators/index.tsx | 8 ++++----
 .../autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx
index 840fa52520..1668419b8e 100644
--- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx
+++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx
@@ -7,7 +7,7 @@ import {createUseStyles} from "react-jss"
 import EvaluatorCard from "./EvaluatorCard"
 import EvaluatorList from "./EvaluatorList"
-type ConfigureEvaluatorModalProps = {
+type EvaluatorsProps = {
 evaluatorConfigs: EvaluatorConfig[]
 handleOnCancel: () => void
 setCurrent: React.Dispatch>
@@ -51,13 +51,13 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({
 },
 }))
-const ConfigureEvaluatorModal = ({
+const Evaluators = ({
 evaluatorConfigs,
 handleOnCancel,
 setCurrent,
 setSelectedEvaluator,
 fetchingEvalConfigs,
-}: ConfigureEvaluatorModalProps) => {
+}: EvaluatorsProps) => {
 const classes = useStyles()
 const [searchTerm, setSearchTerm] = useState("")
 const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card")
@@ -139,4 +139,4 @@ const ConfigureEvaluatorModal = ({
 )
 }
-export default ConfigureEvaluatorModal
+export default Evaluators
diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx
index 621227b93d..b428d86456 100644
--- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx
+++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx
@@ -7,7 +7,7 @@ import {createUseStyles} from "react-jss"
 import NewEvaluatorList from "./NewEvaluatorList"
 import NewEvaluatorCard from "./NewEvaluatorCard"
-type 
CreateNewEvaluatorProps = { +type NewEvaluatorProps = { setCurrent: React.Dispatch> handleOnCancel: () => void evaluators: Evaluator[] @@ -50,12 +50,12 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const CreateNewEvaluator = ({ +const NewEvaluator = ({ evaluators, setCurrent, handleOnCancel, setSelectedEvaluator, -}: CreateNewEvaluatorProps) => { +}: NewEvaluatorProps) => { const classes = useStyles() const [searchTerm, setSearchTerm] = useState("") const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") @@ -162,4 +162,4 @@ const CreateNewEvaluator = ({ ) } -export default CreateNewEvaluator +export default NewEvaluator From dc2067e6f89d14bf6bb0b173e6d7bc5f657adb8a Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Wed, 4 Sep 2024 15:32:13 +0100 Subject: [PATCH 085/149] fix(frontend): transform trace tree and setup mapping --- .../ConfigureEvaluator/AdvancedSettings.tsx | 18 +++- .../ConfigureEvaluator/DynamicFormField.tsx | 16 ++- .../ConfigureEvaluator/index.tsx | 102 ++++++++++-------- agenta-web/src/lib/transformers.ts | 28 +++++ 4 files changed, 117 insertions(+), 47 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx index 28c8f52451..7365f3d3b0 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx @@ -1,9 +1,10 @@ import React from "react" -import {Form, Input, InputNumber, Switch, Tooltip, Collapse, theme} from "antd" +import {Form, Input, InputNumber, Switch, Tooltip, Collapse, theme, AutoComplete} from "antd" import {CaretRightOutlined, InfoCircleOutlined} from "@ant-design/icons" import {createUseStyles} from "react-jss" import {Editor} from "@monaco-editor/react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" +import {generatePaths} from "@/lib/transformers" const useStyles = createUseStyles((theme: any) => ({ label: { @@ -20,9 +21,10 @@ const useStyles = createUseStyles((theme: any) => ({ type AdvancedSettingsProps = { settings: Record[] + selectedTestcase: Record | null } -const AdvancedSettings: React.FC = ({settings}) => { +const AdvancedSettings: React.FC = ({settings, selectedTestcase}) => { const classes = useStyles() const {appTheme} = useAppTheme() const {token} = theme.useToken() @@ -57,7 +59,17 @@ const AdvancedSettings: React.FC = ({settings}) => { initialValue={field.default} rules={rules} > - {field.type === "string" || field.type === "regex" ? ( + {(field.type === "string" || field.type === "regex") && + selectedTestcase ? ( + + option!.value + .toUpperCase() + .indexOf(inputValue.toUpperCase()) !== -1 + } + /> + ) : field.type === "string" || field.type === "regex" ? ( ) : field.type === "number" ? 
( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx index 1bab9a6e8c..aabd464a86 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DynamicFormField.tsx @@ -1,15 +1,17 @@ import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {isValidRegex} from "@/lib/helpers/validators" +import {generatePaths} from "@/lib/transformers" import {EvaluationSettingsTemplate, JSSTheme} from "@/lib/Types" import {InfoCircleOutlined} from "@ant-design/icons" import {Editor} from "@monaco-editor/react" -import {theme, Form, Tooltip, InputNumber, Switch, Input} from "antd" +import {theme, Form, Tooltip, InputNumber, Switch, Input, AutoComplete} from "antd" import {Rule} from "antd/es/form" import Link from "next/link" import {createUseStyles} from "react-jss" type DynamicFormFieldProps = EvaluationSettingsTemplate & { name: string | string[] + traceTree: Record } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -49,6 +51,7 @@ export const DynamicFormField: React.FC = ({ min, max, required, + traceTree, }) => { const {appTheme} = useAppTheme() const classes = useStyles() @@ -97,7 +100,16 @@ export const DynamicFormField: React.FC = ({ initialValue={defaultVal} rules={rules} > - {type === "string" || type === "regex" ? ( + {name[1] === "question_key" || + name[1] === "answer_key" || + name[1] === "contexts_key" ? ( + + option!.value.toUpperCase().indexOf(inputValue.toUpperCase()) !== -1 + } + /> + ) : type === "string" || type === "regex" ? ( ) : type === "number" ? 
( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index e852c35ce8..374bd0754e 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -1,4 +1,4 @@ -import {Evaluator, GenericObject, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" +import {Evaluator, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" import {CloseCircleOutlined, CloseOutlined} from "@ant-design/icons" import { ArrowLeft, @@ -30,12 +30,13 @@ import EvaluatorVariantModal from "./EvaluatorVariantModal" import { CreateEvaluationConfigData, createEvaluatorConfig, + createEvaluatorDataMapping, updateEvaluatorConfig, } from "@/services/evaluations/api" import {useAppId} from "@/hooks/useAppId" import {useLocalStorage} from "usehooks-ts" import {getAllVariantParameters} from "@/lib/helpers/variantHelper" -import {getStringOrJson, randString, removeKeys} from "@/lib/helpers/utils" +import {getStringOrJson, removeKeys} from "@/lib/helpers/utils" import {callVariant} from "@/services/api" import {Editor} from "@monaco-editor/react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" @@ -43,7 +44,7 @@ import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" import {formatCurrency, formatLatency} from "@/lib/helpers/formatters" import {fromBaseResponseToTraceSpanType, transformTraceTreeToJson} from "@/lib/transformers" -type ConfigureNewEvaluatorProps = { +type ConfigureEvaluatorProps = { setCurrent: React.Dispatch> handleOnCancel: () => void onSuccess: () => void @@ -94,7 +95,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const ConfigureNewEvaluator = ({ +const ConfigureEvaluator = ({ setCurrent, selectedEvaluator, handleOnCancel, @@ -105,7 +106,7 @@ const ConfigureNewEvaluator = ({ setSelectedTestcase, selectedVariant, setSelectedVariant, -}: ConfigureNewEvaluatorProps) => { +}: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() const {appTheme} = useAppTheme() @@ -119,7 +120,23 @@ const ConfigureNewEvaluator = ({ const abortControllersRef = useRef(null) const [isRunningVariant, setIsRunningVariant] = useState(false) const [variantResult, setVariantResult] = useState("") - const [traceTree, setTraceTree] = useState>({}) + const [traceTree, setTraceTree] = useState<{ + testcase: Record | null + }>({ + testcase: null, + }) + + const fetchEvalMapper = async () => { + try { + const response = await createEvaluatorDataMapping({ + inputs: {trace: traceTree}, + mapping: form.getFieldValue("settings_values"), + }) + console.log(response) + } catch (error) { + console.error(error) + } + } const evalFields = useMemo( () => @@ -161,24 +178,6 @@ const ConfigureNewEvaluator = ({ } } - useEffect(() => { - if (optInputs && selectedTestcase) { - setSelectedTestcase(() => { - let result: GenericObject = {} - - optInputs.forEach((data) => { - if (selectedTestcase.hasOwnProperty(data.name)) { - result[data.name] = selectedTestcase[data.name] - } - }) - - result["id"] = randString(6) - - return result - }) - } - }, [optInputs]) - useEffect(() => { if (!selectedVariant || !selectedTestcase) return @@ -232,11 +231,13 @@ const ConfigureNewEvaluator = ({ }), ) if (trace?.spans) { - setTraceTree( - 
transformTraceTreeToJson( + setTraceTree({ + ...transformTraceTreeToJson( fromBaseResponseToTraceSpanType(trace.spans, trace.trace_id)[0], ), - ) + + testcase: selectedTestcase, + }) } } else { console.error("Unknown response type:", result) @@ -333,7 +334,7 @@ const ConfigureNewEvaluator = ({ > - - + */} @@ -360,20 +361,28 @@ const ConfigureNewEvaluator = ({ Parameters - {basicSettingsFields.map((field) => ( - - ))} + {basicSettingsFields.map((field) => { + const {testcase, ...tree} = traceTree + + return ( + + ) + })} ) : ( "" )} {advancedSettingsFields.length > 0 && ( - + )} @@ -468,9 +477,14 @@ const ConfigureNewEvaluator = ({ language="json" theme={`vs-${appTheme}`} value={getStringOrJson(traceTree)} - // onChange={(value) => { - // console.log(value) - // }} + onChange={(value) => { + try { + if (value) { + const parsedValue = JSON.parse(value) + setTraceTree(parsedValue) + } + } catch (error) {} + }} options={{wordWrap: "on"}} /> @@ -494,7 +508,11 @@ const ConfigureNewEvaluator = ({ Output - @@ -523,4 +541,4 @@ const ConfigureNewEvaluator = ({ ) } -export default ConfigureNewEvaluator +export default ConfigureEvaluator diff --git a/agenta-web/src/lib/transformers.ts b/agenta-web/src/lib/transformers.ts index 786c22c2c7..51f720e328 100644 --- a/agenta-web/src/lib/transformers.ts +++ b/agenta-web/src/lib/transformers.ts @@ -238,3 +238,31 @@ export const transformTraceTreeToJson = (tree: TraceSpan[]) => { return nodeMap } + +export const generatePaths = (obj: Record, currentPath = "") => { + let paths: {value: string}[] = [] + + if (typeof obj === "object" && obj !== null && !Array.isArray(obj)) { + Object.entries(obj).forEach(([key, value]) => { + const newPath = currentPath ? `${currentPath}.${key}` : key + if (value && typeof value === "object" && Object.keys(value).length) { + paths.push({value: newPath}) + paths = paths.concat(generatePaths(value, newPath)) + } else if (value && typeof value !== "object") { + paths.push({value: newPath}) + } + }) + } else if (Array.isArray(obj)) { + obj.forEach((value, index) => { + const newPath = `${currentPath}[${index}]` + if (value && typeof value === "object" && Object.keys(value).length) { + paths.push({value: newPath}) + paths = paths.concat(generatePaths(value, newPath)) + } else if (value && typeof value !== "object") { + paths.push({value: newPath}) + } + }) + } + + return paths +} From c7f1afdd93c195d29512a2b5d887484ec7f1c14d Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Thu, 5 Sep 2024 12:09:15 +0600 Subject: [PATCH 086/149] fix(frontend): failing cypress test due to layout change --- agenta-web/cypress/e2e/eval.comparison.cy.ts | 22 ++++++-------- agenta-web/cypress/e2e/eval.evaluations.cy.ts | 18 ++++++----- agenta-web/cypress/e2e/eval.scenarios.cy.ts | 13 ++++---- .../cypress/support/commands/evaluations.ts | 6 +--- .../HumanEvaluations/AbTestingEvaluation.tsx | 5 +++- .../SingleModelEvaluation.tsx | 5 +++- .../autoEvaluation/AutoEvaluation.tsx | 30 ++++++++++++------- .../autoEvaluation/SearchFilter.tsx | 1 + .../AutomaticEvalOverview.tsx | 2 +- .../automaticEvaluation/StatusRenderer.tsx | 2 +- 10 files changed, 57 insertions(+), 47 deletions(-) diff --git a/agenta-web/cypress/e2e/eval.comparison.cy.ts b/agenta-web/cypress/e2e/eval.comparison.cy.ts index cf97725acb..a5e1e33fe4 100644 --- a/agenta-web/cypress/e2e/eval.comparison.cy.ts +++ b/agenta-web/cypress/e2e/eval.comparison.cy.ts @@ -39,8 +39,8 @@ describe("Evaluation Comparison Test", function () { context("Executing Evaluation Comparison Workflow", () => { 
beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations/results`) - cy.location("pathname").should("include", "/evaluations/results") + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") }) it("Should create 2 new Evaluations", () => { @@ -48,11 +48,7 @@ describe("Evaluation Comparison Test", function () { url: `${Cypress.env().baseApiURL}/evaluations/?app_id=${app_id}`, method: "GET", }).then((resp) => { - if (resp.body.length) { - cy.get('[data-cy="new-evaluation-button"]').click() - } else { - cy.get('[data-cy="new-evaluation-button__no_variants"]').click() - } + cy.get('[data-cy="new-evaluation-button"]').click() }) cy.get(".ant-modal-content").should("exist") @@ -73,19 +69,19 @@ describe("Evaluation Comparison Test", function () { }) it("Should verify that there are completed evaluations in the list", () => { - cy.get('.ag-row[row-index="0"]').should("exist") - cy.get('.ag-row[row-index="1"]').should("exist") - cy.get('.ag-cell[col-id="status"]', {timeout: 60000}) + cy.get(".ant-table-row").eq(0).should("exist") + cy.get(".ant-table-row").eq(1).should("exist") + cy.get('[data-cy="evaluation-status-cell"]', {timeout: 60000}) .eq(0) .should("contain.text", "Completed") - cy.get('.ag-cell[col-id="status"]', {timeout: 60000}) + cy.get('[data-cy="evaluation-status-cell"]', {timeout: 60000}) .eq(1) .should("contain.text", "Completed") }) it("Should select 2 evaluations, click on the compare button, and successfully navigate to the comparison page", () => { - cy.get("div.ag-selection-checkbox input").eq(0).check() - cy.get("div.ag-selection-checkbox input").eq(1).check() + cy.get(".ant-checkbox-input").eq(0).check() + cy.get('[data-cy="evaluation-results-compare-button"]').should("not.be.disabled") cy.get('[data-cy="evaluation-results-compare-button"]').click() cy.location("pathname").should("include", "/evaluations/results/compare") diff --git a/agenta-web/cypress/e2e/eval.evaluations.cy.ts b/agenta-web/cypress/e2e/eval.evaluations.cy.ts index 248a4e4778..52ad572f98 100644 --- a/agenta-web/cypress/e2e/eval.evaluations.cy.ts +++ b/agenta-web/cypress/e2e/eval.evaluations.cy.ts @@ -9,8 +9,8 @@ describe("Evaluations CRUD Operations Test", function () { context("Executing Evaluations CRUD operations", () => { beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations/results`) - cy.location("pathname").should("include", "/evaluations/results") + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") }) it("Should successfully create an Evaluation", () => { @@ -26,15 +26,17 @@ describe("Evaluations CRUD Operations Test", function () { }) it("Should verify the successful creation and completion of the evaluation", () => { - cy.get('.ag-row[row-index="0"]').should("exist") - cy.get('.ag-cell[col-id="status"]').should("contain.text", "Completed") + cy.get(".ant-table-row").eq(0).should("exist") + cy.get('[data-cy="evaluation-status-cell"]').should("contain.text", "Completed") }) it("Should select evaluation and successfully delete it", () => { - cy.get(".ag-root-wrapper").should("exist") - cy.get("div.ag-selection-checkbox input").eq(0).check() - cy.get(":nth-child(1) > .ant-btn > .ant-btn-icon > .anticon > svg").click() - cy.get(".ant-modal-confirm-btns > :nth-child(2) > span").click() + cy.get(".ant-checkbox-wrapper").should("exist") + cy.get(".ant-checkbox-input").eq(0).check() + cy.get('[data-cy="delete-evaluation-button"]').click() + + cy.get(".ant-modal-content").should("exist") + 
cy.get(".ant-modal-footer > .ant-btn-primary").click() }) }) diff --git a/agenta-web/cypress/e2e/eval.scenarios.cy.ts b/agenta-web/cypress/e2e/eval.scenarios.cy.ts index 51d9bf3714..5c545b13bd 100644 --- a/agenta-web/cypress/e2e/eval.scenarios.cy.ts +++ b/agenta-web/cypress/e2e/eval.scenarios.cy.ts @@ -9,8 +9,8 @@ describe("Evaluation Scenarios Test", function () { context("Executing Evaluation Scenarios Workflow", () => { beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations/results`) - cy.location("pathname").should("include", "/evaluations/results") + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") }) it("Should successfully create an Evaluation", () => { @@ -18,15 +18,14 @@ describe("Evaluation Scenarios Test", function () { }) it("Should verify that evalaution was created and completed successfully", () => { - cy.get('.ag-row[row-index="0"]').should("exist") - cy.get('.ag-cell[col-id="status"]').should("contain.text", "Completed") + cy.get(".ant-table-row").eq(0).should("exist") + cy.get('[data-cy="evaluation-status-cell"]').should("contain.text", "Completed") }) it("Should double click on the Evaluation and successfully navigate to the evalaution results page", () => { - cy.get(".ag-root-wrapper").should("exist") - cy.get('.ag-row-first > [col-id="aggregated_results"]').click() + cy.get(".ant-table-row").eq(0).should("exist") + cy.get(".ant-table-row").click() cy.wait(1000) - cy.get(".ag-cell-focus").dblclick() cy.contains(/Evaluation Results/i) cy.get('[data-cy="evalaution-scenarios-table"]').should("exist") }) diff --git a/agenta-web/cypress/support/commands/evaluations.ts b/agenta-web/cypress/support/commands/evaluations.ts index 78215ed34f..f47417bb0f 100644 --- a/agenta-web/cypress/support/commands/evaluations.ts +++ b/agenta-web/cypress/support/commands/evaluations.ts @@ -105,11 +105,7 @@ Cypress.Commands.add("createNewEvaluation", () => { url: `${Cypress.env().baseApiURL}/evaluations/?app_id=${app_id}`, method: "GET", }).then((resp) => { - if (resp.body.length) { - cy.get('[data-cy="new-evaluation-button"]').click() - } else { - cy.get('[data-cy="new-evaluation-button__no_variants"]').click() - } + cy.get('[data-cy="new-evaluation-button"]').click() }) cy.get(".ant-modal-content").should("exist") diff --git a/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx index 2c8b6b4d4f..a36a69ef14 100644 --- a/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx +++ b/agenta-web/src/components/HumanEvaluations/AbTestingEvaluation.tsx @@ -414,7 +414,10 @@ const AbTestingEvaluation = ({viewType}: {viewType: "evaluation" | "overview"})
A/B Testing Evaluations - diff --git a/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx index b2a2ec26b5..590849affa 100644 --- a/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx +++ b/agenta-web/src/components/HumanEvaluations/SingleModelEvaluation.tsx @@ -311,7 +311,10 @@ const SingleModelEvaluation = ({viewType}: {viewType: "evaluation" | "overview"} Single Model Evaluations - diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 58386d3faa..5a3feabd10 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -247,19 +247,22 @@ const AutoEvaluation = () => { title: evaluator.name, key: `results-${idx}`, onHeaderCell: () => ({style: {minWidth: 240}}), - showSorterTooltip: false, + sortDirections: ["descend", "ascend"], sorter: { compare: (a, b) => { - const getSortValue = (item: _Evaluation) => { - if (item.aggregated_results && item.aggregated_results.length > 0) { - const result = item.aggregated_results[0].result - if (result && typeof result.value === "number") { - return result.value - } + const getSortValue = (item: _Evaluation, evaluatorId: string) => { + const matchingResult = item.aggregated_results.find( + (result) => result.evaluator_config.id === evaluatorId, + ) + + if (matchingResult && typeof matchingResult.result.value === "number") { + return matchingResult.result.value } + return 0 } - return getSortValue(a) - getSortValue(b) + + return getSortValue(a, evaluator.id) - getSortValue(b, evaluator.id) }, }, render: (_, record) => { @@ -269,6 +272,10 @@ const AutoEvaluation = () => { (result) => result.evaluator_config.id === evaluator.id, ) + if (matchingResults.length === 0) { + return - + } + return ( {matchingResults.map((result, index) => @@ -276,7 +283,7 @@ const AutoEvaluation = () => { @@ -522,6 +529,7 @@ const AutoEvaluation = () => { icon={} className={classes.button} onClick={() => setNewEvalModalOpen(true)} + data-cy="new-evaluation-button" > Start new evaluation @@ -541,6 +549,7 @@ const AutoEvaluation = () => { className={classes.button} onClick={() => setIsDeleteEvalMultipleModalOpen(true)} disabled={selectedRowKeys.length == 0} + data-cy="delete-evaluation-button" > Delete @@ -549,6 +558,7 @@ const AutoEvaluation = () => { icon={} className={classes.button} disabled={compareDisabled} + data-cy="evaluation-results-compare-button" onClick={() => router.push( `/apps/${appId}/evaluations/results/compare?evaluations=${selectedRowKeys.join(",")}`, @@ -588,7 +598,7 @@ const AutoEvaluation = () => { pagination={false} onRow={(record) => ({ style: {cursor: "pointer"}, - onClick: () => {}, + onClick: () => router.push(`/apps/${appId}/evaluations/results/${record.id}`), })} /> diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx index 45d9e51435..b8f1434dd2 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx @@ -35,6 +35,7 @@ export function getFilterParams( confirm({closeDropdown: false}) }} style={{display: "block"}} + step={0.1} type={type} /> )} diff --git 
a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx
index acd74fc2fd..2eaa0dfe62 100644
--- a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx
+++ b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx
@@ -430,7 +430,7 @@ const AutomaticEvalOverview = () => {
Automatic Evaluations
-
diff --git a/agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx b/agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx
index 107498527d..72b67f4f43 100644
--- a/agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx
+++ b/agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx
@@ -43,7 +43,7 @@ const StatusRenderer = (record: _Evaluation) => {
     const errorStacktrace = record.status.error?.stacktrace
     return (
-
+
{label} {errorMsg && ( From 827680e0c7b5a1a6facab1c60d5ee2cde548046f Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Thu, 5 Sep 2024 12:20:19 +0600 Subject: [PATCH 087/149] enhance(frontend): improved structure --- .../pages/evaluations/autoEvaluation/AutoEvaluation.tsx | 6 +++--- .../autoEvaluation/{ => Filters}/EditColumns.tsx | 0 .../autoEvaluation/{ => Filters}/SearchFilter.tsx | 0 .../cellRenderers}/StatusRenderer.tsx | 2 +- .../overview/automaticEvaluation/AutomaticEvalOverview.tsx | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename agenta-web/src/components/pages/evaluations/autoEvaluation/{ => Filters}/EditColumns.tsx (100%) rename agenta-web/src/components/pages/evaluations/autoEvaluation/{ => Filters}/SearchFilter.tsx (100%) rename agenta-web/src/components/pages/{overview/automaticEvaluation => evaluations/cellRenderers}/StatusRenderer.tsx (95%) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 5a3feabd10..a60462ba5b 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -32,12 +32,12 @@ import {useAtom} from "jotai" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" import DeleteEvaluationModal from "@/components/DeleteEvaluationModal/DeleteEvaluationModal" import {useRouter} from "next/router" -import EditColumns, {generateEditItems} from "./EditColumns" -import StatusRenderer from "../../overview/automaticEvaluation/StatusRenderer" +import EditColumns, {generateEditItems} from "./Filters/EditColumns" +import StatusRenderer from "../cellRenderers/StatusRenderer" import {runningStatuses} from "../../evaluations/cellRenderers/cellRenderers" import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" -import {getFilterParams} from "./SearchFilter" +import {getFilterParams} from "./Filters/SearchFilter" import {uniqBy} from "lodash" import NewEvaluatorModal from "../evaluators/NewEvaluatorModal" diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EditColumns.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/EditColumns.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/EditColumns.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/EditColumns.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx similarity index 100% rename from agenta-web/src/components/pages/evaluations/autoEvaluation/SearchFilter.tsx rename to agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx diff --git a/agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx b/agenta-web/src/components/pages/evaluations/cellRenderers/StatusRenderer.tsx similarity index 95% rename from agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx rename to agenta-web/src/components/pages/evaluations/cellRenderers/StatusRenderer.tsx index 72b67f4f43..dc7fbbf272 100644 --- a/agenta-web/src/components/pages/overview/automaticEvaluation/StatusRenderer.tsx +++ b/agenta-web/src/components/pages/evaluations/cellRenderers/StatusRenderer.tsx @@ -4,7 +4,7 @@ import {InfoCircleOutlined} from 
"@ant-design/icons" import {theme, Tooltip, Typography} from "antd" import React from "react" import {createUseStyles} from "react-jss" -import {runningStatuses, statusMapper} from "../../evaluations/cellRenderers/cellRenderers" +import {runningStatuses, statusMapper} from "./cellRenderers" const useStyles = createUseStyles((theme: JSSTheme) => ({ statusCell: { diff --git a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx index 2eaa0dfe62..c6ae12388d 100644 --- a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx +++ b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx @@ -22,7 +22,7 @@ import {ColumnsType} from "antd/es/table" import {useRouter} from "next/router" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" -import StatusRenderer from "./StatusRenderer" +import StatusRenderer from "../../evaluations/cellRenderers/StatusRenderer" import NewEvaluationModal from "../../evaluations/NewEvaluation/NewEvaluationModal" import {useAtom} from "jotai" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" From d8bca048ffe2145ef709e026e87248a27f8f69a8 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Thu, 5 Sep 2024 12:30:22 +0600 Subject: [PATCH 088/149] fix(frontend): prettier format --- agenta-web/cypress/e2e/eval.comparison.cy.ts | 2 +- agenta-web/cypress/e2e/eval.evaluations.cy.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-web/cypress/e2e/eval.comparison.cy.ts b/agenta-web/cypress/e2e/eval.comparison.cy.ts index a5e1e33fe4..a8203ae672 100644 --- a/agenta-web/cypress/e2e/eval.comparison.cy.ts +++ b/agenta-web/cypress/e2e/eval.comparison.cy.ts @@ -80,7 +80,7 @@ describe("Evaluation Comparison Test", function () { }) it("Should select 2 evaluations, click on the compare button, and successfully navigate to the comparison page", () => { - cy.get(".ant-checkbox-input").eq(0).check() + cy.get(".ant-checkbox-input").eq(0).check() cy.get('[data-cy="evaluation-results-compare-button"]').should("not.be.disabled") cy.get('[data-cy="evaluation-results-compare-button"]').click() diff --git a/agenta-web/cypress/e2e/eval.evaluations.cy.ts b/agenta-web/cypress/e2e/eval.evaluations.cy.ts index 52ad572f98..633dd8b84e 100644 --- a/agenta-web/cypress/e2e/eval.evaluations.cy.ts +++ b/agenta-web/cypress/e2e/eval.evaluations.cy.ts @@ -36,7 +36,7 @@ describe("Evaluations CRUD Operations Test", function () { cy.get('[data-cy="delete-evaluation-button"]').click() cy.get(".ant-modal-content").should("exist") - cy.get(".ant-modal-footer > .ant-btn-primary").click() + cy.get(".ant-modal-footer > .ant-btn-primary").click() }) }) From baf3e683897edc4d33cedd8baf3e7aafccbdc7da Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Thu, 5 Sep 2024 12:40:43 +0600 Subject: [PATCH 089/149] fix(frontend): lint error --- .../evaluations/autoEvaluation/Filters/SearchFilter.tsx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx index b8f1434dd2..d4211bd887 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx @@ -1,7 +1,6 
@@
 import {_Evaluation} from "@/lib/Types"
 import {Input, InputRef, TableColumnType, DatePicker} from "antd"
 import {FilterDropdownProps} from "antd/es/table/interface"
-import React, {useRef} from "react"
 import dayjs from "dayjs"

 type DataIndex = keyof _Evaluation
@@ -12,8 +11,6 @@ export function getFilterParams(
     dataIndex: DataIndex,
     type: CellDataType,
 ): TableColumnType<_Evaluation> {
-    const searchInput = useRef<InputRef>(null)
-
     const filterDropdown = ({setSelectedKeys, selectedKeys, confirm}: FilterDropdownProps) => {
         return (
e.stopPropagation()}> @@ -27,7 +24,6 @@ export function getFilterParams( /> ) : ( { @@ -70,10 +66,5 @@ export function getFilterParams( return { filterDropdown, onFilter, - onFilterDropdownOpenChange: (visible) => { - if (visible) { - setTimeout(() => searchInput.current?.select(), 100) - } - }, } } From 2410b3f9ce1ff163d9a9aad774e6799233097585 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 08:04:54 +0100 Subject: [PATCH 090/149] feat(frontend): implemented run evaluator logic and updated EvaluatorRunExecution Types --- .../ConfigureEvaluator/index.tsx | 61 ++++++++++++++----- agenta-web/src/lib/Types.ts | 11 ++++ .../src/services/evaluations/api/index.ts | 9 ++- 3 files changed, 65 insertions(+), 16 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 374bd0754e..f7249d40f1 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -1,4 +1,4 @@ -import {Evaluator, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" +import {BaseResponse, Evaluator, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" import {CloseCircleOutlined, CloseOutlined} from "@ant-design/icons" import { ArrowLeft, @@ -31,12 +31,13 @@ import { CreateEvaluationConfigData, createEvaluatorConfig, createEvaluatorDataMapping, + createEvaluatorRunExecution, updateEvaluatorConfig, } from "@/services/evaluations/api" import {useAppId} from "@/hooks/useAppId" import {useLocalStorage} from "usehooks-ts" import {getAllVariantParameters} from "@/lib/helpers/variantHelper" -import {getStringOrJson, removeKeys} from "@/lib/helpers/utils" +import {apiKeyObject, getStringOrJson, removeKeys} from "@/lib/helpers/utils" import {callVariant} from "@/services/api" import {Editor} from "@monaco-editor/react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" @@ -125,16 +126,37 @@ const ConfigureEvaluator = ({ }>({ testcase: null, }) + const [baseResponseData, setBaseResponseData] = useState(null) + const [outputResult, setOutputResult] = useState("") + const [isLoadingResult, setIsLoadingResult] = useState(false) const fetchEvalMapper = async () => { + if (!baseResponseData) return + try { - const response = await createEvaluatorDataMapping({ - inputs: {trace: traceTree}, - mapping: form.getFieldValue("settings_values"), + setIsLoadingResult(true) + const mapResponse = await createEvaluatorDataMapping({ + inputs: baseResponseData, + mapping: { + ...form.getFieldValue("settings_values"), + }, + }) + + const runResponse = await createEvaluatorRunExecution(selectedEvaluator.key, { + inputs: {...mapResponse.outputs}, + settings: { + ...form.getFieldValue("settings_values"), + }, + ...(selectedEvaluator.requires_llm_api_keys || + form.getFieldValue("settings_values")?.requires_llm_api_keys + ? 
{credentials: apiKeyObject()} + : {}), }) - console.log(response) + setOutputResult(getStringOrJson(runResponse)) } catch (error) { console.error(error) + } finally { + setIsLoadingResult(false) } } @@ -218,9 +240,12 @@ const ConfigureEvaluator = ({ if (typeof result === "string") { setVariantResult(getStringOrJson({data: result})) + setTraceTree({...{data: result}, testcase: selectedTestcase}) } else if (isFuncResponse(result)) { setVariantResult(getStringOrJson(result)) + setTraceTree({...{data: result}, testcase: selectedTestcase}) } else if (isBaseResponse(result)) { + setBaseResponseData(result) const {trace, data} = result setVariantResult( getStringOrJson({ @@ -499,7 +524,7 @@ const ConfigureEvaluator = ({ language="json" theme={`vs-${appTheme}`} value={variantResult} - options={{wordWrap: "on"}} + options={{wordWrap: "on", readOnly: true}} />
@@ -508,13 +533,20 @@ const ConfigureEvaluator = ({ Output - + +
diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index 7b12d64804..f33b285edd 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -356,6 +356,7 @@ export interface Evaluator { direct_use?: boolean description: string oss?: boolean + requires_llm_api_keys?: boolean } export interface EvaluatorMappingInput { @@ -367,6 +368,16 @@ export interface EvaluatorMappingOutput { outputs: Record } +export interface EvaluatorInputInterface { + inputs: Record + settings?: Record + credentials?: Record +} + +export interface EvaluatorOutputInterface { + outputs: Record +} + export interface EvaluatorConfig { id: string evaluator_key: string diff --git a/agenta-web/src/services/evaluations/api/index.ts b/agenta-web/src/services/evaluations/api/index.ts index e45bd4cc23..b04c9d8432 100644 --- a/agenta-web/src/services/evaluations/api/index.ts +++ b/agenta-web/src/services/evaluations/api/index.ts @@ -3,8 +3,10 @@ import { ComparisonResultRow, Evaluator, EvaluatorConfig, + EvaluatorInputInterface, EvaluatorMappingInput, EvaluatorMappingOutput, + EvaluatorOutputInterface, KeyValuePair, LLMRunRateLimit, TestSet, @@ -71,8 +73,11 @@ export const createEvaluatorDataMapping = async ( return response.data } -export const createEvaluatorRunExecution = async (evaluatorKey: string) => { - const response = await axios.post(`/api/evaluators/${evaluatorKey}/run/`) +export const createEvaluatorRunExecution = async ( + evaluatorKey: string, + config: EvaluatorInputInterface, +): Promise => { + const response = await axios.post(`/api/evaluators/${evaluatorKey}/run/`, {...config}) return response.data } From a1b048d291e139c99d0e08efe1437bfa17e7ff26 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 11:32:48 +0100 Subject: [PATCH 091/149] fix(backend): Updated Evaluator model to requires_llm_api_keys field --- agenta-backend/agenta_backend/models/api/evaluation_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 363706702e..96f9ddef97 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -14,6 +14,7 @@ class Evaluator(BaseModel): settings_template: dict description: Optional[str] = None oss: Optional[bool] = False + requires_llm_api_keys: Optional[bool] = False class EvaluatorConfig(BaseModel): From 910999d09df44351b27d661b6bde1b195a009508 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 12:37:25 +0100 Subject: [PATCH 092/149] refactor(frontend): improve handling of testcase mapping and evaluator values --- .../ConfigureEvaluator/index.tsx | 63 ++++++++++--------- agenta-web/src/lib/helpers/evaluate.ts | 20 ++++++ 2 files changed, 55 insertions(+), 28 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index f7249d40f1..687ad7b10f 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -44,6 +44,7 @@ import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" import 
{formatCurrency, formatLatency} from "@/lib/helpers/formatters" import {fromBaseResponseToTraceSpanType, transformTraceTreeToJson} from "@/lib/transformers" +import {mapTestcaseAndEvalValues} from "@/lib/helpers/evaluate" type ConfigureEvaluatorProps = { setCurrent: React.Dispatch> @@ -124,29 +125,40 @@ const ConfigureEvaluator = ({ const [traceTree, setTraceTree] = useState<{ testcase: Record | null }>({ - testcase: null, + testcase: selectedTestcase, }) const [baseResponseData, setBaseResponseData] = useState(null) const [outputResult, setOutputResult] = useState("") const [isLoadingResult, setIsLoadingResult] = useState(false) const fetchEvalMapper = async () => { - if (!baseResponseData) return + if (!baseResponseData || !selectedTestcase) return try { setIsLoadingResult(true) - const mapResponse = await createEvaluatorDataMapping({ - inputs: baseResponseData, - mapping: { - ...form.getFieldValue("settings_values"), - }, - }) + + const {testcaseObj, evalMapObj, testcaseMappingKeys} = mapTestcaseAndEvalValues( + form.getFieldValue("settings_values"), + selectedTestcase, + ) + + let outputs = {} + + if (!!Object.keys(evalMapObj).length) { + const mapResponse = await createEvaluatorDataMapping({ + inputs: baseResponseData, + mapping: evalMapObj, + }) + outputs = {...outputs, ...mapResponse.outputs} + } + + if (!!Object.keys(testcaseObj).length) { + outputs = {...outputs, ...testcaseObj} + } const runResponse = await createEvaluatorRunExecution(selectedEvaluator.key, { - inputs: {...mapResponse.outputs}, - settings: { - ...form.getFieldValue("settings_values"), - }, + inputs: outputs, + settings: {...form.getFieldValue("settings_values"), ...testcaseMappingKeys}, ...(selectedEvaluator.requires_llm_api_keys || form.getFieldValue("settings_values")?.requires_llm_api_keys ? {credentials: apiKeyObject()} @@ -240,10 +252,10 @@ const ConfigureEvaluator = ({ if (typeof result === "string") { setVariantResult(getStringOrJson({data: result})) - setTraceTree({...{data: result}, testcase: selectedTestcase}) + setTraceTree({...{data: result}, ...traceTree}) } else if (isFuncResponse(result)) { setVariantResult(getStringOrJson(result)) - setTraceTree({...{data: result}, testcase: selectedTestcase}) + setTraceTree({...{data: result}, ...traceTree}) } else if (isBaseResponse(result)) { setBaseResponseData(result) const {trace, data} = result @@ -260,8 +272,7 @@ const ConfigureEvaluator = ({ ...transformTraceTreeToJson( fromBaseResponseToTraceSpanType(trace.spans, trace.trace_id)[0], ), - - testcase: selectedTestcase, + ...traceTree, }) } } else { @@ -386,18 +397,14 @@ const ConfigureEvaluator = ({ Parameters - {basicSettingsFields.map((field) => { - const {testcase, ...tree} = traceTree - - return ( - - ) - })} + {basicSettingsFields.map((field) => ( + + ))}
) : ( "" diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index 2a1af7ee59..e0804eb513 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -352,3 +352,23 @@ const getCustomComparator = (type: CellDataType) => (valueA: string, valueB: str export const removeCorrectAnswerPrefix = (str: string) => { return str.replace(/^correctAnswer_/, "") } + +export const mapTestcaseAndEvalValues = ( + settingsValues: Record, + selectedTestcase: Record, +) => { + let testcaseObj: Record = {} + let evalMapObj: Record = {} + let testcaseMappingKeys: Record = {} + + Object.entries(settingsValues).forEach(([key, value]) => { + if (typeof value === "string" && value.startsWith("testcase.")) { + testcaseObj[key] = selectedTestcase[value.split(".")[1]] + testcaseMappingKeys[key] = value.split(".")[1] + } else { + evalMapObj[key] = value + } + }) + + return {testcaseObj, evalMapObj, testcaseMappingKeys} +} From aecbc5b25a373b1db77ba461e7a5ee5661cc5744 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 16:08:51 +0100 Subject: [PATCH 093/149] fix(backend): fixed rag evaluator inputs and bug in exact match evaluator --- .../services/evaluators_service.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index d316db702d..96247e33a3 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -130,6 +130,8 @@ def get_correct_answer( correct_answer_key = settings_values.get("correct_answer_key") if correct_answer_key is None: raise ValueError("No correct answer keys provided.") + if len(correct_answer_key.split(".")) > 1: + correct_answer_key = correct_answer_key.split(".")[-1] if correct_answer_key not in data_point: raise ValueError( f"Correct answer column '{correct_answer_key}' not found in the test set." 
@@ -164,7 +166,9 @@ async def auto_exact_match( output = validate_string_output("exact_match", output) correct_answer = get_correct_answer(data_point, settings_values) inputs = {"ground_truth": correct_answer, "prediction": output} - response = exact_match(input=EvaluatorInputInterface(**{"inputs": inputs})) + response = await exact_match( + input=EvaluatorInputInterface(**{"inputs": inputs}) + ) result = Result(type="bool", value=response["outputs"]["success"]) return result except ValueError as e: @@ -186,7 +190,7 @@ async def auto_exact_match( ) -def exact_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: +async def exact_match(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: prediction = input.inputs.get("prediction", "") ground_truth = input.inputs.get("ground_truth", "") success = True if prediction == ground_truth else False @@ -862,9 +866,9 @@ async def measure_rag_consistency( # Initialize RAG evaluator to calculate faithfulness score faithfulness = Faithfulness(api_key=openai_api_key) eval_score = await faithfulness._run_eval_async( - output=input.inputs["answer"], - input=input.inputs["question"], - context=input.inputs["context"], + output=input.inputs["answer_key"], + input=input.inputs["question_key"], + context=input.inputs["contexts_key"], ) return {"outputs": {"score": eval_score.score}} @@ -929,9 +933,9 @@ async def rag_faithfulness( input=EvaluatorInputInterface( **{ "inputs": { - "question": question_val, - "context": contexts_val, - "answer": answer_val, + "question_key": question_val, + "contexts_key": contexts_val, + "answer_key": answer_val, }, "settings": settings_values, "credentials": lm_providers_keys, @@ -963,9 +967,9 @@ async def measure_context_coherence( # Initialize RAG evaluator to calculate context relevancy score context_rel = ContextRelevancy(api_key=openai_api_key) eval_score = await context_rel._run_eval_async( - output=input.inputs["answer"], - input=input.inputs["question"], - context=input.inputs["context"], + output=input.inputs["answer_key"], + input=input.inputs["question_key"], + context=input.inputs["contexts_key"], ) return {"outputs": {"score": eval_score.score}} @@ -1030,9 +1034,9 @@ async def rag_context_relevancy( input=EvaluatorInputInterface( **{ "inputs": { - "question": question_val, - "context": contexts_val, - "answer": answer_val, + "question_key": question_val, + "contexts_key": contexts_val, + "answer_key": answer_val, }, "settings": settings_values, "credentials": lm_providers_keys, @@ -1167,6 +1171,9 @@ async def similarity_match(input: EvaluatorInputInterface) -> EvaluatorOutputInt set2 = set(input.inputs["ground_truth"].split()) intersect = set1.intersection(set2) union = set1.union(set2) + print(set1) + print(set2) + print(union) similarity = len(intersect) / len(union) is_similar = True if similarity > input.settings["similarity_threshold"] else False From 7e0c469af197be8089d47c21a7ac544039b31df3 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 16:11:58 +0100 Subject: [PATCH 094/149] refactor(frontend): clean up and optimize fetchEvalMapper logic --- .../ConfigureEvaluator/index.tsx | 31 +++++++++++++------ agenta-web/src/lib/helpers/evaluate.ts | 4 +-- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 687ad7b10f..b5c0186d73 100644 --- 
a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -137,14 +137,14 @@ const ConfigureEvaluator = ({ try { setIsLoadingResult(true) - const {testcaseObj, evalMapObj, testcaseMappingKeys} = mapTestcaseAndEvalValues( - form.getFieldValue("settings_values"), + const settingsValues = form.getFieldValue("settings_values") + const {testcaseObj, evalMapObj} = mapTestcaseAndEvalValues( + settingsValues, selectedTestcase, ) - let outputs = {} - if (!!Object.keys(evalMapObj).length) { + if (Object.keys(evalMapObj).length && selectedEvaluator.key.startsWith("rag_")) { const mapResponse = await createEvaluatorDataMapping({ inputs: baseResponseData, mapping: evalMapObj, @@ -152,15 +152,26 @@ const ConfigureEvaluator = ({ outputs = {...outputs, ...mapResponse.outputs} } - if (!!Object.keys(testcaseObj).length) { + if (Object.keys(testcaseObj).length) { outputs = {...outputs, ...testcaseObj} } + if (!selectedEvaluator.key.startsWith("rag_")) { + const correctAnswerKey = settingsValues.correct_answer_key + const groundTruthKey = correctAnswerKey.startsWith("testcase.") + ? correctAnswerKey.split(".")[1] + : correctAnswerKey + + outputs = { + ground_truth: selectedTestcase[groundTruthKey], + prediction: JSON.parse(variantResult)?.message, + } + } + const runResponse = await createEvaluatorRunExecution(selectedEvaluator.key, { inputs: outputs, - settings: {...form.getFieldValue("settings_values"), ...testcaseMappingKeys}, - ...(selectedEvaluator.requires_llm_api_keys || - form.getFieldValue("settings_values")?.requires_llm_api_keys + settings: settingsValues, + ...(selectedEvaluator.requires_llm_api_keys || settingsValues?.requires_llm_api_keys ? {credentials: apiKeyObject()} : {}), }) @@ -251,7 +262,9 @@ const ConfigureEvaluator = ({ ) if (typeof result === "string") { - setVariantResult(getStringOrJson({data: result})) + setVariantResult( + getStringOrJson({...(typeof result === "string" ? 
{message: result} : result)}), + ) setTraceTree({...{data: result}, ...traceTree}) } else if (isFuncResponse(result)) { setVariantResult(getStringOrJson(result)) diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index e0804eb513..042c93ae75 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -359,16 +359,14 @@ export const mapTestcaseAndEvalValues = ( ) => { let testcaseObj: Record = {} let evalMapObj: Record = {} - let testcaseMappingKeys: Record = {} Object.entries(settingsValues).forEach(([key, value]) => { if (typeof value === "string" && value.startsWith("testcase.")) { testcaseObj[key] = selectedTestcase[value.split(".")[1]] - testcaseMappingKeys[key] = value.split(".")[1] } else { evalMapObj[key] = value } }) - return {testcaseObj, evalMapObj, testcaseMappingKeys} + return {testcaseObj, evalMapObj} } From 751245dada58a38afac2221570dadc573f1250d5 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 17:19:30 +0100 Subject: [PATCH 095/149] fix(frontend): bug fixes --- .../EvaluatorsModal/ConfigureEvaluator/index.tsx | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index b5c0186d73..6bf6369252 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -137,7 +137,7 @@ const ConfigureEvaluator = ({ try { setIsLoadingResult(true) - const settingsValues = form.getFieldValue("settings_values") + const settingsValues = form.getFieldValue("settings_values") || {} const {testcaseObj, evalMapObj} = mapTestcaseAndEvalValues( settingsValues, selectedTestcase, @@ -158,13 +158,19 @@ const ConfigureEvaluator = ({ if (!selectedEvaluator.key.startsWith("rag_")) { const correctAnswerKey = settingsValues.correct_answer_key - const groundTruthKey = correctAnswerKey.startsWith("testcase.") - ? correctAnswerKey.split(".")[1] - : correctAnswerKey + const groundTruthKey = + typeof correctAnswerKey === "string" && correctAnswerKey.startsWith("testcase.") + ? correctAnswerKey.split(".")[1] + : correctAnswerKey outputs = { ground_truth: selectedTestcase[groundTruthKey], - prediction: JSON.parse(variantResult)?.message, + prediction: + selectedEvaluator.key.includes("json") || + selectedEvaluator.key.includes("field_match_test") + ? JSON.stringify({message: JSON.parse(variantResult)?.message}) + : JSON.parse(variantResult)?.message, + ...(selectedEvaluator.key === "auto_custom_code_run" ? 
{app_config: {}} : {}), } } From c43e6c1c97e6842b0b24f2164ac32c2fc4aa3c60 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Thu, 5 Sep 2024 22:29:37 +0100 Subject: [PATCH 096/149] fix(frontend): added EvaluationErrorPopover component and improve evaluation result output display --- .../EvaluationErrorPopover.tsx | 43 +++++++++++ .../autoEvaluation/AutoEvaluation.tsx | 75 ++----------------- .../AutomaticEvalOverview.tsx | 30 +------- 3 files changed, 53 insertions(+), 95 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/EvaluationErrorProps/EvaluationErrorPopover.tsx diff --git a/agenta-web/src/components/pages/evaluations/EvaluationErrorProps/EvaluationErrorPopover.tsx b/agenta-web/src/components/pages/evaluations/EvaluationErrorProps/EvaluationErrorPopover.tsx new file mode 100644 index 0000000000..5261232393 --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/EvaluationErrorProps/EvaluationErrorPopover.tsx @@ -0,0 +1,43 @@ +import {EvaluationError, JSSTheme, TypedValue} from "@/lib/Types" +import {InfoCircleOutlined} from "@ant-design/icons" +import {Button, Popover, Typography} from "antd" +import React from "react" +import {createUseStyles} from "react-jss" + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + errModalStackTrace: { + maxWidth: 300, + "& code": { + display: "block", + width: "100%", + }, + }, +})) + +const EvaluationErrorPopover = (result: { + result: TypedValue & { + error: null | EvaluationError + } +}) => { + const classes = useStyles() + + return ( + + {result.result.error?.stacktrace} + + } + title={result.result.error?.message} + > + + + ) +} + +export default EvaluationErrorPopover diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index a60462ba5b..94ba4fc83c 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -9,11 +9,11 @@ import { Rocket, Trash, } from "@phosphor-icons/react" -import {Button, Dropdown, DropdownProps, message, Popover, Space, Table, Tag} from "antd" +import {Button, Dropdown, DropdownProps, message, Space, Table, Typography} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import {ColumnsType} from "antd/es/table" -import {EditOutlined, InfoCircleOutlined, MoreOutlined} from "@ant-design/icons" +import {MoreOutlined} from "@ant-design/icons" import EvaluatorsModal from "./EvaluatorsModal/EvaluatorsModal" import {useQueryParam} from "@/hooks/useQuery" import {formatDay} from "@/lib/helpers/dateTimeHelper" @@ -40,6 +40,7 @@ import {shortPoll} from "@/lib/helpers/utils" import {getFilterParams} from "./Filters/SearchFilter" import {uniqBy} from "lodash" import NewEvaluatorModal from "../evaluators/NewEvaluatorModal" +import EvaluationErrorPopover from "../EvaluationErrorProps/EvaluationErrorPopover" const useStyles = createUseStyles((theme: JSSTheme) => ({ resultTag: { @@ -280,73 +281,11 @@ const AutoEvaluation = () => { {matchingResults.map((result, index) => result.result.error ? ( - - {result.result.error?.stacktrace} -
- } - title={result.result.error?.message} - > - - + ) : ( - e.stopPropagation()} - > -
- {result.evaluator_config.name} -
-
{getTypedValue(result.result)}
- - } - title={ -
e.stopPropagation()} - > - - {evaluator?.name} - -
- } - > -
e.stopPropagation()} - className={classes.resultTag} - > -
{result.evaluator_config.name}
-
{getTypedValue(result.result)}
-
-
+ + {getTypedValue(result.result)} + ), )} diff --git a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx index c6ae12388d..04be6b8fd0 100644 --- a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx +++ b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx @@ -9,13 +9,7 @@ import { fetchAllEvaluators, fetchEvaluationStatus, } from "@/services/evaluations/api" -import { - EditOutlined, - InfoCircleOutlined, - MoreOutlined, - PlusOutlined, - SwapOutlined, -} from "@ant-design/icons" +import {EditOutlined, MoreOutlined, PlusOutlined, SwapOutlined} from "@ant-design/icons" import {Database, GearSix, Note, Rocket, Trash} from "@phosphor-icons/react" import {Button, Dropdown, message, Popover, Space, Spin, Table, Tag, Typography} from "antd" import {ColumnsType} from "antd/es/table" @@ -31,6 +25,7 @@ import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" import NewEvaluatorModal from "../../evaluations/evaluators/NewEvaluatorModal" import DeleteEvaluationModal from "@/components/DeleteEvaluationModal/DeleteEvaluationModal" +import EvaluationErrorPopover from "../../evaluations/EvaluationErrorProps/EvaluationErrorPopover" const {Title} = Typography @@ -250,26 +245,7 @@ const AutomaticEvalOverview = () => { ) return result.result.error ? ( - - {result.result.error?.stacktrace} - - } - title={result.result.error?.message} - > - - + ) : ( Date: Fri, 6 Sep 2024 09:54:31 +0100 Subject: [PATCH 097/149] fix(frontend): run variant with chat template --- .../autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 6bf6369252..8496d7602f 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -262,7 +262,7 @@ const ConfigureEvaluator = ({ optParams || [], appId, selectedVariant.baseId, - isChatVariant ? selectedTestcase.chat || [{}] : [], + isChatVariant ? 
JSON.parse(selectedTestcase.chat) || [{}] : [], controller.signal, true, ) From 9cf40f7dfdc5d97b74643ed94acd47500891661a Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Fri, 6 Sep 2024 17:02:27 +0600 Subject: [PATCH 098/149] enhance(frontend): edit columns and short columns --- .../autoEvaluation/AutoEvaluation.tsx | 38 +++++++++++++++---- .../autoEvaluation/Filters/EditColumns.tsx | 37 +++++++++++++----- .../autoEvaluation/Filters/SearchFilter.tsx | 11 +++++- 3 files changed, 68 insertions(+), 18 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index 94ba4fc83c..d02da22610 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -9,7 +9,7 @@ import { Rocket, Trash, } from "@phosphor-icons/react" -import {Button, Dropdown, DropdownProps, message, Space, Table, Typography} from "antd" +import {Button, Dropdown, DropdownProps, message, Space, Table, Tag, Typography} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import {ColumnsType} from "antd/es/table" @@ -244,9 +244,18 @@ const AutoEvaluation = () => { title: "Results", key: "results", onHeaderCell: () => ({style: {minWidth: 240}}), - children: evaluatorConfigs.map((evaluator, idx) => ({ - title: evaluator.name, - key: `results-${idx}`, + children: evaluatorConfigs.map((evaluator) => ({ + title: () => { + return ( +
+ {evaluator.name} + + {evaluator.evaluator?.name} + +
+ ) + }, + key: evaluator.name, onHeaderCell: () => ({style: {minWidth: 240}}), sortDirections: ["descend", "ascend"], sorter: { @@ -312,6 +321,10 @@ const AutoEvaluation = () => { onHeaderCell: () => ({ style: {minWidth: 160}, }), + sorter: { + compare: (a, b) => + Number(a.average_latency?.value) - Number(b.average_latency?.value), + }, render: (_, record) => { return getTypedValue(record.average_latency) }, @@ -324,6 +337,9 @@ const AutoEvaluation = () => { onHeaderCell: () => ({ style: {minWidth: 160}, }), + sorter: { + compare: (a, b) => Number(a.average_cost?.value) - Number(b.average_cost?.value), + }, render: (_, record) => { return getTypedValue(record.average_cost) }, @@ -450,13 +466,21 @@ const AutoEvaluation = () => { }, [JSON.stringify(runningEvaluationIds)]) useEffect(() => { - const defaultColumnNames = columns.map((item) => item.key as string) - setEditColumns(defaultColumnNames) - }, []) + const defaultColumnNames = columns.flatMap((col) => + "children" in col ? [col.key, ...col.children.map((child) => child.key)] : [col.key], + ) + setEditColumns(defaultColumnNames as string[]) + }, [isEvalLoading]) const editedColumns = columns.map((item) => ({ ...item, hidden: !editColumns?.includes(item.key as string), + ...("children" in item && { + children: item.children.map((child) => ({ + ...child, + hidden: !editColumns.includes(child.key as string), + })), + }), })) return ( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/EditColumns.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/EditColumns.tsx index 979b519b02..2346402a7f 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/EditColumns.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/EditColumns.tsx @@ -30,15 +30,34 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ export const generateEditItems = (columns: ColumnsType, editColumns: string[]) => { return columns .filter((col) => col.key !== "key") - .map((col) => ({ - key: col.key, - label: ( - - - <>{col.title} - - ), - })) + .flatMap((col) => [ + { + key: col.key, + label: ( + + + {col.title as string} + + ), + }, + ...(("children" in col && + col.children?.map((child) => ({ + key: child.key, + label: ( + + + {child.key as string} + + ), + }))) || + []), + ]) } interface EditColumnsProps { diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx index d4211bd887..ddcc2db2fa 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/Filters/SearchFilter.tsx @@ -1,7 +1,8 @@ -import {_Evaluation} from "@/lib/Types" -import {Input, InputRef, TableColumnType, DatePicker} from "antd" +import {_Evaluation, EvaluationStatus} from "@/lib/Types" +import {Input, TableColumnType, DatePicker} from "antd" import {FilterDropdownProps} from "antd/es/table/interface" import dayjs from "dayjs" +import {statusMapper} from "@/components/pages/evaluations/cellRenderers/cellRenderers" type DataIndex = keyof _Evaluation @@ -46,6 +47,12 @@ export function getFilterParams( if (type === "date") { return dayjs(cellValue).isSame(dayjs(value), "day") } + if (dataIndex === "status") { + const statusLabel = statusMapper({} as any)(record.status.value as EvaluationStatus) + .label as EvaluationStatus + return 
statusLabel.toLowerCase().includes(value.toLowerCase()) + } + if (typeof cellValue === "object" && cellValue !== null) { if (Array.isArray(cellValue)) { return cellValue.some((item) => From 2ab3da370945d859b301179021ce01e8c2e0eee6 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Fri, 6 Sep 2024 21:26:30 +0600 Subject: [PATCH 099/149] fix(frontend): fixed status update issue --- .../autoEvaluation/AutoEvaluation.tsx | 110 +++++++++--------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index d02da22610..aadcdf37f7 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -92,6 +92,61 @@ const AutoEvaluation = () => { ) const stoppers = useRef() + const runningEvaluationIds = useMemo( + () => + evaluationList + .filter((item) => runningStatuses.includes(item.status.value)) + .map((item) => item.id), + [evaluationList], + ) + + useUpdateEffect(() => { + stoppers.current?.() + + if (runningEvaluationIds.length) { + stoppers.current = shortPoll( + () => + Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(id))) + .then((res) => { + setEvaluationList((prev) => { + const newEvals = [...prev] + runningEvaluationIds.forEach((id, ix) => { + const index = newEvals.findIndex((e) => e.id === id) + if (index !== -1) { + newEvals[index].status = res[ix].status + newEvals[index].duration = calcEvalDuration(newEvals[index]) + } + }) + if ( + res.some((item) => !runningStatuses.includes(item.status.value)) + ) + fetchEvaluations() + return newEvals + }) + }) + .catch(console.error), + {delayMs: 2000, timeoutMs: Infinity}, + ).stopper + } + + return () => { + stoppers.current?.() + } + }, [JSON.stringify(runningEvaluationIds)]) + + useEffect(() => { + if (!appId) return + + fetchEvaluations() + }, [appId]) + + useEffect(() => { + const defaultColumnNames = columns.flatMap((col) => + "children" in col ? 
[col.key, ...col.children.map((child) => child.key)] : [col.key], + ) + setEditColumns(defaultColumnNames as string[]) + }, [isEvalLoading]) + const fetchEvaluations = async () => { try { setIsEvalLoading(true) @@ -417,61 +472,6 @@ const AutoEvaluation = () => { }, ] - const runningEvaluationIds = useMemo( - () => - evaluationList - .filter((item) => runningStatuses.includes(item.status.value)) - .map((item) => item.id), - [evaluationList], - ) - - useEffect(() => { - if (!appId) return - - fetchEvaluations() - }, [appId]) - - useUpdateEffect(() => { - stoppers.current?.() - - if (runningEvaluationIds.length) { - stoppers.current = shortPoll( - () => - Promise.all(runningEvaluationIds.map((id) => fetchEvaluationStatus(id))) - .then((res) => { - setEvaluationList((prev) => { - const newEvals = [...prev] - runningEvaluationIds.forEach((id, ix) => { - const index = newEvals.findIndex((e) => e.id === id) - if (index !== -1) { - newEvals[index].status = res[ix].status - newEvals[index].duration = calcEvalDuration(newEvals[index]) - } - }) - if ( - res.some((item) => !runningStatuses.includes(item.status.value)) - ) - fetchEvaluations() - return newEvals - }) - }) - .catch(console.error), - {delayMs: 2000, timeoutMs: Infinity}, - ).stopper - } - - return () => { - stoppers.current?.() - } - }, [JSON.stringify(runningEvaluationIds)]) - - useEffect(() => { - const defaultColumnNames = columns.flatMap((col) => - "children" in col ? [col.key, ...col.children.map((child) => child.key)] : [col.key], - ) - setEditColumns(defaultColumnNames as string[]) - }, [isEvalLoading]) - const editedColumns = columns.map((item) => ({ ...item, hidden: !editColumns?.includes(item.key as string), From 8c3cadf845a744bb46095ca1e01913d90a4b66d7 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Fri, 6 Sep 2024 23:59:33 +0100 Subject: [PATCH 100/149] fix(frontend): update json variant result, json field title and updated trace tree --- .../ConfigureEvaluator/index.tsx | 40 ++++++++----------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 8496d7602f..9e856176a8 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -42,7 +42,6 @@ import {callVariant} from "@/services/api" import {Editor} from "@monaco-editor/react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" -import {formatCurrency, formatLatency} from "@/lib/helpers/formatters" import {fromBaseResponseToTraceSpanType, transformTraceTreeToJson} from "@/lib/transformers" import {mapTestcaseAndEvalValues} from "@/lib/helpers/evaluate" @@ -124,8 +123,10 @@ const ConfigureEvaluator = ({ const [variantResult, setVariantResult] = useState("") const [traceTree, setTraceTree] = useState<{ testcase: Record | null + trace: Record | string | null }>({ testcase: selectedTestcase, + trace: null, }) const [baseResponseData, setBaseResponseData] = useState(null) const [outputResult, setOutputResult] = useState("") @@ -168,8 +169,8 @@ const ConfigureEvaluator = ({ prediction: selectedEvaluator.key.includes("json") || selectedEvaluator.key.includes("field_match_test") - ? 
JSON.stringify({message: JSON.parse(variantResult)?.message}) - : JSON.parse(variantResult)?.message, + ? JSON.stringify({message: variantResult}) + : variantResult, ...(selectedEvaluator.key === "auto_custom_code_run" ? {app_config: {}} : {}), } } @@ -181,7 +182,7 @@ const ConfigureEvaluator = ({ ? {credentials: apiKeyObject()} : {}), }) - setOutputResult(getStringOrJson(runResponse)) + setOutputResult(getStringOrJson(runResponse.outputs)) } catch (error) { console.error(error) } finally { @@ -268,30 +269,21 @@ const ConfigureEvaluator = ({ ) if (typeof result === "string") { - setVariantResult( - getStringOrJson({...(typeof result === "string" ? {message: result} : result)}), - ) - setTraceTree({...{data: result}, ...traceTree}) + setVariantResult(getStringOrJson(result)) + setTraceTree({...traceTree, trace: result}) } else if (isFuncResponse(result)) { setVariantResult(getStringOrJson(result)) - setTraceTree({...{data: result}, ...traceTree}) + setTraceTree({...traceTree, trace: result}) } else if (isBaseResponse(result)) { setBaseResponseData(result) const {trace, data} = result - setVariantResult( - getStringOrJson({ - ...(typeof data === "string" ? {message: data} : data), - cost: formatCurrency(trace?.cost), - usage: trace?.usage, - latency: formatLatency(trace?.latency), - }), - ) + setVariantResult(getStringOrJson(data)) if (trace?.spans) { setTraceTree({ - ...transformTraceTreeToJson( + ...traceTree, + trace: transformTraceTreeToJson( fromBaseResponseToTraceSpanType(trace.spans, trace.trace_id)[0], ), - ...traceTree, }) } } else { @@ -518,9 +510,9 @@ const ConfigureEvaluator = ({ -
+
- JSON + JSON Data
-
+
App Output @@ -554,10 +546,10 @@ const ConfigureEvaluator = ({ />
-
+
- Output + Evaluator Output Date: Sat, 7 Sep 2024 13:18:13 +0100 Subject: [PATCH 101/149] refactor(frontend): added hepler to transform trace settings to remove "trace." from payload and added helper text for json field title --- .../ConfigureEvaluator/index.tsx | 27 ++++++++++++------- agenta-web/src/lib/helpers/evaluate.ts | 21 +++++++++++++++ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 9e856176a8..b02e432f42 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -1,5 +1,5 @@ import {BaseResponse, Evaluator, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" -import {CloseCircleOutlined, CloseOutlined} from "@ant-design/icons" +import {CloseCircleOutlined, CloseOutlined, InfoCircleOutlined} from "@ant-design/icons" import { ArrowLeft, CaretDoubleLeft, @@ -43,7 +43,7 @@ import {Editor} from "@monaco-editor/react" import {useAppTheme} from "@/components/Layout/ThemeContextProvider" import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" import {fromBaseResponseToTraceSpanType, transformTraceTreeToJson} from "@/lib/transformers" -import {mapTestcaseAndEvalValues} from "@/lib/helpers/evaluate" +import {mapTestcaseAndEvalValues, transformTraceKeysInSettings} from "@/lib/helpers/evaluate" type ConfigureEvaluatorProps = { setCurrent: React.Dispatch> @@ -148,7 +148,7 @@ const ConfigureEvaluator = ({ if (Object.keys(evalMapObj).length && selectedEvaluator.key.startsWith("rag_")) { const mapResponse = await createEvaluatorDataMapping({ inputs: baseResponseData, - mapping: evalMapObj, + mapping: transformTraceKeysInSettings(evalMapObj), }) outputs = {...outputs, ...mapResponse.outputs} } @@ -177,7 +177,7 @@ const ConfigureEvaluator = ({ const runResponse = await createEvaluatorRunExecution(selectedEvaluator.key, { inputs: outputs, - settings: settingsValues, + settings: transformTraceKeysInSettings(settingsValues), ...(selectedEvaluator.requires_llm_api_keys || settingsValues?.requires_llm_api_keys ? {credentials: apiKeyObject()} : {}), @@ -510,10 +510,19 @@ const ConfigureEvaluator = ({ -
- - JSON Data - +
+ + + JSON Data + + + + +
-
+
App Output diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index 042c93ae75..a8be04caa9 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -370,3 +370,24 @@ export const mapTestcaseAndEvalValues = ( return {testcaseObj, evalMapObj} } + +export const transformTraceKeysInSettings = ( + settingsValues: Record, +): Record => { + return Object.keys(settingsValues).reduce( + (acc, curr) => { + if ( + !acc[curr] && + typeof settingsValues[curr] === "string" && + settingsValues[curr].startsWith("trace.") + ) { + acc[curr] = settingsValues[curr].replace("trace.", "") + } else { + acc[curr] = settingsValues[curr] + } + + return acc + }, + {} as Record, + ) +} From 10db7b7ae51ad1f104a43990161351448f002180 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 7 Sep 2024 14:00:54 +0100 Subject: [PATCH 102/149] fix(frontend): filter empty/falsy values from json data output --- agenta-web/src/lib/transformers.ts | 48 ++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/agenta-web/src/lib/transformers.ts b/agenta-web/src/lib/transformers.ts index 51f720e328..b260399b3f 100644 --- a/agenta-web/src/lib/transformers.ts +++ b/agenta-web/src/lib/transformers.ts @@ -215,19 +215,18 @@ export const transformTraceTreeToJson = (tree: TraceSpan[]) => { function addTree(item: TraceSpan) { if (item.name) { + const content = { + ...item.content, + ...(item.children ? transformTraceTreeToJson(item.children) : null), + } + if (!nodeMap[item.name]) { - nodeMap[item.name] = { - ...item.content, - ...(item.children ? transformTraceTreeToJson(item.children) : null), - } + nodeMap[item.name] = content } else { if (!Array.isArray(nodeMap[item.name])) { nodeMap[item.name] = [nodeMap[item.name]] } - nodeMap[item.name].push({ - ...item.content, - ...(item.children ? 
transformTraceTreeToJson(item.children) : null), - }) + nodeMap[item.name].push(content) } } } @@ -236,7 +235,38 @@ export const transformTraceTreeToJson = (tree: TraceSpan[]) => { addTree(item) }) - return nodeMap + const filterEmptyValues = (obj: Record): any => { + if (Array.isArray(obj)) { + return obj + .map(filterEmptyValues) + .filter( + (item) => + item !== null && + !(typeof item === "object" && Object.keys(item).length === 0), + ) + } else if (typeof obj === "object" && obj !== null) { + return Object.entries(obj).reduce( + (acc, [key, value]) => { + const filteredValue = filterEmptyValues(value) + if ( + filteredValue !== null && + !( + typeof filteredValue === "object" && + Object.keys(filteredValue).length === 0 + ) + ) { + acc[key] = filteredValue + } + return acc + }, + {} as Record, + ) + } else { + return obj + } + } + + return filterEmptyValues(nodeMap) } export const generatePaths = (obj: Record, currentPath = "") => { From 72e8be4db0bcc543f01fdad63c5ed4738588d91b Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 7 Sep 2024 22:04:45 +0100 Subject: [PATCH 103/149] feat(frontend): setup view configuration and deletion --- .../ConfigureEvaluator/index.tsx | 83 +++++++++++++------ .../Evaluators/DeleteModal.tsx | 71 ++++++++++++++++ .../Evaluators/EvaluatorCard.tsx | 44 +++++++++- .../EvaluatorsModal/Evaluators/index.tsx | 15 +++- .../EvaluatorsModal/EvaluatorsModal.tsx | 18 +++- 5 files changed, 197 insertions(+), 34 deletions(-) create mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index b02e432f42..03f8eaaabf 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -1,4 +1,12 @@ -import {BaseResponse, Evaluator, JSSTheme, Parameter, testset, Variant} from "@/lib/Types" +import { + BaseResponse, + Evaluator, + EvaluatorConfig, + JSSTheme, + Parameter, + testset, + Variant, +} from "@/lib/Types" import {CloseCircleOutlined, CloseOutlined, InfoCircleOutlined} from "@ant-design/icons" import { ArrowLeft, @@ -9,19 +17,7 @@ import { Lightning, Play, } from "@phosphor-icons/react" -import { - Button, - Divider, - Flex, - Form, - Input, - message, - Select, - Space, - Tag, - Tooltip, - Typography, -} from "antd" +import {Button, Divider, Flex, Form, Input, message, Select, Space, Tooltip, Typography} from "antd" import React, {useEffect, useMemo, useRef, useState} from "react" import {createUseStyles} from "react-jss" import AdvancedSettings from "./AdvancedSettings" @@ -53,9 +49,12 @@ type ConfigureEvaluatorProps = { variants: Variant[] | null testsets: testset[] | null selectedTestcase: Record | null - setSelectedTestcase: React.Dispatch | null>> setSelectedVariant: React.Dispatch> selectedVariant: Variant | null + editMode: boolean + editEvalEditValues: EvaluatorConfig | null + setEditEvalEditValues: React.Dispatch> + setEditMode: (value: React.SetStateAction) => void } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -104,9 +103,12 @@ const ConfigureEvaluator = ({ testsets, onSuccess, selectedTestcase, - setSelectedTestcase, selectedVariant, setSelectedVariant, + editMode, + editEvalEditValues, + 
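// A simplified, standalone sketch (not from the patches above) of the settings transform
// added in lib/helpers/evaluate.ts: string settings that reference the trace tree are
// stripped of their "trace." prefix before being sent to the backend mapper, while all
// other values pass through unchanged. The sample values are made up for illustration.
const stripTracePrefix = (settings: Record<string, any>): Record<string, any> =>
    Object.fromEntries(
        Object.entries(settings).map(([key, value]) => [
            key,
            typeof value === "string" && value.startsWith("trace.")
                ? value.replace("trace.", "")
                : value,
        ]),
    )

// stripTracePrefix({question_key: "trace.rag.inputs.question", threshold: 0.5})
// -> {question_key: "rag.inputs.question", threshold: 0.5}
// The companion change in lib/transformers.ts then prunes empty objects/arrays from the
// transformed trace tree so the JSON view only shows populated spans.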
setEditEvalEditValues, + setEditMode, }: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() @@ -216,8 +218,8 @@ const ConfigureEvaluator = ({ evaluator_key: selectedEvaluator.key, settings_values: settingsValues, } - ;(false - ? updateEvaluatorConfig("initialValues?.id"!, data) + ;(editMode + ? updateEvaluatorConfig(editEvalEditValues?.id!, data) : createEvaluatorConfig(appId, data) ) .then(onSuccess) @@ -300,18 +302,45 @@ const ConfigureEvaluator = ({ } } + useEffect(() => { + form.resetFields() + if (editMode) { + form.setFieldsValue(editEvalEditValues) + } + }, [editMode]) + return (
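// A reduced sketch (not part of the patch) of the save dispatch the submit handler above
// now performs: edit mode updates the selected configuration, otherwise a new one is
// created under the current app. The injected functions are hypothetical stand-ins for
// createEvaluatorConfig / updateEvaluatorConfig.
type EvaluatorConfigPayload = {
    name: string
    evaluator_key: string
    settings_values: Record<string, any>
}

const saveEvaluatorConfig = async (
    payload: EvaluatorConfigPayload,
    opts: {
        editMode: boolean
        appId: string
        existingId?: string
        create: (appId: string, p: EvaluatorConfigPayload) => Promise<unknown>
        update: (id: string, p: EvaluatorConfigPayload) => Promise<unknown>
    },
): Promise<unknown> =>
    opts.editMode && opts.existingId
        ? opts.update(opts.existingId, payload) // update the existing config in place
        : opts.create(opts.appId, payload) // create a fresh config for the app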
-
diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx new file mode 100644 index 0000000000..3c826ee70f --- /dev/null +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/DeleteModal.tsx @@ -0,0 +1,71 @@ +import {checkIfResourceValidForDeletion} from "@/lib/helpers/evaluate" +import {EvaluatorConfig, JSSTheme} from "@/lib/Types" +import {deleteEvaluatorConfig} from "@/services/evaluations/api" +import {ExclamationCircleOutlined} from "@ant-design/icons" +import {Modal, Space, theme, Typography} from "antd" +import React, {useState} from "react" +import {createUseStyles} from "react-jss" + +type DeleteModalProps = { + selectedEvalConfig: EvaluatorConfig + onSuccess: () => void +} & React.ComponentProps + +const useStyles = createUseStyles((theme: JSSTheme) => ({ + title: { + fontSize: theme.fontSizeLG, + fontWeight: theme.fontWeightStrong, + lineHeight: theme.lineHeightLG, + }, +})) + +const DeleteModal = ({selectedEvalConfig, onSuccess, ...props}: DeleteModalProps) => { + const classes = useStyles() + const { + token: {colorWarning}, + } = theme.useToken() + const [isLoading, setIsLoading] = useState(false) + + const handleDelete = async () => { + try { + if ( + !(await checkIfResourceValidForDeletion({ + resourceType: "evaluator_config", + resourceIds: [selectedEvalConfig.id], + })) + ) + return + try { + setIsLoading(true) + await deleteEvaluatorConfig(selectedEvalConfig.id) + await onSuccess() + props.onCancel?.({} as any) + } catch (error) { + console.error(error) + } + } catch (error) { + console.error(error) + } finally { + setIsLoading(false) + } + } + return ( + + + Delete evaluator + + } + centered + okText={"Delete"} + okButtonProps={{danger: true, loading: isLoading}} + onOk={handleDelete} + {...props} + > + Are you sure you want to delete this evaluator? 
+ + ) +} + +export default DeleteModal diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx index 3a553a570a..798a2092d0 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx @@ -1,12 +1,20 @@ -import {EvaluatorConfig, JSSTheme} from "@/lib/Types" +import {evaluatorsAtom} from "@/lib/atoms/evaluation" +import {Evaluator, EvaluatorConfig, JSSTheme} from "@/lib/Types" import {MoreOutlined} from "@ant-design/icons" import {Copy, Note, Trash} from "@phosphor-icons/react" import {Button, Card, Dropdown, Tag, Typography} from "antd" -import React from "react" +import {useAtom} from "jotai" +import React, {useState} from "react" import {createUseStyles} from "react-jss" +import DeleteModal from "./DeleteModal" interface EvaluatorCardProps { evaluatorConfigs: EvaluatorConfig[] + setEditMode: React.Dispatch> + setCurrent: React.Dispatch> + setSelectedEvaluator: React.Dispatch> + setEditEvalEditValues: React.Dispatch> + onSuccess: () => void } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -53,8 +61,18 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, })) -const EvaluatorCard = ({evaluatorConfigs}: EvaluatorCardProps) => { +const EvaluatorCard = ({ + evaluatorConfigs, + setEditMode, + setCurrent, + setSelectedEvaluator, + setEditEvalEditValues, + onSuccess, +}: EvaluatorCardProps) => { const classes = useStyles() + const evaluators = useAtom(evaluatorsAtom)[0] + const [openDeleteModal, setOpenDeleteModal] = useState(false) + const [selectedDelEval, setSelectedDelEval] = useState(null) const formatEvluatorConfigs = Object.entries( evaluatorConfigs.reduce( @@ -96,6 +114,15 @@ const EvaluatorCard = ({evaluatorConfigs}: EvaluatorCardProps) => { icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } }, }, { @@ -114,6 +141,8 @@ const EvaluatorCard = ({evaluatorConfigs}: EvaluatorCardProps) => { danger: true, onClick: (e: any) => { e.domEvent.stopPropagation() + setOpenDeleteModal(true) + setSelectedDelEval(item) }, }, ], @@ -143,6 +172,15 @@ const EvaluatorCard = ({evaluatorConfigs}: EvaluatorCardProps) => {
))} + + {selectedDelEval && ( + setOpenDeleteModal(false)} + selectedEvalConfig={selectedDelEval} + onSuccess={onSuccess} + /> + )}
) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx index 1668419b8e..cea0ac12cc 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx @@ -13,6 +13,9 @@ type EvaluatorsProps = { setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> fetchingEvalConfigs: boolean + setEditMode: React.Dispatch> + setEditEvalEditValues: React.Dispatch> + onSuccess: () => void } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -57,6 +60,9 @@ const Evaluators = ({ setCurrent, setSelectedEvaluator, fetchingEvalConfigs, + setEditMode, + setEditEvalEditValues, + onSuccess, }: EvaluatorsProps) => { const classes = useStyles() const [searchTerm, setSearchTerm] = useState("") @@ -132,7 +138,14 @@ const Evaluators = ({ {evaluatorsDisplay === "list" ? ( ) : ( - + )}
diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 754b596a61..57b9f69262 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -1,6 +1,6 @@ import {useAppId} from "@/hooks/useAppId" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {Evaluator, JSSTheme, testset, Variant} from "@/lib/Types" +import {Evaluator, EvaluatorConfig, JSSTheme, testset, Variant} from "@/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" import {Modal} from "antd" import {useAtom} from "jotai" @@ -38,6 +38,8 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [fetchingEvalConfigs, setFetchingEvalConfigs] = useState(false) const [selectedTestcase, setSelectedTestcase] = useState | null>(null) const [selectedVariant, setSelectedVariant] = useState(null) + const [editMode, setEditMode] = useState(false) + const [editEvalEditValues, setEditEvalEditValues] = useState(null) const evalConfigFetcher = () => { setFetchingEvalConfigs(true) @@ -70,6 +72,9 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setCurrent={setCurrent} setSelectedEvaluator={setSelectedEvaluator} fetchingEvalConfigs={fetchingEvalConfigs} + setEditMode={setEditMode} + setEditEvalEditValues={setEditEvalEditValues} + onSuccess={() => evalConfigFetcher()} /> ), }, @@ -91,7 +96,11 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { props.onCancel?.({} as any)} + handleOnCancel={() => { + props.onCancel?.({} as any) + setEditMode(false) + setEditEvalEditValues(null) + }} variants={variants} testsets={testsets} onSuccess={() => { @@ -99,9 +108,12 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setCurrent(0) }} selectedTestcase={selectedTestcase} - setSelectedTestcase={setSelectedTestcase} selectedVariant={selectedVariant} setSelectedVariant={setSelectedVariant} + editMode={editMode} + editEvalEditValues={editEvalEditValues} + setEditEvalEditValues={setEditEvalEditValues} + setEditMode={setEditMode} /> ), }) From 84f7cba1e6c7fdbffb66f9976fe8b0bfb5d8d8d1 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sun, 8 Sep 2024 01:43:31 +0100 Subject: [PATCH 104/149] feat(frontend): setup open config and clone features --- .../ConfigureEvaluator/index.tsx | 10 +- .../Evaluators/EvaluatorCard.tsx | 26 +++- .../Evaluators/EvaluatorList.tsx | 118 +++++++++++++----- .../EvaluatorsModal/Evaluators/index.tsx | 19 ++- .../EvaluatorsModal/EvaluatorsModal.tsx | 11 ++ .../NewEvaluator/NewEvaluatorList.tsx | 1 + .../EvaluatorsModal/NewEvaluator/index.tsx | 5 +- 7 files changed, 154 insertions(+), 36 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 03f8eaaabf..2725434a97 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -55,6 +55,8 @@ type ConfigureEvaluatorProps = { editEvalEditValues: EvaluatorConfig | null setEditEvalEditValues: React.Dispatch> 
setEditMode: (value: React.SetStateAction) => void + cloneConfig: boolean + setCloneConfig: React.Dispatch> } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -109,6 +111,8 @@ const ConfigureEvaluator = ({ editEvalEditValues, setEditEvalEditValues, setEditMode, + cloneConfig, + setCloneConfig, }: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() @@ -306,8 +310,10 @@ const ConfigureEvaluator = ({ form.resetFields() if (editMode) { form.setFieldsValue(editEvalEditValues) + } else if (cloneConfig) { + form.setFieldValue("settings_values", editEvalEditValues?.settings_values) } - }, [editMode]) + }, [editMode, cloneConfig]) return (
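// A small sketch (not part of the patch) of the prefill decision introduced here: in edit
// mode the whole form is populated from the selected configuration, while cloning only
// copies its settings_values and leaves the name blank so the user must choose a new one.
// The form-values shape below is an illustrative simplification.
type EvaluatorFormValues = {
    name?: string
    settings_values?: Record<string, any>
}

const initialFormValues = (
    mode: "edit" | "clone" | "create",
    source: EvaluatorFormValues | null,
): EvaluatorFormValues => {
    if (mode === "edit" && source) return {...source} // reuse name and settings
    if (mode === "clone" && source) return {settings_values: source.settings_values} // settings only
    return {} // blank form for a brand-new evaluator config
}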
@@ -321,6 +327,7 @@ const ConfigureEvaluator = ({ onClick={() => { setCurrent(0) setEditMode(false) + setCloneConfig(false) setEditEvalEditValues(null) }} /> @@ -334,6 +341,7 @@ const ConfigureEvaluator = ({ onClick={() => { setCurrent(1) setEditMode(false) + setCloneConfig(false) setEditEvalEditValues(null) }} /> diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx index 798a2092d0..7dad547b8d 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx @@ -11,6 +11,7 @@ import DeleteModal from "./DeleteModal" interface EvaluatorCardProps { evaluatorConfigs: EvaluatorConfig[] setEditMode: React.Dispatch> + setCloneConfig: React.Dispatch> setCurrent: React.Dispatch> setSelectedEvaluator: React.Dispatch> setEditEvalEditValues: React.Dispatch> @@ -68,6 +69,7 @@ const EvaluatorCard = ({ setSelectedEvaluator, setEditEvalEditValues, onSuccess, + setCloneConfig, }: EvaluatorCardProps) => { const classes = useStyles() const evaluators = useAtom(evaluatorsAtom)[0] @@ -100,11 +102,22 @@ const EvaluatorCard = ({ { + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }} title={item.name} extra={ , onClick: (e: any) => { e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setCloneConfig(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } }, }, {type: "divider"}, diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx index a288b382c9..6a472e7265 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx @@ -1,26 +1,46 @@ -import {EvaluatorConfig} from "@/lib/Types" +import {evaluatorsAtom} from "@/lib/atoms/evaluation" +import {Evaluator, EvaluatorConfig} from "@/lib/Types" import {MoreOutlined} from "@ant-design/icons" import {Copy, GearSix, Note, Trash} from "@phosphor-icons/react" import {Button, Dropdown, Table, Tag} from "antd" import {ColumnsType} from "antd/es/table" +import {useAtom} from "jotai" import React, {useState} from "react" +import DeleteModal from "./DeleteModal" interface EvaluatorListProps { evaluatorConfigs: EvaluatorConfig[] + setEditMode: React.Dispatch> + setCloneConfig: React.Dispatch> + setCurrent: React.Dispatch> + setSelectedEvaluator: React.Dispatch> + setEditEvalEditValues: React.Dispatch> + onSuccess: () => void } -const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { +const EvaluatorList = ({ + evaluatorConfigs, + setCloneConfig, + setCurrent, + setEditEvalEditValues, + setEditMode, + setSelectedEvaluator, + onSuccess, +}: EvaluatorListProps) => { const [selectedRowKeys, setSelectedRowKeys] = useState([]) + const evaluators = useAtom(evaluatorsAtom)[0] + const [openDeleteModal, setOpenDeleteModal] = useState(false) + 
const [selectedDelEval, setSelectedDelEval] = useState(null) const columns: ColumnsType = [ - { - title: "Version", - dataIndex: "version", - key: "version", - onHeaderCell: () => ({ - style: {minWidth: 80}, - }), - }, + // { + // title: "Version", + // dataIndex: "version", + // key: "version", + // onHeaderCell: () => ({ + // style: {minWidth: 80}, + // }), + // }, { title: "Name", dataIndex: "name", @@ -71,6 +91,15 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === record.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(record) + setCurrent(2) + } }, }, { @@ -79,6 +108,15 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { icon: , onClick: (e: any) => { e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === record.evaluator_key, + ) + if (selectedEval) { + setCloneConfig(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(record) + setCurrent(2) + } }, }, {type: "divider"}, @@ -89,6 +127,8 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { danger: true, onClick: (e: any) => { e.domEvent.stopPropagation() + setOpenDeleteModal(true) + setSelectedDelEval(record) }, }, ], @@ -107,26 +147,44 @@ const EvaluatorList = ({evaluatorConfigs}: EvaluatorListProps) => { ] return ( -
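// An illustrative helper (not from the patch) for the lookup pattern used repeatedly above:
// table rows and cards only carry evaluator_key, so display details come from the list of
// registered evaluators. The minimal Evaluator shape here is an assumption.
type EvaluatorMeta = {key: string; name: string; color?: string}

const resolveEvaluator = (
    evaluators: EvaluatorMeta[],
    evaluatorKey: string,
): EvaluatorMeta | undefined => evaluators.find((e) => e.key === evaluatorKey)

// resolveEvaluator([{key: "auto_exact_match", name: "Exact Match"}], "auto_exact_match")?.name
// -> "Exact Match"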
{ - setSelectedRowKeys(selectedRowKeys) - }, - fixed: "left", - }} - className="ph-no-capture" - columns={columns} - rowKey={"id"} - dataSource={evaluatorConfigs} - scroll={{x: true}} - bordered - onRow={(record) => ({ - style: {cursor: "pointer"}, - onClick: () => {}, - })} - /> + <> +
{ + setSelectedRowKeys(selectedRowKeys) + }, + fixed: "left", + }} + className="ph-no-capture" + columns={columns} + rowKey={"id"} + dataSource={evaluatorConfigs} + scroll={{x: true}} + bordered + onRow={(record) => ({ + style: {cursor: "pointer"}, + onClick: () => { + const selectedEval = evaluators.find((e) => e.key === record.evaluator_key) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(record) + setCurrent(2) + } + }, + })} + /> + {selectedDelEval && ( + setOpenDeleteModal(false)} + selectedEvalConfig={selectedDelEval} + onSuccess={onSuccess} + /> + )} + ) } diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx index cea0ac12cc..bceae4cc14 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx @@ -6,6 +6,7 @@ import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import EvaluatorCard from "./EvaluatorCard" import EvaluatorList from "./EvaluatorList" +import {useLocalStorage} from "usehooks-ts" type EvaluatorsProps = { evaluatorConfigs: EvaluatorConfig[] @@ -14,8 +15,11 @@ type EvaluatorsProps = { setSelectedEvaluator: React.Dispatch> fetchingEvalConfigs: boolean setEditMode: React.Dispatch> + setCloneConfig: React.Dispatch> setEditEvalEditValues: React.Dispatch> onSuccess: () => void + setEvaluatorsDisplay: any + evaluatorsDisplay: string } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -63,10 +67,12 @@ const Evaluators = ({ setEditMode, setEditEvalEditValues, onSuccess, + setCloneConfig, + setEvaluatorsDisplay, + evaluatorsDisplay, }: EvaluatorsProps) => { const classes = useStyles() const [searchTerm, setSearchTerm] = useState("") - const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") const filteredEvalConfigs = useMemo(() => { @@ -136,7 +142,15 @@ const Evaluators = ({ {evaluatorsDisplay === "list" ? 
( - + ) : ( )} diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 57b9f69262..c3200df2bf 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -12,6 +12,7 @@ import TestcaseTab from "./TestcaseTab/TestcaseTab" import ConfigureEvaluator from "./ConfigureEvaluator" import NewEvaluator from "./NewEvaluator" import Evaluators from "./Evaluators" +import {useLocalStorage} from "usehooks-ts" type EvaluatorsModalProps = {} & React.ComponentProps @@ -39,7 +40,9 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { const [selectedTestcase, setSelectedTestcase] = useState | null>(null) const [selectedVariant, setSelectedVariant] = useState(null) const [editMode, setEditMode] = useState(false) + const [cloneConfig, setCloneConfig] = useState(false) const [editEvalEditValues, setEditEvalEditValues] = useState(null) + const [evaluatorsDisplay, setEvaluatorsDisplay] = useLocalStorage("evaluator_view", "card") const evalConfigFetcher = () => { setFetchingEvalConfigs(true) @@ -75,6 +78,9 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setEditMode={setEditMode} setEditEvalEditValues={setEditEvalEditValues} onSuccess={() => evalConfigFetcher()} + setCloneConfig={setCloneConfig} + setEvaluatorsDisplay={setEvaluatorsDisplay} + evaluatorsDisplay={evaluatorsDisplay} /> ), }, @@ -85,6 +91,8 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setCurrent={setCurrent} handleOnCancel={() => props.onCancel?.({} as any)} setSelectedEvaluator={setSelectedEvaluator} + setEvaluatorsDisplay={setEvaluatorsDisplay} + evaluatorsDisplay={evaluatorsDisplay} /> ), }, @@ -99,6 +107,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { handleOnCancel={() => { props.onCancel?.({} as any) setEditMode(false) + setCloneConfig(false) setEditEvalEditValues(null) }} variants={variants} @@ -114,6 +123,8 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { editEvalEditValues={editEvalEditValues} setEditEvalEditValues={setEditEvalEditValues} setEditMode={setEditMode} + cloneConfig={cloneConfig} + setCloneConfig={setCloneConfig} /> ), }) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx index e8e853d34d..ca96e6ad07 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx @@ -65,6 +65,7 @@ const CreateEvaluatorList = ({ rowKey={"key"} className="ph-no-capture" scroll={{x: true, y: 550}} + style={{cursor: "pointer"}} onRow={(record) => ({ onClick: () => { setSelectedEvaluator(record) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx index b428d86456..ac43c12795 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx +++ 
b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx @@ -12,6 +12,8 @@ type NewEvaluatorProps = { handleOnCancel: () => void evaluators: Evaluator[] setSelectedEvaluator: React.Dispatch> + setEvaluatorsDisplay: any + evaluatorsDisplay: string } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -55,10 +57,11 @@ const NewEvaluator = ({ setCurrent, handleOnCancel, setSelectedEvaluator, + setEvaluatorsDisplay, + evaluatorsDisplay, }: NewEvaluatorProps) => { const classes = useStyles() const [searchTerm, setSearchTerm] = useState("") - const [evaluatorsDisplay, setEvaluatorsDisplay] = useState("card") const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") const filteredEvaluators = useMemo(() => { From 55828ae8d4af9c2be6a3751643df4e5e9513d075 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sun, 8 Sep 2024 01:44:24 +0100 Subject: [PATCH 105/149] fix(frontend): removed old evaluators code --- .../evaluators/AdvancedSettings.tsx | 95 ---- .../evaluations/evaluators/EvaluatorCard.tsx | 126 ----- .../evaluations/evaluators/Evaluators.tsx | 119 ----- .../evaluators/NewEvaluatorModal.tsx | 467 ------------------ 4 files changed, 807 deletions(-) delete mode 100644 agenta-web/src/components/pages/evaluations/evaluators/AdvancedSettings.tsx delete mode 100644 agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx delete mode 100644 agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx delete mode 100644 agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx diff --git a/agenta-web/src/components/pages/evaluations/evaluators/AdvancedSettings.tsx b/agenta-web/src/components/pages/evaluations/evaluators/AdvancedSettings.tsx deleted file mode 100644 index 3db209fdaa..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluators/AdvancedSettings.tsx +++ /dev/null @@ -1,95 +0,0 @@ -import React from "react" -import {Form, Input, InputNumber, Switch, Tooltip, Collapse, theme} from "antd" -import {CaretRightOutlined, InfoCircleOutlined} from "@ant-design/icons" -import {createUseStyles} from "react-jss" -import {Editor} from "@monaco-editor/react" -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" - -const useStyles = createUseStyles((theme: any) => ({ - label: { - display: "flex", - alignItems: "center", - gap: "0.5rem", - }, - editor: { - border: `1px solid ${theme.colorBorder}`, - borderRadius: theme.borderRadius, - overflow: "hidden", - }, -})) - -type AdvancedSettingsProps = { - settings: Record[] -} - -const AdvancedSettings: React.FC = ({settings}) => { - const classes = useStyles() - const {appTheme} = useAppTheme() - const {token} = theme.useToken() - - return ( - } - className={"my-[10px]"} - > - - {settings.map((field) => { - const rules = [ - {required: field.required ?? true, message: "This field is required"}, - ] - - return ( - - {field.label} - {field.description && ( - - - - )} - - } - initialValue={field.default} - rules={rules} - > - {field.type === "string" || field.type === "regex" ? ( - - ) : field.type === "number" ? ( - - ) : field.type === "boolean" || field.type === "bool" ? ( - - ) : field.type === "text" ? ( - - ) : field.type === "code" ? ( - - ) : field.type === "object" ? 
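// A framework-free sketch (not part of the patches) of the card/list view preference these
// commits share between the evaluator screens; the actual code persists it with usehooks-ts'
// useLocalStorage under the "evaluator_view" key, so storage details here are approximate.
type EvaluatorView = "card" | "list"

const readEvaluatorView = (fallback: EvaluatorView): EvaluatorView => {
    const stored =
        typeof window !== "undefined" ? window.localStorage.getItem("evaluator_view") : null
    return stored === "card" || stored === "list" ? stored : fallback
}

const writeEvaluatorView = (view: EvaluatorView): void => {
    if (typeof window !== "undefined") window.localStorage.setItem("evaluator_view", view)
}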
( - - ) : null} - - ) - })} - - - ) -} - -export default AdvancedSettings diff --git a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx deleted file mode 100644 index 85bc2c83db..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluators/EvaluatorCard.tsx +++ /dev/null @@ -1,126 +0,0 @@ -import React from "react" -import {EvaluatorConfig, JSSTheme} from "@/lib/Types" -import {DeleteOutlined, EditOutlined} from "@ant-design/icons" -import {Card, Tag, Typography} from "antd" -import {createUseStyles} from "react-jss" -import dayjs from "dayjs" -import Image from "next/image" -import AlertPopup from "@/components/AlertPopup/AlertPopup" -import {deleteEvaluatorConfig} from "@/services/evaluations/api" -import {useAtom} from "jotai" -import {evaluatorsAtom} from "@/lib/atoms/evaluation" -import {checkIfResourceValidForDeletion} from "@/lib/helpers/evaluate" -import ResultComponent from "@/components/ResultComponent/ResultComponent" - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - card: { - display: "flex", - flexDirection: "column", - "& .ant-card-body": { - padding: "1.25rem 0.75rem 1rem 1rem", - flex: 1, - }, - }, - body: { - display: "flex", - flexDirection: "column", - alignItems: "center", - }, - headerRow: { - display: "flex", - alignItems: "center", - alignSelf: "stretch", - justifyContent: "space-between", - marginBottom: "1.5rem", - }, - evaluationImg: { - width: 32, - height: 32, - marginRight: "8px", - filter: theme.isDark ? "invert(1)" : "none", - }, - name: { - marginTop: "0.5rem", - marginBottom: "0 !important", - fontWeight: "500 !important", - fontSize: "1rem", - }, - date: { - fontSize: "0.75rem", - color: "#8c8c8c", - }, -})) - -interface Props { - evaluatorConfig: EvaluatorConfig - onEdit?: () => void - onSuccessDelete?: () => void -} - -const EvaluatorCard: React.FC = ({evaluatorConfig, onEdit, onSuccessDelete}) => { - const classes = useStyles() - const [evaluators] = useAtom(evaluatorsAtom) - const evaluator = evaluators.find((item) => item.key === evaluatorConfig.evaluator_key)! - - const onDelete = async () => { - AlertPopup({ - title: "Delete evaluator", - message: "Are you sure you want to delete this evaluator?", - onOk: async () => { - if ( - !(await checkIfResourceValidForDeletion({ - resourceType: "evaluator_config", - resourceIds: [evaluatorConfig.id], - })) - ) - return - try { - await deleteEvaluatorConfig(evaluatorConfig.id) - onSuccessDelete?.() - } catch (error) {} - }, - }) - } - - if (!evaluator) { - return null - } - - return ( - , - , - ]} - data-cy="evaluator-card" - > -
-
- - {dayjs(evaluatorConfig.created_at).format("DD MMM YY")} - - {evaluator.name} -
- - {evaluator.icon_url && ( - - )} - - - {evaluatorConfig.name} - -
-
- ) -} - -export default EvaluatorCard diff --git a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx b/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx deleted file mode 100644 index d02fa6a569..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluators/Evaluators.tsx +++ /dev/null @@ -1,119 +0,0 @@ -import React, {useMemo, useState} from "react" -import {createUseStyles} from "react-jss" -import EvaluatorCard from "./EvaluatorCard" -import {Button, Empty, Input, Space, Spin} from "antd" -import {PlusCircleOutlined} from "@ant-design/icons" -import NewEvaluatorModal from "./NewEvaluatorModal" -import {useAppId} from "@/hooks/useAppId" -import {fetchAllEvaluatorConfigs} from "@/services/evaluations/api" -import {useAtom} from "jotai" -import {evaluatorConfigsAtom} from "@/lib/atoms/evaluation" -import {JSSTheme} from "@/lib/Types" - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - root: { - display: "flex", - flexDirection: "column", - }, - buttonsGroup: { - justifyContent: "flex-end", - width: "100%", - padding: "1rem 0", - position: "sticky", - top: 46, - zIndex: 1, - backgroundColor: theme.colorBgContainer, - }, - grid: { - display: "grid", - gridTemplateColumns: "repeat(auto-fill, minmax(min(260px, 100%), 1fr))", - gap: "1rem", - }, -})) - -interface Props {} - -const Evaluators: React.FC = () => { - const classes = useStyles() - const appId = useAppId() - const [evaluatorConfigs, setEvaluatorConfigs] = useAtom(evaluatorConfigsAtom) - const [newEvalModalOpen, setNewEvalModalOpen] = useState(false) - const [newEvalModalConfigOpen, setNewEvalModalConfigOpen] = useState(false) - const [editIndex, setEditIndex] = useState(-1) - const [fetching, setFetching] = useState(false) - const [searchTerm, setSearchTerm] = useState("") - - const fetcher = () => { - setFetching(true) - fetchAllEvaluatorConfigs(appId) - .then(setEvaluatorConfigs) - .catch(console.error) - .finally(() => setFetching(false)) - } - - const filtered = useMemo(() => { - if (!searchTerm) return evaluatorConfigs - return evaluatorConfigs.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), - ) - }, [searchTerm, evaluatorConfigs]) - - return ( -
- - setSearchTerm(e.target.value)} - placeholder="Search" - allowClear - enterButton - /> - - - - {!fetching && !evaluatorConfigs.length ? ( - - ) : ( -
- {filtered.map((item, ix) => ( - { - setEditIndex(ix) - setNewEvalModalConfigOpen(true) - }} - onSuccessDelete={fetcher} - /> - ))} -
- )} -
- - { - setNewEvalModalOpen(false) - setNewEvalModalConfigOpen(false) - fetcher() - }} - newEvalModalConfigOpen={newEvalModalConfigOpen} - setNewEvalModalConfigOpen={setNewEvalModalConfigOpen} - setNewEvalModalOpen={setNewEvalModalOpen} - editMode={editIndex !== -1} - initialValues={evaluatorConfigs[editIndex]} - /> -
- ) -} - -export default Evaluators diff --git a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx b/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx deleted file mode 100644 index 049ebcdc34..0000000000 --- a/agenta-web/src/components/pages/evaluations/evaluators/NewEvaluatorModal.tsx +++ /dev/null @@ -1,467 +0,0 @@ -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -import {useAppId} from "@/hooks/useAppId" -import {EvaluationSettingsTemplate, Evaluator, EvaluatorConfig, JSSTheme} from "@/lib/Types" -import {evaluatorsAtom} from "@/lib/atoms/evaluation" -import {isValidRegex} from "@/lib/helpers/validators" -import { - CreateEvaluationConfigData, - createEvaluatorConfig, - updateEvaluatorConfig, -} from "@/services/evaluations/api" -import {ArrowLeftOutlined, EditOutlined, InfoCircleOutlined, PlusOutlined} from "@ant-design/icons" -import {Editor} from "@monaco-editor/react" -import {Button, Form, Input, InputNumber, Modal, Switch, Table, Tooltip, message, theme} from "antd" -import {Rule} from "antd/es/form" -import {useAtom} from "jotai" -import Image from "next/image" -import Link from "next/link" -import React, {useEffect, useMemo, useState} from "react" -import {createUseStyles} from "react-jss" -import {ColumnsType} from "antd/es/table" -import AdvancedSettings from "./AdvancedSettings" - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - label: { - display: "flex", - alignItems: "center", - gap: "0.5rem", - }, - evaluationImg: { - width: 20, - height: 20, - marginRight: "8px", - filter: theme.isDark ? "invert(1)" : "none", - }, - radioGroup: { - "& .ant-radio-button-wrapper": { - margin: "0.25rem", - borderRadius: theme.borderRadius, - borderLeft: `1px solid ${theme.colorBorder}`, - "&::before": { - display: "none", - }, - }, - "& .ant-radio-button-wrapper-checked ": { - borderLeft: `1px solid ${theme.colorPrimary}`, - }, - }, - evalNameContainer: { - display: "flex", - alignItems: "center", - }, - divider: { - margin: "1rem -1.5rem", - width: "unset", - }, - editor: { - border: `1px solid ${theme.colorBorder}`, - borderRadius: theme.borderRadius, - overflow: "hidden", - }, - ExternalHelp: { - marginBottom: "20px", - display: "flex", - alignItems: "center", - gap: "0.3em", - }, - ExternalHelpLink: { - margin: "0px", - padding: "0px", - textDecoration: "underline", - color: theme.isDark ? "rgba(255, 255, 255, 0.85)" : "#000", - - "&:hover": { - color: theme.isDark ? 
"rgba(255, 255, 255, 0.85)" : "#000", - textDecoration: "underline", - }, - }, - evaluatorsTable: { - maxHeight: 550, - overflowY: "scroll", - margin: "2rem 0 1rem", - border: `1px solid ${theme.colorBorder}`, - borderRadius: theme.borderRadius, - "& .ant-table-thead": { - position: "sticky", - top: 0, - zIndex: 1000, - }, - }, - evalModalBtns: { - display: "flex", - alignItems: "center", - gap: 10, - width: "100%", - justifyContent: "flex-end", - }, - evalBtnContainer: { - display: "flex", - alignItems: "center", - justifyContent: "space-between", - width: "100%", - }, - searchContainer: { - marginTop: "1rem", - width: "100%", - display: "flex", - justifyContent: "flex-end", - }, -})) - -type DynamicFormFieldProps = EvaluationSettingsTemplate & { - name: string | string[] -} - -const DynamicFormField: React.FC = ({ - name, - label, - type, - default: defaultVal, - description, - min, - max, - required, -}) => { - const {appTheme} = useAppTheme() - const classes = useStyles() - const {token} = theme.useToken() - const [showAdvancedSettings, setShowAdvancedSettings] = useState(false) - - const rules: Rule[] = [{required: required ?? true, message: "This field is required"}] - if (type === "regex") - rules.push({ - validator: (_, value) => - new Promise((res, rej) => - isValidRegex(value) ? res("") : rej("Regex pattern is not valid"), - ), - }) - - const ExternalHelpInfo = - name[1] === "webhook_url" ? ( -
- Learn - - more - - about the evaluator -
- ) : null - - return ( - <> - {label !== "Correct Answer" && ( - - {label} - {description && ( - - - - )} - - } - initialValue={defaultVal} - rules={rules} - > - {type === "string" || type === "regex" ? ( - - ) : type === "number" ? ( - - ) : type === "boolean" || type === "bool" ? ( - - ) : type === "text" ? ( - - ) : type === "code" ? ( - - ) : type === "object" ? ( - - ) : null} - - )} - - {ExternalHelpInfo} - - ) -} - -type Props = { - onSuccess?: () => void - initialValues?: EvaluatorConfig - editMode?: boolean - setNewEvalModalOpen: (value: React.SetStateAction) => void - newEvalModalConfigOpen: boolean - setNewEvalModalConfigOpen: React.Dispatch> -} & React.ComponentProps - -const NewEvaluatorModal: React.FC = ({ - onSuccess, - editMode = false, - initialValues, - setNewEvalModalOpen, - newEvalModalConfigOpen, - setNewEvalModalConfigOpen, - ...props -}) => { - const classes = useStyles() - const evaluators = useAtom(evaluatorsAtom)[0] - const [selectedEval, setSelectedEval] = useState(null) - const [submitLoading, setSubmitLoading] = useState(false) - const [searchTerm, setSearchTerm] = useState("") - const appId = useAppId() - const [form] = Form.useForm() - - const filtered = useMemo(() => { - if (!searchTerm) return evaluators - return evaluators.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), - ) - }, [searchTerm, evaluators]) - - const handleCloseModal = () => { - setSearchTerm("") - setNewEvalModalOpen(false) - } - - const evalFields = useMemo( - () => - Object.keys(selectedEval?.settings_template || {}) - .filter((key) => !!selectedEval?.settings_template[key]?.type) - .map((key) => ({ - key, - ...selectedEval?.settings_template[key]!, - advanced: selectedEval?.settings_template[key]?.advanced || false, - })), - [selectedEval], - ) - - useEffect(() => { - form.resetFields() - if (initialValues) { - form.setFieldsValue(initialValues) - setSelectedEval( - evaluators.find((item) => item.key === initialValues?.evaluator_key) || null, - ) - } - }, [newEvalModalConfigOpen]) - - const advancedSettingsFields = evalFields.filter((field) => field.advanced) - const basicSettingsFields = evalFields.filter((field) => !field.advanced) - - const onSubmit = (values: CreateEvaluationConfigData) => { - try { - setSubmitLoading(true) - if (!selectedEval?.key) throw new Error("No selected key") - const settingsValues = values.settings_values || {} - - const data = { - ...values, - evaluator_key: selectedEval.key, - settings_values: settingsValues, - } - ;(editMode - ? updateEvaluatorConfig(initialValues?.id!, data) - : createEvaluatorConfig(appId, data) - ) - .then(onSuccess) - .catch(console.error) - .finally(() => setSubmitLoading(false)) - } catch (error: any) { - setSubmitLoading(false) - console.error(error) - message.error(error.message) - } - } - - const columns: ColumnsType = [ - { - title: "Name", - dataIndex: "name", - key: "name", - width: 200, - render(_, record) { - return ( - <> -
- {record.icon_url && ( - - )} - {record.name} -
- - ) - }, - }, - { - title: "Description", - dataIndex: "description", - key: "description", - render(_, record) { - return ( - <> -
{record.description}
- - ) - }, - }, - ] - - return ( - <> - -
- setSearchTerm(e.target.value)} - placeholder="Search" - allowClear - enterButton - style={{ - maxWidth: 300, - }} - /> -
-
{ - return { - onClick: () => { - setNewEvalModalOpen(false) - setNewEvalModalConfigOpen(true) - setSelectedEval(data) - }, - style: { - cursor: "pointer", - }, - "data-cy": `select-new-evaluator-${index}`, - } - }} - /> - - - { - setNewEvalModalConfigOpen(false) - }} - destroyOnClose - onOk={form.submit} - title={ - editMode - ? `${ - selectedEval?.name - ? `Edit the ${selectedEval.name} evaluator` - : "Edit your evaluator" - }` - : `${ - selectedEval?.name - ? `Configure the ${selectedEval.name} evaluator` - : "Configure your evaluator" - }` - } - footer={null} - data-cy="configure-new-evaluator-modal" - width={selectedEval?.key === "auto_custom_code_run" ? 800 : 600} - > -
- - - - - {basicSettingsFields.map((field) => ( - - ))} - - {advancedSettingsFields.length > 0 && ( - - )} - - -
- {!editMode && ( - - )} - -
- - -
-
-
- -
- - ) -} - -export default NewEvaluatorModal From 51ebdfef1fd27cfb12f4f83aabc6e3135d65f415 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sun, 8 Sep 2024 01:59:26 +0100 Subject: [PATCH 106/149] minor fix --- .../automaticEvaluation/AutomaticEvalOverview.tsx | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx index 0c52b085d5..46d3ad84f0 100644 --- a/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx +++ b/agenta-web/src/components/pages/overview/automaticEvaluation/AutomaticEvalOverview.tsx @@ -29,7 +29,6 @@ import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" import {runningStatuses} from "../../evaluations/cellRenderers/cellRenderers" import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" -import NewEvaluatorModal from "../../evaluations/evaluators/NewEvaluatorModal" import DeleteEvaluationModal from "@/components/DeleteEvaluationModal/DeleteEvaluationModal" const {Title} = Typography @@ -492,19 +491,6 @@ const AutomaticEvalOverview = () => { }} /> - { - setIsEditEvalConfigOpen(false) - fetchEvaluations() - }} - newEvalModalConfigOpen={isEditEvalConfigOpen} - setNewEvalModalConfigOpen={setIsEditEvalConfigOpen} - setNewEvalModalOpen={() => {}} - editMode={true} - initialValues={selectedConfigEdit} - /> - {selectedEvalRecord && ( Date: Sun, 8 Sep 2024 13:25:34 +0100 Subject: [PATCH 107/149] feat(frontend): generated color for evaluator configs and updated types --- .../Evaluators/EvaluatorCard.tsx | 184 +++++++++--------- .../Evaluators/EvaluatorList.tsx | 3 +- .../NewEvaluator/NewEvaluatorList.tsx | 2 +- agenta-web/src/lib/Types.ts | 2 + .../src/services/evaluations/api/index.ts | 9 +- 5 files changed, 107 insertions(+), 93 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx index 7dad547b8d..da2ece083e 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx @@ -98,99 +98,103 @@ const EvaluatorCard = ({
{title}
- {items.map((item) => ( - { - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) - if (selectedEval) { - setEditMode(true) - setSelectedEvaluator(selectedEval) - setEditEvalEditValues(item) - setCurrent(2) - } - }} - title={item.name} - extra={ - , - onClick: (e: any) => { - e.domEvent.stopPropagation() - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) - if (selectedEval) { - setEditMode(true) - setSelectedEvaluator(selectedEval) - setEditEvalEditValues(item) - setCurrent(2) - } + {items.map((item) => { + const evaluator = evaluators.find((e) => e.key === item.evaluator_key) + + return ( + { + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }} + title={item.name} + extra={ + , + onClick: (e: any) => { + e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }, }, - }, - { - key: "clone", - label: "Clone", - icon: , - onClick: (e: any) => { - e.domEvent.stopPropagation() - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) - if (selectedEval) { - setCloneConfig(true) - setSelectedEvaluator(selectedEval) - setEditEvalEditValues(item) - setCurrent(2) - } + { + key: "clone", + label: "Clone", + icon: , + onClick: (e: any) => { + e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setCloneConfig(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }, }, - }, - {type: "divider"}, - { - key: "delete_app", - label: "Delete", - icon: , - danger: true, - onClick: (e: any) => { - e.domEvent.stopPropagation() - setOpenDeleteModal(true) - setSelectedDelEval(item) + {type: "divider"}, + { + key: "delete_app", + label: "Delete", + icon: , + danger: true, + onClick: (e: any) => { + e.domEvent.stopPropagation() + setOpenDeleteModal(true) + setSelectedDelEval(item) + }, }, - }, - ], - }} - > -
))} diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx index 6a472e7265..a30194d929 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx @@ -60,7 +60,8 @@ const EvaluatorList = ({ style: {minWidth: 200}, }), render: (_, record) => { - return {record.evaluator_key} + const evaluator = evaluators.find((item) => item.key === record.evaluator_key) + return {evaluator?.name} }, }, { diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx index ca96e6ad07..64fbd4bdfb 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx @@ -37,7 +37,7 @@ const CreateEvaluatorList = ({ render: (_, record) => { return (
- {record.key} + {record.name}
) }, diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index f33b285edd..af41bfb824 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -384,6 +384,8 @@ export interface EvaluatorConfig { name: string settings_values: Record created_at: string + color?: string + updated_at: string } export type EvaluationError = { diff --git a/agenta-web/src/services/evaluations/api/index.ts b/agenta-web/src/services/evaluations/api/index.ts index b04c9d8432..e017849c5a 100644 --- a/agenta-web/src/services/evaluations/api/index.ts +++ b/agenta-web/src/services/evaluations/api/index.ts @@ -83,8 +83,15 @@ export const createEvaluatorRunExecution = async ( // Evaluator Configs export const fetchAllEvaluatorConfigs = async (appId: string) => { + const tagColors = getTagColors() + const response = await axios.get(`/api/evaluators/configs/`, {params: {app_id: appId}}) - return response.data as EvaluatorConfig[] + const evaluatorConfigs = (response.data || []).map((item: EvaluatorConfig) => ({ + ...item, + icon_url: evaluatorIconsMap[item.evaluator_key as keyof typeof evaluatorIconsMap], + color: tagColors[stringToNumberInRange(item.evaluator_key, 0, tagColors.length - 1)], + })) as EvaluatorConfig[] + return evaluatorConfigs } export type CreateEvaluationConfigData = Omit From db8793e354da631ab620a859fd7b0c97bb9ca32a Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sun, 8 Sep 2024 13:26:49 +0100 Subject: [PATCH 108/149] refactor(frontend): set evaluator display default to list and ui improvements --- .../EvaluatorsModal/ConfigureEvaluator/index.tsx | 4 ++-- .../autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx | 5 ++++- .../EvaluatorsModal/TestcaseTab/TestcaseTab.tsx | 6 +----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 2725434a97..044bc9856d 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -356,7 +356,7 @@ const ConfigureEvaluator = ({
-
+ {selectedEvaluator.name} @@ -391,7 +391,7 @@ const ConfigureEvaluator = ({ {selectedEvaluator.description} -
+
{ const [editMode, setEditMode] = useState(false) const [cloneConfig, setCloneConfig] = useState(false) const [editEvalEditValues, setEditEvalEditValues] = useState(null) - const [evaluatorsDisplay, setEvaluatorsDisplay] = useLocalStorage("evaluator_view", "card") + const [evaluatorsDisplay, setEvaluatorsDisplay] = useLocalStorage<"card" | "list">( + "evaluator_view", + "list", + ) const evalConfigFetcher = () => { setFetchingEvalConfigs(true) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx index ab57765fb6..b985a7a975 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx @@ -107,12 +107,8 @@ const TestcaseTab = ({
- - Select test case - - Lorem ipsum, dolor sit amet consectetur adipisicing elit. Itaque culpa - similique reiciendis + Select a test case to use for debugging the evaluators
Date: Sun, 8 Sep 2024 20:12:28 +0100 Subject: [PATCH 109/149] fix(frontend): removed bad code --- .../autoEvaluation/AutoEvaluation.tsx | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx index aadcdf37f7..7455c01d80 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/AutoEvaluation.tsx @@ -1,4 +1,4 @@ -import {_Evaluation, EvaluationStatus, EvaluatorConfig, JSSTheme} from "@/lib/Types" +import {_Evaluation, EvaluationStatus, JSSTheme} from "@/lib/Types" import { ArrowsLeftRight, Database, @@ -39,7 +39,6 @@ import {useUpdateEffect} from "usehooks-ts" import {shortPoll} from "@/lib/helpers/utils" import {getFilterParams} from "./Filters/SearchFilter" import {uniqBy} from "lodash" -import NewEvaluatorModal from "../evaluators/NewEvaluatorModal" import EvaluationErrorPopover from "../EvaluationErrorProps/EvaluationErrorPopover" const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -84,7 +83,6 @@ const AutoEvaluation = () => { const [isDeleteEvalMultipleModalOpen, setIsDeleteEvalMultipleModalOpen] = useState(false) const [editColumns, setEditColumns] = useState([]) const [isFilterColsDropdownOpen, setIsFilterColsDropdownOpen] = useState(false) - const [selectedConfigEdit, setSelectedConfigEdit] = useState() const [isEditEvalConfigOpen, setIsEditEvalConfigOpen] = useState(false) const [isConfigEvaluatorModalOpen, setIsConfigEvaluatorModalOpen] = useQueryParam( "configureEvaluatorModal", @@ -583,19 +581,6 @@ const AutoEvaluation = () => { /> )} - { - setIsEditEvalConfigOpen(false) - fetchEvaluations() - }} - newEvalModalConfigOpen={isEditEvalConfigOpen} - setNewEvalModalConfigOpen={setIsEditEvalConfigOpen} - setNewEvalModalOpen={() => {}} - editMode={true} - initialValues={selectedConfigEdit} - /> - {selectedEvalRecord && ( Date: Mon, 9 Sep 2024 10:15:22 +0600 Subject: [PATCH 110/149] test(frontend): fixed evaluator tests --- agenta-web/cypress/e2e/eval.evaluators.cy.ts | 39 ++++++++++++------- .../ConfigureEvaluator/index.tsx | 9 ++++- .../Evaluators/EvaluatorList.tsx | 2 + .../EvaluatorsModal/Evaluators/index.tsx | 1 + .../NewEvaluator/NewEvaluatorList.tsx | 1 + 5 files changed, 35 insertions(+), 17 deletions(-) diff --git a/agenta-web/cypress/e2e/eval.evaluators.cy.ts b/agenta-web/cypress/e2e/eval.evaluators.cy.ts index 0708d157d5..f167ba55e7 100644 --- a/agenta-web/cypress/e2e/eval.evaluators.cy.ts +++ b/agenta-web/cypress/e2e/eval.evaluators.cy.ts @@ -2,6 +2,7 @@ import {randString} from "../../src/lib/helpers/utils" describe("Evaluators CRUD Operations Test", function () { let newEvalName = randString(5) + let editedEvalName = randString(5) let app_id before(() => { cy.createVariant() @@ -12,30 +13,38 @@ describe("Evaluators CRUD Operations Test", function () { context("Executing Evaluators CRUD operations", () => { beforeEach(() => { - cy.visit(`/apps/${app_id}/evaluations/new-evaluator`) - cy.location("pathname").should("include", "/evaluations/new-evaluator") + cy.visit(`/apps/${app_id}/evaluations?configureEvaluatorModal=open`) + cy.url().should("include", "/evaluations?configureEvaluatorModal=open") }) - it("Should successfully create an Evaluator", () => { - cy.get('[data-cy="evaluator-card"]').should("exist") - cy.get(".ant-space > :nth-child(2) > .ant-btn").click() - 
cy.get('[data-cy="new-evaluator-modal"]').should("exist") - cy.get('[data-cy^="select-new-evaluator"]').eq(0).click() - cy.get('[data-cy="configure-new-evaluator-modal"]').should("exist") + it("Should successfully create an evaluator", () => { + cy.get(".ant-modal-content").should("exist") + cy.get('[data-cy="create-new-evaluator-button"]').click() + cy.get('[data-cy="new-evaluator-list"]').eq(0).click() + cy.contains(/configure new evaluator/i) cy.get('[data-cy="configure-new-evaluator-modal-input"]').type(newEvalName) cy.get('[data-cy="configure-new-evaluator-modal-save-btn"]').click() - cy.get('[data-cy="evaluator-card"]').should("have.length", 2) + cy.get('[data-cy="evaluator-list"]').should("have.length.gt", 2) }) - it("Should click on the edit button and successfully edit an evaluator", () => { - cy.get('[data-cy^="evaluator-card-edit-button"]').eq(0).click() - cy.get('[data-cy="configure-new-evaluator-modal-input"]').type("edit") + it("Should successfully edit an evaluator", () => { + cy.get(".ant-modal-content").should("exist") + cy.get('[data-cy="evaluator-menu-button"]').eq(0).trigger("mouseover") + cy.get(".ant-dropdown-menu").should("be.visible") + cy.get(".ant-dropdown-menu-item").eq(0).click() + cy.get('[data-cy="configure-new-evaluator-modal-input"]').clear() + cy.get('[data-cy="configure-new-evaluator-modal-input"]').type(editedEvalName) cy.get('[data-cy="configure-new-evaluator-modal-save-btn"]').click() }) - it("Should click on the delete button and successfully delete an evaluator", () => { - cy.get('[data-cy^="evaluator-card-delete-button"]').eq(0).click() - cy.get(".ant-modal-confirm-btns > :nth-child(2) > span").click() + it("Should successfully delete an evaluator", () => { + cy.get(".ant-modal-content").should("exist") + cy.get('[data-cy="evaluator-menu-button"]').eq(0).trigger("mouseover") + cy.get(".ant-dropdown-menu").should("be.visible") + cy.get(".ant-dropdown-menu-item") + .contains(/delete/i) + .click() + cy.get(".ant-modal-footer > .ant-btn-primary").click() }) }) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 044bc9856d..a7ba1d0dc1 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -416,7 +416,7 @@ const ConfigureEvaluator = ({ ]} className="flex-1" > - + {/* form.resetFields()}> Reset - diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx index a30194d929..e6891a8170 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx @@ -140,6 +140,7 @@ const EvaluatorList = ({ onClick={(e) => e.stopPropagation()} icon={} size="small" + data-cy="evaluator-menu-button" /> ) @@ -166,6 +167,7 @@ const EvaluatorList = ({ bordered onRow={(record) => ({ style: {cursor: "pointer"}, + "data-cy": "evaluator-list", onClick: () => { const selectedEval = evaluators.find((e) => e.key === record.evaluator_key) if (selectedEval) { diff --git 
a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx index bceae4cc14..d08cc51afb 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx @@ -93,6 +93,7 @@ const Evaluators = ({ type="primary" icon={} onClick={() => setCurrent(1)} + data-cy="create-new-evaluator-button" > Create new evaluator diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx index 64fbd4bdfb..2f50049d06 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx @@ -67,6 +67,7 @@ const CreateEvaluatorList = ({ scroll={{x: true, y: 550}} style={{cursor: "pointer"}} onRow={(record) => ({ + "data-cy": "new-evaluator-list", onClick: () => { setSelectedEvaluator(record) setCurrent(2) From 27e3f840e54b6c0ec7d2378129fa33c75565f9c2 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 9 Sep 2024 10:23:02 +0100 Subject: [PATCH 111/149] fix(backend): updated auto_custom_code_run default code --- .../agenta_backend/resources/evaluators/evaluators.py | 2 +- agenta-backend/agenta_backend/services/security/sandbox.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py index 1a8f6f5b77..b0560ee484 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py @@ -219,7 +219,7 @@ "code": { "label": "Evaluation Code", "type": "code", - "default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n datapoint: Dict[str, str] # contains the testset row \n) -> float:\n if output in datapoint.get('correct_answer', None):\n return 1.0\n else:\n return 0.0\n", + "default": "from typing import Dict, Union, Any\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: Union[str, Dict[str, Any]], # output of the llm app\n correct_answer: str # contains the testset row \n) -> float:\n if output in correct_answer:\n return 1.0\n else:\n return 0.0\n", "description": "Code for evaluating submissions", "required": True, }, diff --git a/agenta-backend/agenta_backend/services/security/sandbox.py b/agenta-backend/agenta_backend/services/security/sandbox.py index b31e9fe911..95850265b2 100644 --- a/agenta-backend/agenta_backend/services/security/sandbox.py +++ b/agenta-backend/agenta_backend/services/security/sandbox.py @@ -65,6 +65,7 @@ def execute_code_safely( "json", "requests", "numpy", + "typing", ] # Create a dictionary to simulate allowed imports From 4d1ae191d706e64a9e9777a37f54fa0949249042 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 9 Sep 2024 10:25:18 +0100 Subject: [PATCH 112/149] fix(frontend): added maxWidth to configuration form, disabled editor minimap, and removed custom header for table view --- 
.../ConfigureEvaluator/index.tsx | 15 ++++- .../EvaluatorsModal/NewEvaluator/index.tsx | 62 +++++++------------ 2 files changed, 34 insertions(+), 43 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 044bc9856d..146c1010d8 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -75,6 +75,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ formContainer: { display: "flex", flexDirection: "column", + maxWidth: 552, gap: theme.padding, overflowY: "auto", maxHeight: 580, @@ -574,7 +575,7 @@ const ConfigureEvaluator = ({ } } catch (error) {} }} - options={{wordWrap: "on"}} + options={{wordWrap: "on", minimap: {enabled: false}}} />
@@ -588,7 +589,11 @@ const ConfigureEvaluator = ({ language="json" theme={`vs-${appTheme}`} value={variantResult} - options={{wordWrap: "on", readOnly: true}} + options={{ + wordWrap: "on", + minimap: {enabled: false}, + readOnly: true, + }} />
@@ -618,7 +623,11 @@ const ConfigureEvaluator = ({ width="100%" language="json" theme={`vs-${appTheme}`} - options={{wordWrap: "on", readOnly: true}} + options={{ + wordWrap: "on", + minimap: {enabled: false}, + readOnly: true, + }} value={outputResult} />
diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx index ac43c12795..5cbc1a2b6b 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx @@ -76,52 +76,34 @@ const NewEvaluator = ({
- {evaluatorsDisplay === "list" ? ( - Configure evaluators - ) : ( - <> -
- {evaluatorsDisplay === "list" ? ( - - @@ -513,7 +513,7 @@ const ConfigureEvaluator = ({ disabled={testsets?.length === 0} > - Load test case + Load testcase
@@ -108,7 +108,7 @@ const TestcaseTab = ({
- Select a test case to use for debugging the evaluators + Select a testcase to use for debugging the evaluators
- Select test cases + Select testcases
- Load test case + Load testcase From 4fe5e1f4a138220d53aee15b9e9362a522a8ca0d Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Mon, 9 Sep 2024 20:22:41 +0100 Subject: [PATCH 117/149] feat(frontend/backend): enabled evaluator filtering by category --- .../models/api/evaluation_model.py | 1 + .../resources/evaluators/evaluators.py | 18 +++++++++ .../EvaluatorsModal/Evaluators/index.tsx | 35 ++++++++++++---- .../EvaluatorsModal/EvaluatorsModal.tsx | 5 +++ .../EvaluatorsModal/NewEvaluator/index.tsx | 40 ++++++++++++------- agenta-web/src/lib/Types.ts | 1 + agenta-web/src/lib/helpers/evaluate.ts | 31 ++++++++++++++ 7 files changed, 109 insertions(+), 22 deletions(-) diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py index 96f9ddef97..f751945813 100644 --- a/agenta-backend/agenta_backend/models/api/evaluation_model.py +++ b/agenta-backend/agenta_backend/models/api/evaluation_model.py @@ -15,6 +15,7 @@ class Evaluator(BaseModel): description: Optional[str] = None oss: Optional[bool] = False requires_llm_api_keys: Optional[bool] = False + tags: List[str] class EvaluatorConfig(BaseModel): diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py index b0560ee484..23393a2e78 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py @@ -41,6 +41,7 @@ }, "description": "Exact Match evaluator determines if the output exactly matches the specified correct answer, ensuring precise alignment with expected results.", "oss": True, + "tags": ["functional"], }, { "name": "Contains JSON", @@ -49,6 +50,7 @@ "settings_template": {}, "description": "'Contains JSON' evaluator checks if the output contains the a valid JSON.", "oss": True, + "tags": ["functional", "classifiers"], }, { "name": "Similarity Match", @@ -75,6 +77,7 @@ }, "description": "Similarity Match evaluator checks if the generated answer is similar to the expected answer. You need to provide the similarity threshold. It uses the Jaccard similarity to compare the answers.", "oss": True, + "tags": ["similarity", "functional"], }, { "name": "Semantic Similarity Match", @@ -93,6 +96,7 @@ }, }, "oss": True, + "tags": ["similarity", "ai_llm"], }, { "name": "Regex Test", @@ -115,6 +119,7 @@ }, }, "oss": True, + "tags": ["classifiers", "functional"], }, { "name": "JSON Field Match", @@ -139,6 +144,7 @@ }, "description": "JSON Field Match evaluator compares specific fields within JSON (JavaScript Object Notation) data. This matching can involve finding similarities or correspondences between fields in different JSON objects.", "oss": True, + "tags": ["functional"], }, { "name": "JSON Diff Match", @@ -177,6 +183,7 @@ }, }, "oss": True, + "tags": ["similarity", "functional"], }, { "name": "LLM-as-a-judge", @@ -202,6 +209,7 @@ }, "description": "AI Critique evaluator sends the generated answer and the correct_answer to an LLM model and uses it to evaluate the correctness of the answer. You need to provide the evaluation prompt (or use the default prompt).", "oss": True, + "tags": ["ai_llm", "functional"], }, { "name": "Code Evaluation", @@ -234,6 +242,7 @@ }, "description": "Code Evaluation allows you to write your own evaluator in Python. 
You need to provide the Python code for the evaluator.", "oss": True, + "tags": ["functional"], }, { "name": "Webhook test", @@ -265,6 +274,7 @@ }, "description": "Webhook test evaluator sends the generated answer and the correct_answer to a webhook and expects a response, in JSON format, indicating the correctness of the answer, along with a 200 HTTP status. You need to provide the URL of the webhook and the response of the webhook must be between 0 and 1.", "oss": True, + "tags": ["functional"], }, { "name": "Starts With", @@ -286,6 +296,7 @@ }, "description": "Starts With evaluator checks if the output starts with a specified prefix, considering case sensitivity based on the settings.", "oss": True, + "tags": ["classifiers", "functional"], }, { "name": "Ends With", @@ -307,6 +318,7 @@ }, "description": "Ends With evaluator checks if the output ends with a specified suffix, considering case sensitivity based on the settings.", "oss": True, + "tags": ["classifiers", "functional"], }, { "name": "Contains", @@ -328,6 +340,7 @@ }, "description": "Contains evaluator checks if the output contains a specified substring, considering case sensitivity based on the settings.", "oss": True, + "tags": ["classifiers", "functional"], }, { "name": "Contains Any", @@ -349,6 +362,7 @@ }, "description": "Contains Any evaluator checks if the output contains any of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.", "oss": True, + "tags": ["classifiers", "functional"], }, { "name": "Contains All", @@ -370,6 +384,7 @@ }, "description": "Contains All evaluator checks if the output contains all of the specified substrings from a comma-separated list, considering case sensitivity based on the settings.", "oss": True, + "tags": ["classifiers", "functional"], }, { "name": "Levenshtein Distance", @@ -393,6 +408,7 @@ }, "description": "This evaluator calculates the Levenshtein distance between the output and the correct answer. If a threshold is provided in the settings, it returns a boolean indicating whether the distance is within the threshold. If no threshold is provided, it returns the actual Levenshtein distance as a numerical value.", "oss": True, + "tags": ["functional"], }, { "name": "RAG Faithfulness", @@ -401,6 +417,7 @@ "requires_llm_api_keys": True, "settings_template": rag_evaluator_settings_template, "description": "RAG Faithfulness evaluator assesses the accuracy and reliability of responses generated by Retrieval-Augmented Generation (RAG) models. It evaluates how faithfully the responses adhere to the retrieved documents or sources, ensuring that the generated text accurately reflects the information from the original sources.", + "tags": ["rag"], }, { "name": "RAG Context Relevancy", @@ -409,6 +426,7 @@ "requires_llm_api_keys": True, "settings_template": rag_evaluator_settings_template, "description": "RAG Context Relevancy evaluator measures how relevant the retrieved documents or contexts are to the given question or prompt. 
It ensures that the selected documents provide the necessary information for generating accurate and meaningful responses, improving the overall quality of the RAG model's output.", + "tags": ["rag"], }, ] diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx index eeb2cbf27f..16aca79bd4 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/index.tsx @@ -6,6 +6,7 @@ import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import EvaluatorCard from "./EvaluatorCard" import EvaluatorList from "./EvaluatorList" +import {getEvaluatorTags} from "@/lib/helpers/evaluate" type EvaluatorsProps = { evaluatorConfigs: EvaluatorConfig[] @@ -19,6 +20,8 @@ type EvaluatorsProps = { onSuccess: () => void setEvaluatorsDisplay: any evaluatorsDisplay: string + setSelectedEvaluatorCategory: React.Dispatch> + selectedEvaluatorCategory: string } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -69,10 +72,12 @@ const Evaluators = ({ setCloneConfig, setEvaluatorsDisplay, evaluatorsDisplay, + selectedEvaluatorCategory, + setSelectedEvaluatorCategory, }: EvaluatorsProps) => { const classes = useStyles() const [searchTerm, setSearchTerm] = useState("") - const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + const evaluatorTags = getEvaluatorTags() const filteredEvalConfigs = useMemo(() => { if (!searchTerm) return evaluatorConfigs @@ -81,6 +86,22 @@ const Evaluators = ({ ) }, [searchTerm, evaluatorConfigs]) + // const filteredEvaluators = useMemo(() => { + // let filtered = evaluatorConfigs + + // if (selectedEvaluatorCategory !== "view_all") { + // filtered = filtered.filter((item) => item.tags.includes(selectedEvaluatorCategory)) + // } + + // if (searchTerm) { + // filtered = filtered.filter((item) => + // item.name.toLowerCase().includes(searchTerm.toLowerCase()), + // ) + // } + + // return filtered + // }, [searchTerm, selectedEvaluatorCategory, evaluatorConfigs]) + return (
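The memo left commented out just above would combine category and name filtering over the saved evaluator configurations. A minimal sketch of the active form, under the assumption that each configuration row exposes a tags array the way the Evaluator interface now does (this patch only adds tags to Evaluator, which is likely why the memo stays disabled):

// Sketch only: category + search filtering over evaluator configs.
// Assumes each config carries tags: string[]; not guaranteed by this patch.
const filterEvaluatorConfigs = (
    configs: Array<{name: string; tags?: string[]}>,
    selectedCategory: string,
    searchTerm: string,
) => {
    let filtered = configs
    if (selectedCategory !== "view_all") {
        filtered = filtered.filter((item) => item.tags?.includes(selectedCategory))
    }
    if (searchTerm) {
        filtered = filtered.filter((item) =>
            item.name.toLowerCase().includes(searchTerm.toLowerCase()),
        )
    }
    return filtered
}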
@@ -108,13 +129,11 @@ const Evaluators = ({ > View all - {["RAG", "Classifiers", "Similarity", "AI / LLM", "Functional"].map( - (val, idx) => ( - - {val} - - ), - )} + {evaluatorTags.map((val, idx) => ( + + {val.label} + + ))} { "evaluator_view", "list", ) + const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") const evalConfigFetcher = () => { setFetchingEvalConfigs(true) @@ -84,6 +85,8 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setCloneConfig={setCloneConfig} setEvaluatorsDisplay={setEvaluatorsDisplay} evaluatorsDisplay={evaluatorsDisplay} + selectedEvaluatorCategory={selectedEvaluatorCategory} + setSelectedEvaluatorCategory={setSelectedEvaluatorCategory} /> ), }, @@ -96,6 +99,8 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setSelectedEvaluator={setSelectedEvaluator} setEvaluatorsDisplay={setEvaluatorsDisplay} evaluatorsDisplay={evaluatorsDisplay} + selectedEvaluatorCategory={selectedEvaluatorCategory} + setSelectedEvaluatorCategory={setSelectedEvaluatorCategory} /> ), }, diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx index d54ba6e5a0..02d9931c6e 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/index.tsx @@ -6,6 +6,7 @@ import React, {useMemo, useState} from "react" import {createUseStyles} from "react-jss" import NewEvaluatorList from "./NewEvaluatorList" import NewEvaluatorCard from "./NewEvaluatorCard" +import {getEvaluatorTags} from "@/lib/helpers/evaluate" type NewEvaluatorProps = { setCurrent: React.Dispatch> @@ -14,6 +15,8 @@ type NewEvaluatorProps = { setSelectedEvaluator: React.Dispatch> setEvaluatorsDisplay: any evaluatorsDisplay: string + setSelectedEvaluatorCategory: React.Dispatch> + selectedEvaluatorCategory: string } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -59,17 +62,28 @@ const NewEvaluator = ({ setSelectedEvaluator, setEvaluatorsDisplay, evaluatorsDisplay, + selectedEvaluatorCategory, + setSelectedEvaluatorCategory, }: NewEvaluatorProps) => { const classes = useStyles() const [searchTerm, setSearchTerm] = useState("") - const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + const evaluatorTags = getEvaluatorTags() const filteredEvaluators = useMemo(() => { - if (!searchTerm) return evaluators - return evaluators.filter((item) => - item.name.toLowerCase().includes(searchTerm.toLowerCase()), - ) - }, [searchTerm, evaluators]) + let filtered = evaluators + + if (selectedEvaluatorCategory !== "view_all") { + filtered = filtered.filter((item) => item.tags.includes(selectedEvaluatorCategory)) + } + + if (searchTerm) { + filtered = filtered.filter((item) => + item.name.toLowerCase().includes(searchTerm.toLowerCase()), + ) + } + + return filtered + }, [searchTerm, selectedEvaluatorCategory, evaluators]) return (
@@ -94,14 +108,12 @@ const NewEvaluator = ({ onChange={(e) => setSelectedEvaluatorCategory(e.target.value)} > View all - - {["RAG", "Classifiers", "Similarity", "AI / LLM", "Functional"].map( - (val, idx) => ( - - {val} - - ), - )} + + {evaluatorTags.map((val, idx) => ( + + {val.label} + + ))} diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index af41bfb824..0941080c4d 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -357,6 +357,7 @@ export interface Evaluator { description: string oss?: boolean requires_llm_api_keys?: boolean + tags: string[] } export interface EvaluatorMappingInput { diff --git a/agenta-web/src/lib/helpers/evaluate.ts b/agenta-web/src/lib/helpers/evaluate.ts index a8be04caa9..9fd97f3cf6 100644 --- a/agenta-web/src/lib/helpers/evaluate.ts +++ b/agenta-web/src/lib/helpers/evaluate.ts @@ -15,6 +15,7 @@ import {capitalize, round} from "lodash" import dayjs from "dayjs" import {runningStatuses} from "@/components/pages/evaluations/cellRenderers/cellRenderers" import {formatCurrency, formatLatency} from "./formatters" +import {isDemo} from "./utils" export const exportExactEvaluationData = (evaluation: Evaluation, rows: GenericObject[]) => { const exportRow = rows.map((data, ix) => { @@ -391,3 +392,33 @@ export const transformTraceKeysInSettings = ( {} as Record, ) } + +export const getEvaluatorTags = () => { + const evaluatorTags = [ + { + label: "Classifiers", + value: "classifiers", + }, + { + label: "Similarity", + value: "similarity", + }, + { + label: "AI / LLM", + value: "ai_llm", + }, + { + label: "Functional", + value: "functional", + }, + ] + + if (isDemo()) { + evaluatorTags.unshift({ + label: "RAG", + value: "rag", + }) + } + + return evaluatorTags +} From 2517a0730adcab2205ab87fd18d3b21298e955d2 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Tue, 10 Sep 2024 14:25:00 +0600 Subject: [PATCH 118/149] fix(frontend): new cypress tests failures --- agenta-web/cypress/e2e/eval.evaluations.cy.ts | 37 +++++++++---------- .../ConfigureEvaluator/AdvancedSettings.tsx | 2 +- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/agenta-web/cypress/e2e/eval.evaluations.cy.ts b/agenta-web/cypress/e2e/eval.evaluations.cy.ts index 2e23d17cdc..604e191b70 100644 --- a/agenta-web/cypress/e2e/eval.evaluations.cy.ts +++ b/agenta-web/cypress/e2e/eval.evaluations.cy.ts @@ -58,35 +58,32 @@ describe("Evaluations CRUD Operations Test", function () { }) it("Should successfully create an Evaluator", () => { - cy.visit(`/apps/${app_id}/evaluations/new-evaluator`) - cy.location("pathname").should("include", "/evaluations/new-evaluator") - cy.get('[data-cy="evaluator-card"]').should("exist") - cy.get(".ant-space > :nth-child(2) > .ant-btn").click() - cy.get('[data-cy="new-evaluator-modal"]').should("exist") - cy.get('[data-cy^="select-new-evaluator"]').eq(0).click() - cy.get('[data-cy="configure-new-evaluator-modal"]').should("exist") - cy.get('[data-cy="configure-new-evaluator-modal-input"]').type(newEvalName, { - force: true, - }) + cy.visit(`/apps/${app_id}/evaluations?configureEvaluatorModal=open`) + cy.url().should("include", "/evaluations?configureEvaluatorModal=open") + cy.get(".ant-modal-content").should("exist") + cy.get('[data-cy="create-new-evaluator-button"]').click() + cy.get('[data-cy="new-evaluator-list"]').eq(0).click() + cy.contains(/configure new evaluator/i) + cy.get('[data-cy="configure-new-evaluator-modal-input"]').type(newEvalName) + cy.get('[data-cy="new-evaluator-advance-settings"]').click() - 
cy.get('[data-cy="new-evaluator-column-name"]').clear() - cy.get('[data-cy="new-evaluator-column-name"]').type("answer") + cy.get('[data-cy="new-evaluator-advance-settings-input"]').clear() + cy.get('[data-cy="new-evaluator-advance-settings-input"]').type("answer") cy.get('[data-cy="configure-new-evaluator-modal-save-btn"]').click() - cy.get('[data-cy="evaluator-card"]').should("have.length", 2) - cy.wait(1000) + cy.get('[data-cy="evaluator-list"]').should("have.length.gt", 2) }) it("Should successfully create an Evaluation", () => { - cy.visit(`/apps/${app_id}/evaluations/results`) - cy.location("pathname").should("include", "/evaluations/results") + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") cy.createNewEvaluation(newEvalName) }) it("Should verify the successful creation and completion of the evaluation", () => { - cy.visit(`/apps/${app_id}/evaluations/results`) - cy.location("pathname").should("include", "/evaluations/results") - cy.get('.ag-row[row-index="0"]').should("exist") - cy.get('.ag-cell[col-id="status"]').should("contain.text", "Completed") + cy.visit(`/apps/${app_id}/evaluations`) + cy.location("pathname").should("include", "/evaluations") + cy.get(".ant-table-row").eq(0).should("exist") + cy.get('[data-cy="evaluation-status-cell"]').should("contain.text", "Completed") }) }) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx index c3cf1313af..4c7c35eabf 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/AdvancedSettings.tsx @@ -68,12 +68,12 @@ const AdvancedSettings: React.FC = ({settings, selectedTe selectedTestcase ? ( option!.value .toUpperCase() .indexOf(inputValue.toUpperCase()) !== -1 } - data-cy="new-evaluator-column-name" /> ) : field.type === "string" || field.type === "regex" ? ( From cf37bb5846c0450de7055bdf2efd09e83591df25 Mon Sep 17 00:00:00 2001 From: ashrafchowdury Date: Tue, 10 Sep 2024 16:18:00 +0600 Subject: [PATCH 119/149] fix(frontend): select new evaluator table column issue --- .../EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx index 2f50049d06..9c7f44f07b 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/NewEvaluator/NewEvaluatorList.tsx @@ -33,7 +33,7 @@ const CreateEvaluatorList = ({ title: "Category", dataIndex: "key", key: "key", - width: 160, + width: 200, render: (_, record) => { return (
@@ -46,7 +46,6 @@ const CreateEvaluatorList = ({ title: "Type", dataIndex: "description", key: "description", - width: "100%", render: (_, record) => { return (
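The narrowed column widths above apply to the same selection table whose rows received a data-cy attribute earlier in the series; antd spreads the object returned by onRow onto each rendered row, which is what lets the e2e test pick the first row with cy.get('[data-cy="new-evaluator-list"]').eq(0). A reduced sketch of that wiring (the component name and row shape here are illustrative, not taken from the repository):

import React from "react"
import {Table} from "antd"

// Sketch: every rendered row carries data-cy="new-evaluator-list",
// so the Cypress selector resolves to one element per row.
type EvaluatorRow = {key: string; name: string}

const EvaluatorPicker = ({
    rows,
    onPick,
}: {
    rows: EvaluatorRow[]
    onPick: (row: EvaluatorRow) => void
}) => (
    <Table
        dataSource={rows}
        columns={[{title: "Name", dataIndex: "name", key: "name"}]}
        rowKey="key"
        onRow={(record) => ({
            "data-cy": "new-evaluator-list", // one attribute per row, as in patch 110
            onClick: () => onPick(record),
        })}
    />
)

export default EvaluatorPicker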
From 0b832ee3ad236e0a59d4eadfbee4cc4514842868 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Tue, 10 Sep 2024 12:59:03 +0100 Subject: [PATCH 120/149] fix(frontend): added tags and improved styles --- .../Evaluators/EvaluatorCard.tsx | 231 +++++++++--------- .../Evaluators/EvaluatorList.tsx | 3 + .../EvaluatorsModal/Evaluators/index.tsx | 45 ++-- .../NewEvaluator/NewEvaluatorCard.tsx | 53 ++-- agenta-web/src/lib/Types.ts | 1 + 5 files changed, 171 insertions(+), 162 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx index da2ece083e..e70372608a 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorCard.tsx @@ -2,7 +2,7 @@ import {evaluatorsAtom} from "@/lib/atoms/evaluation" import {Evaluator, EvaluatorConfig, JSSTheme} from "@/lib/Types" import {MoreOutlined} from "@ant-design/icons" import {Copy, Note, Trash} from "@phosphor-icons/react" -import {Button, Card, Dropdown, Tag, Typography} from "antd" +import {Button, Card, Dropdown, Empty, Tag, Typography} from "antd" import {useAtom} from "jotai" import React, {useState} from "react" import {createUseStyles} from "react-jss" @@ -21,9 +21,10 @@ interface EvaluatorCardProps { const useStyles = createUseStyles((theme: JSSTheme) => ({ container: { display: "flex", - flexDirection: "column", - gap: theme.paddingLG, - height: 600, + flexWrap: "wrap", + gap: theme.padding, + height: "100%", + maxHeight: 600, overflowY: "auto", }, cardTitle: { @@ -34,6 +35,7 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ evaluatorCard: { width: 276, display: "flex", + height: "fit-content", flexDirection: "column", transition: "all 0.025s ease-in", cursor: "pointer", @@ -60,6 +62,12 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ }, "&:hover": {}, }, + centeredItem: { + display: "grid", + placeItems: "center", + width: "100%", + height: 600, + }, })) const EvaluatorCard = ({ @@ -76,128 +84,111 @@ const EvaluatorCard = ({ const [openDeleteModal, setOpenDeleteModal] = useState(false) const [selectedDelEval, setSelectedDelEval] = useState(null) - const formatEvluatorConfigs = Object.entries( - evaluatorConfigs.reduce( - (acc, curr) => { - if (!acc[curr.evaluator_key]) { - acc[curr.evaluator_key] = [] - } - acc[curr.evaluator_key].push(curr) - return acc - }, - {} as Record, - ), - ).map(([title, items]) => ({ - title, - items, - })) - return (
- {formatEvluatorConfigs.map(({title, items}) => ( -
- {title} -
- {items.map((item) => { - const evaluator = evaluators.find((e) => e.key === item.evaluator_key) + {evaluatorConfigs.length ? ( + evaluatorConfigs.map((item) => { + const evaluator = evaluators.find((e) => e.key === item.evaluator_key) - return ( - { - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) - if (selectedEval) { - setEditMode(true) - setSelectedEvaluator(selectedEval) - setEditEvalEditValues(item) - setCurrent(2) - } + return ( + { + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }} + title={item.name} + extra={ + , + onClick: (e: any) => { + e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setEditMode(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }, + }, + { + key: "clone", + label: "Clone", + icon: , + onClick: (e: any) => { + e.domEvent.stopPropagation() + const selectedEval = evaluators.find( + (e) => e.key === item.evaluator_key, + ) + if (selectedEval) { + setCloneConfig(true) + setSelectedEvaluator(selectedEval) + setEditEvalEditValues(item) + setCurrent(2) + } + }, + }, + {type: "divider"}, + { + key: "delete_app", + label: "Delete", + icon: , + danger: true, + onClick: (e: any) => { + e.domEvent.stopPropagation() + setOpenDeleteModal(true) + setSelectedDelEval(item) + }, + }, + ], }} - title={item.name} - extra={ - , - onClick: (e: any) => { - e.domEvent.stopPropagation() - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) - if (selectedEval) { - setEditMode(true) - setSelectedEvaluator(selectedEval) - setEditEvalEditValues(item) - setCurrent(2) - } - }, - }, - { - key: "clone", - label: "Clone", - icon: , - onClick: (e: any) => { - e.domEvent.stopPropagation() - const selectedEval = evaluators.find( - (e) => e.key === item.evaluator_key, - ) - if (selectedEval) { - setCloneConfig(true) - setSelectedEvaluator(selectedEval) - setEditEvalEditValues(item) - setCurrent(2) - } - }, - }, - {type: "divider"}, - { - key: "delete_app", - label: "Delete", - icon: , - danger: true, - onClick: (e: any) => { - e.domEvent.stopPropagation() - setOpenDeleteModal(true) - setSelectedDelEval(item) - }, - }, - ], - }} - > -
+ - - + {selectedEvaluator.description} @@ -404,10 +384,6 @@ const ConfigureEvaluator = ({ className={classes.formContainer} > - - Identifier - -
- {/* -
{ - return ( -
- -
{record.variantName}
- {selectedVariant?.variantId === record.variantId ? ( - - Selected - - ) : ( - "" - )} -
- - - - -
- ) - }, - }, - ]} - onRow={(record) => ({ - onClick: () => { - setSelectedVariant(record) - }, - style: {cursor: "pointer"}, - })} - className={classes.table} - scroll={{y: 300}} - style={{height: 330}} - /> - - - - - - - ) -} - -export default EvaluatorVariantModal diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 3b91362d5f..5e414b6c97 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -1,45 +1,24 @@ -import { - BaseResponse, - Evaluator, - EvaluatorConfig, - JSSTheme, - Parameter, - testset, - Variant, -} from "@/lib/Types" -import {CloseCircleOutlined, CloseOutlined, InfoCircleOutlined} from "@ant-design/icons" -import { - ArrowLeft, - CaretDoubleLeft, - CaretDoubleRight, - ClockClockwise, - Database, - Lightning, - Play, -} from "@phosphor-icons/react" -import {Button, Divider, Flex, Form, Input, message, Select, Space, Tooltip, Typography} from "antd" -import React, {useEffect, useMemo, useRef, useState} from "react" +import {Evaluator, EvaluatorConfig, JSSTheme, testset, Variant} from "@/lib/Types" +import {CloseOutlined} from "@ant-design/icons" +import {ArrowLeft, CaretDoubleLeft, CaretDoubleRight} from "@phosphor-icons/react" +import {Button, Flex, Form, Input, message, Space, Tooltip, Typography} from "antd" +import React, {useEffect, useMemo, useState} from "react" import {createUseStyles} from "react-jss" import AdvancedSettings from "./AdvancedSettings" import {DynamicFormField} from "./DynamicFormField" -import EvaluatorVariantModal from "./EvaluatorVariantModal" import { CreateEvaluationConfigData, createEvaluatorConfig, - createEvaluatorDataMapping, - createEvaluatorRunExecution, updateEvaluatorConfig, } from "@/services/evaluations/api" import {useAppId} from "@/hooks/useAppId" import {useLocalStorage} from "usehooks-ts" -import {getAllVariantParameters} from "@/lib/helpers/variantHelper" -import {apiKeyObject, getStringOrJson, removeKeys} from "@/lib/helpers/utils" -import {callVariant} from "@/services/api" -import {Editor} from "@monaco-editor/react" -import {useAppTheme} from "@/components/Layout/ThemeContextProvider" -import {isBaseResponse, isFuncResponse} from "@/lib/helpers/playgroundResp" -import {fromBaseResponseToTraceSpanType, transformTraceTreeToJson} from "@/lib/transformers" -import {mapTestcaseAndEvalValues, transformTraceKeysInSettings} from "@/lib/helpers/evaluate" +import {isDemo} from "@/lib/helpers/utils" +import {dynamicComponent} from "@/lib/helpers/dynamic" + +const DebugSection: any = dynamicComponent( + "pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/DebugSection", +) type ConfigureEvaluatorProps = { setCurrent: React.Dispatch> @@ -91,11 +70,6 @@ const useStyles = createUseStyles((theme: JSSTheme) => ({ lineHeight: theme.lineHeight, fontWeight: theme.fontWeightMedium, }, - editor: { - border: `1px solid ${theme.colorBorder}`, - borderRadius: theme.borderRadius, - overflow: "hidden", - }, })) const ConfigureEvaluator = ({ @@ -117,17 +91,9 @@ const ConfigureEvaluator = ({ }: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() - const {appTheme} = useAppTheme() const [form] = Form.useForm() const [debugEvaluator, setDebugEvaluator] = 
useLocalStorage("isDebugSelectionOpen", false) - const [openVariantModal, setOpenVariantModal] = useState(false) const [submitLoading, setSubmitLoading] = useState(false) - const [optInputs, setOptInputs] = useState(null) - const [optParams, setOptParams] = useState(null) - const [isChatVariant, setIsChatVariant] = useState(false) - const abortControllersRef = useRef(null) - const [isRunningVariant, setIsRunningVariant] = useState(false) - const [variantResult, setVariantResult] = useState("") const [traceTree, setTraceTree] = useState<{ testcase: Record | null trace: Record | string | null @@ -135,67 +101,6 @@ const ConfigureEvaluator = ({ testcase: selectedTestcase, trace: null, }) - const [baseResponseData, setBaseResponseData] = useState(null) - const [outputResult, setOutputResult] = useState("") - const [isLoadingResult, setIsLoadingResult] = useState(false) - - const fetchEvalMapper = async () => { - if (!baseResponseData || !selectedTestcase) return - - try { - setIsLoadingResult(true) - - const settingsValues = form.getFieldValue("settings_values") || {} - const {testcaseObj, evalMapObj} = mapTestcaseAndEvalValues( - settingsValues, - selectedTestcase, - ) - let outputs = {} - - if (Object.keys(evalMapObj).length && selectedEvaluator.key.startsWith("rag_")) { - const mapResponse = await createEvaluatorDataMapping({ - inputs: baseResponseData, - mapping: transformTraceKeysInSettings(evalMapObj), - }) - outputs = {...outputs, ...mapResponse.outputs} - } - - if (Object.keys(testcaseObj).length) { - outputs = {...outputs, ...testcaseObj} - } - - if (!selectedEvaluator.key.startsWith("rag_")) { - const correctAnswerKey = settingsValues.correct_answer_key - const groundTruthKey = - typeof correctAnswerKey === "string" && correctAnswerKey.startsWith("testcase.") - ? correctAnswerKey.split(".")[1] - : correctAnswerKey - - outputs = { - ground_truth: selectedTestcase[groundTruthKey], - prediction: - selectedEvaluator.key.includes("json") || - selectedEvaluator.key.includes("field_match_test") - ? JSON.stringify({message: variantResult}) - : variantResult, - ...(selectedEvaluator.key === "auto_custom_code_run" ? {app_config: {}} : {}), - } - } - - const runResponse = await createEvaluatorRunExecution(selectedEvaluator.key, { - inputs: outputs, - settings: transformTraceKeysInSettings(settingsValues), - ...(selectedEvaluator.requires_llm_api_keys || settingsValues?.requires_llm_api_keys - ? {credentials: apiKeyObject()} - : {}), - }) - setOutputResult(getStringOrJson(runResponse.outputs)) - } catch (error) { - console.error(error) - } finally { - setIsLoadingResult(false) - } - } const evalFields = useMemo( () => @@ -237,76 +142,6 @@ const ConfigureEvaluator = ({ } } - useEffect(() => { - if (!selectedVariant || !selectedTestcase) return - - const fetchParameters = async () => { - try { - const {parameters, inputs, isChatVariant} = await getAllVariantParameters( - appId, - selectedVariant, - ) - setOptInputs(inputs) - setOptParams(parameters) - setIsChatVariant(isChatVariant) - } catch (error) { - console.error(error) - } - } - - fetchParameters() - }, [selectedVariant]) - - const handleRunVariant = async () => { - if (!selectedTestcase || !selectedVariant) return - const controller = new AbortController() - abortControllersRef.current = controller - - try { - setIsRunningVariant(true) - const result = await callVariant( - isChatVariant ? removeKeys(selectedTestcase, ["chat"]) : selectedTestcase, - optInputs || [], - optParams || [], - appId, - selectedVariant.baseId, - isChatVariant ? 
JSON.parse(selectedTestcase.chat) || [{}] : [], - controller.signal, - true, - ) - - if (typeof result === "string") { - setVariantResult(getStringOrJson(result)) - setTraceTree({...traceTree, trace: result}) - } else if (isFuncResponse(result)) { - setVariantResult(getStringOrJson(result)) - setTraceTree({...traceTree, trace: result}) - } else if (isBaseResponse(result)) { - setBaseResponseData(result) - const {trace, data} = result - setVariantResult(getStringOrJson(data)) - if (trace?.spans) { - setTraceTree({ - ...traceTree, - trace: transformTraceTreeToJson( - fromBaseResponseToTraceSpanType(trace.spans, trace.trace_id)[0], - ), - }) - } - } else { - console.error("Unknown response type:", result) - } - } catch (error: any) { - if (!controller.signal.aborted) { - console.error(error) - message.error(error.message) - setVariantResult("") - } - } finally { - setIsRunningVariant(false) - } - } - useEffect(() => { form.resetFields() if (editMode) { @@ -362,12 +197,30 @@ const ConfigureEvaluator = ({ {selectedEvaluator.name} - + + + + {selectedEvaluator.description} @@ -440,177 +293,20 @@ const ConfigureEvaluator = ({ - {debugEvaluator && ( - <> - - -
- - - Debug evaluator - - - Test your evaluator by generating a test data - - - - - - Generate test data - - - - - - - {isRunningVariant ? ( - - ) : ( - - )} - - - -
- - - JSON Data - - - - - - { - try { - if (value) { - const parsedValue = JSON.parse(value) - setTraceTree(parsedValue) - } - } catch (error) {} - }} - options={{ - wordWrap: "on", - minimap: {enabled: false}, - lineNumbers: "off", - }} - /> -
- -
- - App Output - - -
- -
- - - Evaluator Output - - - - - - - -
-
- - )} + - - setOpenVariantModal(false)} - setSelectedVariant={setSelectedVariant} - selectedVariant={selectedVariant} - /> ) } diff --git a/agenta-web/src/lib/Types.ts b/agenta-web/src/lib/Types.ts index e48b7b6603..b592aebd67 100644 --- a/agenta-web/src/lib/Types.ts +++ b/agenta-web/src/lib/Types.ts @@ -360,25 +360,6 @@ export interface Evaluator { tags: string[] } -export interface EvaluatorMappingInput { - inputs: Record - mapping: Record -} - -export interface EvaluatorMappingOutput { - outputs: Record -} - -export interface EvaluatorInputInterface { - inputs: Record - settings?: Record - credentials?: Record -} - -export interface EvaluatorOutputInterface { - outputs: Record -} - export interface EvaluatorConfig { id: string evaluator_key: string diff --git a/agenta-web/src/services/evaluations/api/index.ts b/agenta-web/src/services/evaluations/api/index.ts index e017849c5a..d251d0e261 100644 --- a/agenta-web/src/services/evaluations/api/index.ts +++ b/agenta-web/src/services/evaluations/api/index.ts @@ -3,10 +3,6 @@ import { ComparisonResultRow, Evaluator, EvaluatorConfig, - EvaluatorInputInterface, - EvaluatorMappingInput, - EvaluatorMappingOutput, - EvaluatorOutputInterface, KeyValuePair, LLMRunRateLimit, TestSet, @@ -66,21 +62,6 @@ export const fetchAllEvaluators = async () => { return evaluators } -export const createEvaluatorDataMapping = async ( - config: EvaluatorMappingInput, -): Promise => { - const response = await axios.post("/api/evaluators/map/", {...config}) - return response.data -} - -export const createEvaluatorRunExecution = async ( - evaluatorKey: string, - config: EvaluatorInputInterface, -): Promise => { - const response = await axios.post(`/api/evaluators/${evaluatorKey}/run/`, {...config}) - return response.data -} - // Evaluator Configs export const fetchAllEvaluatorConfigs = async (appId: string) => { const tagColors = getTagColors() From 7f737c9a37f56bd9b81a7cbd76f9addd3b2739f9 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Fri, 13 Sep 2024 00:16:01 +0100 Subject: [PATCH 129/149] fix(frontend): removed testcase tab --- .../ConfigureEvaluator/index.tsx | 8 +- .../EvaluatorsModal/EvaluatorsModal.tsx | 15 +- .../TestcaseTab/TestcaseTab.tsx | 178 ------------------ 3 files changed, 5 insertions(+), 196 deletions(-) delete mode 100644 agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 5e414b6c97..692d6ba133 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -36,6 +36,7 @@ type ConfigureEvaluatorProps = { setEditMode: (value: React.SetStateAction) => void cloneConfig: boolean setCloneConfig: React.Dispatch> + setSelectedTestcase: React.Dispatch | null>> } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -88,6 +89,7 @@ const ConfigureEvaluator = ({ setEditMode, cloneConfig, setCloneConfig, + setSelectedTestcase, }: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() @@ -95,10 +97,8 @@ const ConfigureEvaluator = ({ const [debugEvaluator, setDebugEvaluator] = useLocalStorage("isDebugSelectionOpen", false) const [submitLoading, setSubmitLoading] = useState(false) 
const [traceTree, setTraceTree] = useState<{ - testcase: Record | null trace: Record | string | null }>({ - testcase: selectedTestcase, trace: null, }) @@ -272,7 +272,7 @@ const ConfigureEvaluator = ({ {advancedSettingsFields.length > 0 && ( )} @@ -297,7 +297,6 @@ const ConfigureEvaluator = ({ selectedEvaluator={selectedEvaluator} selectedTestcase={selectedTestcase} selectedVariant={selectedVariant} - setCurrent={setCurrent} setTraceTree={setTraceTree} debugEvaluator={debugEvaluator} form={form} @@ -305,6 +304,7 @@ const ConfigureEvaluator = ({ traceTree={traceTree} variants={variants} setSelectedVariant={setSelectedVariant} + setSelectedTestcase={setSelectedTestcase} /> diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 5fa98de269..9a0ce2ad0c 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -8,7 +8,6 @@ import React, {useEffect, useState} from "react" import {createUseStyles} from "react-jss" import {fetchVariants} from "@/services/api" import {fetchTestsets} from "@/services/testsets/api" -import TestcaseTab from "./TestcaseTab/TestcaseTab" import ConfigureEvaluator from "./ConfigureEvaluator" import NewEvaluator from "./NewEvaluator" import Evaluators from "./Evaluators" @@ -133,22 +132,10 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { setEditMode={setEditMode} cloneConfig={cloneConfig} setCloneConfig={setCloneConfig} + setSelectedTestcase={setSelectedTestcase} /> ), }) - - if (testsets && testsets.length) { - steps.push({ - content: ( - setCurrent(2)} - testsets={testsets} - setSelectedTestcase={setSelectedTestcase} - selectedTestcase={selectedTestcase} - /> - ), - }) - } } return ( diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx deleted file mode 100644 index a8ed2a4393..0000000000 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/TestcaseTab/TestcaseTab.tsx +++ /dev/null @@ -1,178 +0,0 @@ -import {JSSTheme, TestSet, testset} from "@/lib/Types" -import {fetchTestset} from "@/services/testsets/api" -import {CloseOutlined} from "@ant-design/icons" -import {Button, Divider, Input, Menu, Table, Typography} from "antd" -import {ColumnsType} from "antd/es/table" -import React, {useEffect, useMemo, useState} from "react" -import {createUseStyles} from "react-jss" - -interface TestcaseTabProps { - handleOnCancel: () => void - setSelectedTestcase: React.Dispatch | null>> - testsets: testset[] - selectedTestcase: Record | null -} - -const useStyles = createUseStyles((theme: JSSTheme) => ({ - header: { - display: "flex", - alignItems: "center", - justifyContent: "space-between", - "& .ant-typography": { - fontSize: theme.fontSizeHeading4, - fontWeight: theme.fontWeightStrong, - lineHeight: theme.lineHeightLG, - }, - }, - title: { - fontSize: theme.fontSizeLG, - fontWeight: theme.fontWeightMedium, - lineHeight: theme.lineHeightLG, - }, - sidebar: { - display: "flex", - flexDirection: "column", - gap: theme.padding, - width: 213, - }, - menu: { - height: 550, - overflowY: "auto", - borderInlineEnd: `0px !important`, - }, -})) - -const TestcaseTab = ({ - 
handleOnCancel, - setSelectedTestcase, - testsets, - selectedTestcase, -}: TestcaseTabProps) => { - const classes = useStyles() - const [selectedTestset, setSelectedTestset] = useState(testsets[0]._id) - const [isLoadingTestset, setIsLoadingTestset] = useState(false) - const [testsetCsvData, setTestsetCsvData] = useState([]) - - const [searchTerm, setSearchTerm] = useState("") - - const filteredTestset = useMemo(() => { - if (!searchTerm) return testsets - return testsets.filter((item) => item.name.toLowerCase().includes(searchTerm.toLowerCase())) - }, [searchTerm, testsets]) - - useEffect(() => { - const testsetFetcher = async () => { - try { - setIsLoadingTestset(true) - const data = await fetchTestset(selectedTestset) - setTestsetCsvData(data.csvdata) - } catch (error) { - console.error(error) - } finally { - setIsLoadingTestset(false) - } - } - - testsetFetcher() - }, [selectedTestset]) - - const columnDef = useMemo(() => { - const columns: ColumnsType = [] - - if (testsetCsvData.length > 0) { - const keys = Object.keys(testsetCsvData[0]) - - columns.push( - ...keys.map((key, index) => ({ - title: key, - dataIndex: key, - key: index, - render: (_: any, record: any) => { - return
{record[key]}
- }, - })), - ) - } - - return columns - }, [testsetCsvData]) - - return ( -
-
- Select testcase - -
-
-
-
- - Select a testcase to use for debugging the evaluators - -
- setSearchTerm(e.target.value)} - /> - - - - ({ - key: testset._id, - label: testset.name, - }))} - onSelect={({key}) => { - setSelectedTestset(key) - }} - defaultSelectedKeys={[selectedTestset]} - className={classes.menu} - /> -
- -
- Select testcases - -
{ - setSelectedTestcase(selectedRows[0]) - }, - }} - loading={isLoadingTestset} - dataSource={testsetCsvData.map((data, index) => ({...data, id: index}))} - columns={columnDef} - className="flex-1" - bordered - rowKey={"id"} - pagination={false} - scroll={{y: 550}} - /> - -
- - -
- - - - ) -} - -export default TestcaseTab From c4a3e53f64b61b935a1a32624fd3218ed1b01c57 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 13 Sep 2024 08:29:20 +0100 Subject: [PATCH 130/149] minor refactor (backend): explictly get testcase correct_answer key --- .../agenta_backend/services/evaluators_service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 96247e33a3..df22af7e06 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -130,8 +130,10 @@ def get_correct_answer( correct_answer_key = settings_values.get("correct_answer_key") if correct_answer_key is None: raise ValueError("No correct answer keys provided.") - if len(correct_answer_key.split(".")) > 1: - correct_answer_key = correct_answer_key.split(".")[-1] + if isinstance(correct_answer_key, str) and correct_answer_key.startswith( + "testcase." + ): + correct_answer_key = correct_answer_key[len("testcase.") :] if correct_answer_key not in data_point: raise ValueError( f"Correct answer column '{correct_answer_key}' not found in the test set." From 381c6283e982e2a61d5fa8d94f862e380159b7f3 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 13 Sep 2024 08:30:38 +0100 Subject: [PATCH 131/149] minor refactor (tests): update the run inputs of rag_faithfulness evaluator fixture --- .../agenta_backend/tests/variants_main_router/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 5356ad0e9c..a3459942b9 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -331,8 +331,8 @@ def mapper_to_run_rag_faithfulness_evaluation(): @pytest.fixture() def rag_faithfulness_evaluator_run_inputs(): return { - "question": "List 6 movies about witches in the genre of fiction.", - "context": [ + "question_key": "List 6 movies about witches in the genre of fiction.", + "contexts_key": [ "The Craft (1996) in ['Drama', 'Fantasy', 'Horror']: A newcomer to a Catholic prep high school falls in with a trio of outcast teenage girls who practice witchcraft and they all soon conjure up various spells and curses against those who even slightly anger them.", "Oz the Great and Powerful (2013) in ['Adventure', 'Family', 'Fantasy']: A small-time magician is swept away to an enchanted land and is forced into a power struggle between three witches.", "Snow White: A Tale of Terror (1997) in ['Fantasy', 'Horror']: In this dark take on the fairy tale, the growing hatred of a noblewoman, secretly a practitioner of the dark arts, for her stepdaughter, and the witch's horrifying attempts to kill her.", @@ -343,7 +343,7 @@ def rag_faithfulness_evaluator_run_inputs(): "The Hexer (2001) in ['Adventure', 'Fantasy']: The adventures of Geralt of Rivea, \"The Witcher\".", "Heavy Metal (1981) in ['Animation', 'Adventure', 'Fantasy']: A glowing orb terrorizes a young girl with a collection of stories of dark fantasy, eroticism and horror.", ], - "answer": 'Witches in fiction are depicted through a mix of horror, fantasy, and dark comedy. 
\n\n"The Craft" (1996) delves into the complexities of teenage witchcraft, showcasing both empowerment and the darker repercussions of their actions. \n"Snow White: A Tale of Terror" (1997) offers a sinister twist on the classic story, highlighting the witch\'s envy and vengeful nature. \n"Hocus Pocus" (1993) delivers a comedic and adventurous take on witchcraft, as three resurrected witches wreak havoc in contemporary Salem', + "answer_key": 'Witches in fiction are depicted through a mix of horror, fantasy, and dark comedy. \n\n"The Craft" (1996) delves into the complexities of teenage witchcraft, showcasing both empowerment and the darker repercussions of their actions. \n"Snow White: A Tale of Terror" (1997) offers a sinister twist on the classic story, highlighting the witch\'s envy and vengeful nature. \n"Hocus Pocus" (1993) delivers a comedic and adventurous take on witchcraft, as three resurrected witches wreak havoc in contemporary Salem', } From 973c256a552e453bfea916df0ee200cdaa4ade28 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 14 Sep 2024 10:31:41 +0100 Subject: [PATCH 132/149] fix(frontend): removed checkbox and tags columns from evaluator table view --- .../Evaluators/EvaluatorList.tsx | 28 +------------------ 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx index a7dfc23625..719aee6a22 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/Evaluators/EvaluatorList.tsx @@ -27,7 +27,6 @@ const EvaluatorList = ({ setSelectedEvaluator, onSuccess, }: EvaluatorListProps) => { - const [selectedRowKeys, setSelectedRowKeys] = useState([]) const evaluators = useAtom(evaluatorsAtom)[0] const [openDeleteModal, setOpenDeleteModal] = useState(false) const [selectedDelEval, setSelectedDelEval] = useState(null) @@ -45,9 +44,6 @@ const EvaluatorList = ({ title: "Name", dataIndex: "name", key: "name", - onHeaderCell: () => ({ - style: {minWidth: 400}, - }), render: (_, record) => { return
{record.name}
}, @@ -56,25 +52,11 @@ const EvaluatorList = ({ title: "Type", dataIndex: "type", key: "type", - onHeaderCell: () => ({ - style: {minWidth: 200}, - }), render: (_, record) => { const evaluator = evaluators.find((item) => item.key === record.evaluator_key) return {evaluator?.name} }, }, - { - title: "Tags", - dataIndex: "tags", - key: "tags", - onHeaderCell: () => ({ - style: {minWidth: 400}, - }), - render: (_, record) => { - return record.tags?.map((tag, index) => {tag}) - }, - }, { title: , key: "key", @@ -84,7 +66,7 @@ const EvaluatorList = ({ render: (_, record) => { return (
{ - setSelectedRowKeys(selectedRowKeys) - }, - fixed: "left", - }} className="ph-no-capture" columns={columns} rowKey={"id"} From 6ebab980622999bf6f9f40f4bbeece8c720018a6 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 14 Sep 2024 10:32:30 +0100 Subject: [PATCH 133/149] refactor(frontend): modified test toggle button icon --- .../ConfigureEvaluator/index.tsx | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 692d6ba133..5276bd25e7 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -1,6 +1,6 @@ import {Evaluator, EvaluatorConfig, JSSTheme, testset, Variant} from "@/lib/Types" import {CloseOutlined} from "@ant-design/icons" -import {ArrowLeft, CaretDoubleLeft, CaretDoubleRight} from "@phosphor-icons/react" +import {ArrowLeft, CaretDoubleRight} from "@phosphor-icons/react" import {Button, Flex, Form, Input, message, Space, Tooltip, Typography} from "antd" import React, {useEffect, useMemo, useState} from "react" import {createUseStyles} from "react-jss" @@ -211,14 +211,17 @@ const ConfigureEvaluator = ({ onClick={() => setDebugEvaluator(!debugEvaluator)} disabled={!isDemo()} > -
- {debugEvaluator ? ( - - ) : ( + {debugEvaluator ? ( +
+ + Test +
+ ) : ( +
+ Test - )} - Test -
+
+ )} From 6c15e428082f50e0a52109a471175a243dc25695 Mon Sep 17 00:00:00 2001 From: Kaosiso Ezealigo Date: Sat, 14 Sep 2024 11:04:52 +0100 Subject: [PATCH 134/149] design(frontend): added transition and conditional to evaluator modal --- .../EvaluatorsModal/ConfigureEvaluator/index.tsx | 6 ++++-- .../EvaluatorsModal/EvaluatorsModal.tsx | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 5276bd25e7..1f4860dc23 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -12,7 +12,6 @@ import { updateEvaluatorConfig, } from "@/services/evaluations/api" import {useAppId} from "@/hooks/useAppId" -import {useLocalStorage} from "usehooks-ts" import {isDemo} from "@/lib/helpers/utils" import {dynamicComponent} from "@/lib/helpers/dynamic" @@ -37,6 +36,8 @@ type ConfigureEvaluatorProps = { cloneConfig: boolean setCloneConfig: React.Dispatch> setSelectedTestcase: React.Dispatch | null>> + setDebugEvaluator: React.Dispatch> + debugEvaluator: boolean } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -90,11 +91,12 @@ const ConfigureEvaluator = ({ cloneConfig, setCloneConfig, setSelectedTestcase, + debugEvaluator, + setDebugEvaluator, }: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() const [form] = Form.useForm() - const [debugEvaluator, setDebugEvaluator] = useLocalStorage("isDebugSelectionOpen", false) const [submitLoading, setSubmitLoading] = useState(false) const [traceTree, setTraceTree] = useState<{ trace: Record | string | null diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx index 9a0ce2ad0c..53a43633e1 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx @@ -1,6 +1,6 @@ import {useAppId} from "@/hooks/useAppId" import {evaluatorConfigsAtom, evaluatorsAtom} from "@/lib/atoms/evaluation" -import {Evaluator, EvaluatorConfig, JSSTheme, testset, Variant} from "@/lib/Types" +import {Evaluator, EvaluatorConfig, testset, Variant} from "@/lib/Types" import {fetchAllEvaluatorConfigs, fetchAllEvaluators} from "@/services/evaluations/api" import {Modal} from "antd" import {useAtom} from "jotai" @@ -15,8 +15,9 @@ import {useLocalStorage} from "usehooks-ts" type EvaluatorsModalProps = {} & React.ComponentProps -const useStyles = createUseStyles((theme: JSSTheme) => ({ +const useStyles = createUseStyles(() => ({ modalWrapper: { + transition: "width 0.3s ease", "& .ant-modal-content": { height: 800, "& .ant-modal-body": { @@ -46,6 +47,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { "list", ) const [selectedEvaluatorCategory, setSelectedEvaluatorCategory] = useState("view_all") + const [debugEvaluator, setDebugEvaluator] = useLocalStorage("isDebugSelectionOpen", false) const evalConfigFetcher = () => { setFetchingEvalConfigs(true) @@ -65,6 +67,9 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { 
setEvaluators(evaluators) setEvaluatorConfigs(configs) setVariants(variants) + if (variants.length) { + setSelectedVariant(variants[0]) + } setTestsets(testsets) }) }, [appId]) @@ -133,6 +138,8 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { cloneConfig={cloneConfig} setCloneConfig={setCloneConfig} setSelectedTestcase={setSelectedTestcase} + setDebugEvaluator={setDebugEvaluator} + debugEvaluator={debugEvaluator} /> ), }) @@ -141,7 +148,7 @@ const EvaluatorsModal = ({...props}: EvaluatorsModalProps) => { return ( Date: Sat, 14 Sep 2024 11:26:24 +0100 Subject: [PATCH 135/149] fix(frontend): moved selected testset state to parent level --- .../EvaluatorsModal/ConfigureEvaluator/index.tsx | 8 +++++++- .../autoEvaluation/EvaluatorsModal/EvaluatorsModal.tsx | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx index 1f4860dc23..970061751c 100644 --- a/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx +++ b/agenta-web/src/components/pages/evaluations/autoEvaluation/EvaluatorsModal/ConfigureEvaluator/index.tsx @@ -38,6 +38,8 @@ type ConfigureEvaluatorProps = { setSelectedTestcase: React.Dispatch | null>> setDebugEvaluator: React.Dispatch> debugEvaluator: boolean + setSelectedTestset: React.Dispatch> + selectedTestset: string } const useStyles = createUseStyles((theme: JSSTheme) => ({ @@ -93,6 +95,8 @@ const ConfigureEvaluator = ({ setSelectedTestcase, debugEvaluator, setDebugEvaluator, + selectedTestset, + setSelectedTestset, }: ConfigureEvaluatorProps) => { const appId = useAppId() const classes = useStyles() @@ -211,7 +215,7 @@ const ConfigureEvaluator = ({