From 28320b8e52142b6c642445f2dfd48283689ddb37 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 14 Aug 2024 07:58:23 +0100 Subject: [PATCH 01/13] refactor (backend): add check for OpenAI API key with clear exception message --- .../services/evaluators_service.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index f158253571..b2a3f2c5e0 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -359,7 +359,12 @@ async def auto_ai_critique( async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface: - openai_api_key = input.credentials["OPENAI_API_KEY"] + openai_api_key = input.credentials.get("OPENAI_API_KEY", None) + + if not openai_api_key: + raise Exception( + "No OpenAI key was found. AI Critique evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." + ) chain_run_args = { "llm_app_prompt_template": input.inputs.get("prompt_user", ""), @@ -786,7 +791,7 @@ async def measure_rag_consistency( openai_api_key = input.credentials.get("OPENAI_API_KEY", None) if not openai_api_key: raise Exception( - "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." + "No OpenAI key was found. RAG evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." ) # Initialize RAG evaluator to calculate faithfulness score @@ -885,10 +890,9 @@ async def measure_context_coherence( input: EvaluatorInputInterface, ) -> EvaluatorOutputInterface: openai_api_key = input.credentials.get("OPENAI_API_KEY", None) - if not openai_api_key: raise Exception( - "No LLM keys OpenAI key found. Please configure your OpenAI keys and try again." + "No OpenAI key was found. RAG evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." ) # Initialize RAG evaluator to calculate context relevancy score @@ -1124,8 +1128,13 @@ async def semantic_similarity( float: the semantic similarity score """ - api_key = input.credentials["OPENAI_API_KEY"] - openai = AsyncOpenAI(api_key=api_key) + openai_api_key = input.credentials.get("OPENAI_API_KEY", None) + if not openai_api_key: + raise Exception( + "No OpenAI key was found. Semantic evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again." 
+ ) + + openai = AsyncOpenAI(api_key=openai_api_key) async def encode(text: str): response = await openai.embeddings.create( From d8a1bbdc265805c62373e1197af0a78fa2c0ccf6 Mon Sep 17 00:00:00 2001 From: Abram Date: Wed, 14 Aug 2024 08:00:10 +0100 Subject: [PATCH 02/13] feat (tests): add test case for auto_ai_critique and evaluators requiring OpenAI API key --- .../tests/unit/test_evaluators.py | 107 ++++++++++++++++-- 1 file changed, 99 insertions(+), 8 deletions(-) diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py index c0bfbfade8..7fa391ccad 100644 --- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py +++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py @@ -5,6 +5,7 @@ from agenta_backend.services.evaluators_service import ( auto_levenshtein_distance, + auto_ai_critique, auto_starts_with, auto_ends_with, auto_contains, @@ -18,6 +19,53 @@ ) +@pytest.mark.parametrize( + "ground_truth, output, settings_values, openai_api_key, expected_min, expected_max", + [ + ( + {"correct_answer": "The capital of Kiribati is Tarawa."}, + "The capital of Kiribati is South Tarawa.", + { + "prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.", + "correct_answer_key": "correct_answer", + }, + os.environ.get("OPENAI_API_KEY"), + 0, + 10, + ), + ( + {"correct_answer": "The capital of Kiribati is Tarawa."}, + "The capital of Kiribati is South Tarawa.", + { + "prompt_template": "We have an LLM App that we want to evaluate its outputs. 
Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.", + "correct_answer_key": "correct_answer", + }, + None, + None, + None, + ), + ], +) +@pytest.mark.asyncio +async def test_auto_ai_critique_evaluator( + ground_truth, output, settings_values, openai_api_key, expected_min, expected_max +): + result = await auto_ai_critique( + {}, + output, + ground_truth, + {}, + settings_values, + {"OPENAI_API_KEY": openai_api_key}, + ) + try: + assert expected_min <= round(result.value, 1) <= expected_max + except TypeError as error: + # exceptions + # - raised by evaluator (agenta) -> TypeError + assert not isinstance(result.value, float) or not isinstance(result.value, int) + + @pytest.mark.parametrize( "output, settings_values, expected", [ @@ -287,6 +335,15 @@ async def test_auto_json_diff( 0.0, 1.0, ), + ( + {"correct_answer": "The capital of Namibia is Windhoek."}, + "Windhoek is the capital of Namibia.", + { + "correct_answer_key": "correct_answer", + }, + None, + None, + ), ], ) @pytest.mark.asyncio @@ -301,7 +358,12 @@ async def test_auto_semantic_similarity_match( settings_values, {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")}, ) - assert expected_min <= round(result.value, 3) <= expected_max + try: + assert expected_min <= round(result.value, 1) <= expected_max + except TypeError as error: + # exceptions + # - raised by evaluator (agenta) -> TypeError + assert not isinstance(result.value, float) or not isinstance(result.value, int) @pytest.mark.parametrize( @@ -359,7 +421,7 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex @pytest.mark.parametrize( - "settings_values, expected_min, expected_max", + "settings_values, expected_min, openai_api_key, expected_max", [ ( { @@ -367,28 +429,46 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex "answer_key": "rag.reporter.outputs.report", "contexts_key": "rag.retriever.outputs.movies", }, + os.environ.get("OPENAI_API_KEY"), 0.0, 1.0, ), + ( + { + "question_key": "rag.retriever.internals.prompt", + "answer_key": "rag.reporter.outputs.report", + "contexts_key": "rag.retriever.outputs.movies", + }, + None, + None, + None, + ), # add more use cases ], ) @pytest.mark.asyncio -async def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max): +async def test_rag_faithfulness_evaluator( + settings_values, expected_min, openai_api_key, expected_max +): result = await rag_faithfulness( {}, simple_rag_trace, {}, {}, settings_values, - {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")}, + {"OPENAI_API_KEY": openai_api_key}, ) - assert expected_min <= round(result.value, 1) <= expected_max + try: + assert expected_min <= round(result.value, 1) <= expected_max + except TypeError as error: + # exceptions + # - raised by evaluator (agenta) -> TypeError + assert not isinstance(result.value, float) or not isinstance(result.value, int) @pytest.mark.parametrize( - "settings_values, expected_min, expected_max", + "settings_values, expected_min, openai_api_key, expected_max", [ ( { @@ -396,15 +476,26 @@ async def test_rag_faithfulness_evaluator(settings_values, expected_min, expecte "answer_key": "rag.reporter.outputs.report", "contexts_key": 
"rag.retriever.outputs.movies", }, + os.environ.get("OPENAI_API_KEY"), 0.0, 1.0, ), + ( + { + "question_key": "rag.retriever.internals.prompt", + "answer_key": "rag.reporter.outputs.report", + "contexts_key": "rag.retriever.outputs.movies", + }, + None, + None, + None, + ), # add more use cases ], ) @pytest.mark.asyncio async def test_rag_context_relevancy_evaluator( - settings_values, expected_min, expected_max + settings_values, expected_min, openai_api_key, expected_max ): result = await rag_context_relevancy( {}, @@ -412,7 +503,7 @@ async def test_rag_context_relevancy_evaluator( {}, {}, settings_values, - {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")}, + {"OPENAI_API_KEY": openai_api_key}, ) try: From e02fefaa53491d15f77eb2de1f9ee40681006aaf Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:50:52 +0100 Subject: [PATCH 03/13] refactor (backend): rewrite db function to check if evaluators exist in evaluators --- .../agenta_backend/services/db_manager.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/agenta-backend/agenta_backend/services/db_manager.py b/agenta-backend/agenta_backend/services/db_manager.py index 08ac7b00f9..077fb07ef4 100644 --- a/agenta-backend/agenta_backend/services/db_manager.py +++ b/agenta-backend/agenta_backend/services/db_manager.py @@ -2963,13 +2963,17 @@ async def fetch_evaluator_config(evaluator_config_id: str): return evaluator_config -async def check_if_ai_critique_exists_in_list_of_evaluators_configs( - evaluators_configs_ids: List[str], +async def check_if_evaluators_exist_in_list_of_evaluators_configs( + evaluators_configs_ids: List[str], evaluators_keys: List[str] ) -> bool: - """Fetch evaluator configurations from the database. + """Check if the provided evaluators exist in the database within the given evaluator configurations. + + Arguments: + evaluators_configs_ids (List[str]): List of evaluator configuration IDs to search within. + evaluators_keys (List[str]): List of evaluator keys to check for existence. Returns: - EvaluatorConfigDB: the evaluator configuration object. + bool: True if all evaluators exist, False otherwise. 
""" async with db_engine.get_session() as session: @@ -2978,15 +2982,18 @@ async def check_if_ai_critique_exists_in_list_of_evaluators_configs( for evaluator_config_id in evaluators_configs_ids ] - query = select(EvaluatorConfigDB).where( + query = select(EvaluatorConfigDB.id, EvaluatorConfigDB.evaluator_key).where( EvaluatorConfigDB.id.in_(evaluator_config_uuids), - EvaluatorConfigDB.evaluator_key == "auto_ai_critique", + EvaluatorConfigDB.evaluator_key.in_(evaluators_keys), ) - result = await session.execute(query) - evaluators_configs = result.scalars().all() - return bool(evaluators_configs) + # NOTE: result.all() returns the records as a list of tuples + # 0 is the evaluator_id and 1 is evaluator_key + fetched_evaluators_keys = {config[1] for config in result.all()} + + # Ensure the passed evaluators are found in the fetched evaluator keys + return any(key in fetched_evaluators_keys for key in evaluators_keys) async def fetch_evaluator_config_by_appId( From 4cee49fa9acc04e635e321e854e1a6b27b7fbc74 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:51:24 +0100 Subject: [PATCH 04/13] chore (backend): remove deprecated function 'check_ai_critique_inputs' --- .../services/evaluator_manager.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluator_manager.py b/agenta-backend/agenta_backend/services/evaluator_manager.py index 586c59b282..84dd456e2d 100644 --- a/agenta-backend/agenta_backend/services/evaluator_manager.py +++ b/agenta-backend/agenta_backend/services/evaluator_manager.py @@ -166,28 +166,3 @@ async def create_ready_to_use_evaluators(app: AppDB): evaluator_key=evaluator.key, settings_values=settings_values, ) - - -async def check_ai_critique_inputs( - evaluators_configs: List[str], lm_providers_keys: Optional[Dict[str, Any]] -) -> Tuple[bool, Optional[JSONResponse]]: - """ - Checks if AI critique exists in evaluators configs and validates lm_providers_keys. - - Args: - evaluators_configs (List[str]): List of evaluator configurations. - lm_providers_keys (Optional[Dict[str, Any]]): Language model provider keys. - - Returns: - Tuple[bool, Optional[JSONResponse]]: Returns a tuple containing a boolean indicating success, - and a JSONResponse in case of error. 
- """ - if await db_manager.check_if_ai_critique_exists_in_list_of_evaluators_configs( - evaluators_configs - ): - if not lm_providers_keys: - return False, JSONResponse( - {"detail": "Missing LM provider Key"}, - status_code=400, - ) - return True, None From c6ee3c8c8989f1bedb5e6e4b695fc0c7dfebf333 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:53:54 +0100 Subject: [PATCH 05/13] feat (backend): implemented helper functions to: - format llm provider keys - and to ensure required llm keys exists in the provided evaluator configs --- .../agenta_backend/services/helpers.py | 66 ++++++++++++++++++- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 7b9510a0b7..04208291f3 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -1,6 +1,9 @@ import json -from typing import List, Dict, Any, Tuple, Union -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone +from typing import List, Dict, Any, Union, Tuple + +from agenta_backend.services import db_manager +from agenta_backend.models.api.evaluation_model import LMProvidersEnum def format_inputs(list_of_dictionaries: List[Dict[str, Any]]) -> Dict: @@ -76,3 +79,62 @@ def convert_to_utc_datetime(dt: Union[datetime, str, None]) -> datetime: if dt.tzinfo is None: return dt.replace(tzinfo=timezone.utc) return dt + + +def format_llm_provider_keys( + llm_provider_keys: Dict[LMProvidersEnum, str] +) -> Dict[str, str]: + """Formats a dictionary of LLM provider keys into a dictionary of strings. + + Args: + llm_provider_keys (Dict[LMProvidersEnum, str]): LLM provider keys + + Returns: + Dict[str, str]: formatted llm provided keys + + Example: + Input: {: '...', ...} + Output: {'MISTRAL_API_KEY': '...', ...} + """ + + llm_provider_keys = {key.value: value for key, value in llm_provider_keys.items()} + return llm_provider_keys + + +async def ensure_required_llm_keys_exist( + evaluator_configs: List[str], llm_provider_keys: Dict[str, str] +) -> Tuple[bool, None]: + """ + Validates if necessary LLM API keys are present when required evaluators are used. + + Args: + evaluator_configs (List[str]): List of evaluator configurations to check. + llm_provider_keys (Dict[str, str]): Dictionary of LLM provider keys (e.g., {"OPENAI_API_KEY": "your-key"}). + + Returns: + Tuple[bool, None]: Returns (True, None) if validation passes. + + Raises: + ValueError: If an evaluator requiring LLM keys is configured but no LLM API key is provided. + + """ + + evaluators_requiring_llm_keys = [ + "rag_context_relevancy", + "rag_faithfulness", + "auto_ai_critique", + "auto_semantic_similarity", + ] + + evaluators_found = ( + await db_manager.check_if_evaluators_exist_in_list_of_evaluators_configs( + evaluator_configs, evaluators_requiring_llm_keys + ) + ) + + if evaluators_found and "OPENAI_API_KEY" not in llm_provider_keys: + raise ValueError( + "OpenAI API key is required to run one or more of the specified evaluators." 
+ ) + + return True, None From a8c1273bbc5e0da03c2aef0edc3be65a36e79070 Mon Sep 17 00:00:00 2001 From: Abram Date: Mon, 19 Aug 2024 23:54:52 +0100 Subject: [PATCH 06/13] refactor (backend): update evaluator_router to: - properly format llm provider keys - and check that the required llm keys exists --- .../agenta_backend/routers/evaluation_router.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/agenta-backend/agenta_backend/routers/evaluation_router.py b/agenta-backend/agenta_backend/routers/evaluation_router.py index 3ebe171772..19bb6ec2af 100644 --- a/agenta-backend/agenta_backend/routers/evaluation_router.py +++ b/agenta-backend/agenta_backend/routers/evaluation_router.py @@ -5,6 +5,7 @@ from fastapi.responses import JSONResponse from fastapi import HTTPException, Request, status, Response, Query +from agenta_backend.services import helpers from agenta_backend.models import converters from agenta_backend.tasks.evaluations import evaluate from agenta_backend.utils.common import APIRouter, isCloudEE @@ -15,9 +16,6 @@ NewEvaluation, DeleteEvaluation, ) -from agenta_backend.services.evaluator_manager import ( - check_ai_critique_inputs, -) if isCloudEE(): from agenta_backend.commons.models.shared_models import Permission @@ -112,8 +110,9 @@ async def create_evaluation( status_code=403, ) - success, response = await check_ai_critique_inputs( - payload.evaluators_configs, payload.lm_providers_keys + llm_provider_keys = helpers.format_llm_provider_keys(payload.lm_providers_keys) + success, response = await helpers.ensure_required_llm_keys_exist( + payload.evaluators_configs, llm_provider_keys ) if not success: return response @@ -134,8 +133,8 @@ async def create_evaluation( evaluators_config_ids=payload.evaluators_configs, testset_id=payload.testset_id, evaluation_id=evaluation.id, - rate_limit_config=payload.rate_limit.dict(), - lm_providers_keys=payload.lm_providers_keys, + rate_limit_config=payload.rate_limit.model_dump(), + lm_providers_keys=llm_provider_keys, ) evaluations.append(evaluation) From f3367ef4476fd5f20aa7cfa7bac60b290aefafcb Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 20 Aug 2024 08:16:32 +0100 Subject: [PATCH 07/13] feat (tests): added test to create evaluation with no llm keys --- .../tests/variants_main_router/conftest.py | 10 ++++ .../test_variant_evaluators_router.py | 57 +++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 0d86e074c9..d636052b93 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -217,6 +217,16 @@ def app_variant_parameters_updated(): } +@pytest.fixture() +def evaluators_requiring_llm_keys(): + return [ + "rag_context_relevancy", + "rag_faithfulness", + "auto_ai_critique", + "auto_semantic_similarity", + ] + + @pytest.fixture() def auto_exact_match_evaluator_config(): return { diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index ecd5e0a02d..15ef905f29 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -269,6 +269,63 @@ async def 
create_evaluation_with_evaluator(evaluator_config_name): await wait_for_evaluation_to_finish(evaluation_id) +@pytest.mark.asyncio +async def test_create_evaluation_with_no_llm_keys(evaluators_requiring_llm_keys): + async with db_engine.get_session() as session: + app_result = await session.execute(select(AppDB).filter_by(app_name=APP_NAME)) + app = app_result.scalars().first() + + app_variant_result = await session.execute( + select(AppVariantDB).filter_by(app_id=app.id) + ) + app_variant = app_variant_result.scalars().first() + + testset_result = await session.execute( + select(TestSetDB).filter_by(app_id=app.id) + ) + testset = testset_result.scalars().first() + + # Prepare payload + payload = { + "app_id": str(app.id), + "variant_ids": [str(app_variant.id)], + "evaluators_configs": [], + "testset_id": str(testset.id), + "lm_providers_keys": {"MISTRAL_API_KEY": OPEN_AI_KEY}, + "rate_limit": { + "batch_size": 10, + "max_retries": 3, + "retry_delay": 3, + "delay_between_batches": 5, + }, + } + + # Fetch evaluator configs + response = await test_client.get( + f"{BACKEND_API_HOST}/evaluators/configs/?app_id={payload['app_id']}", + timeout=timeout, + ) + list_of_configs_ids = [] + evaluator_configs = response.json() + for evaluator_config in evaluator_configs: + if evaluator_config["evaluator_key"] in evaluators_requiring_llm_keys: + list_of_configs_ids.append(evaluator_config["id"]) + + # Update payload with list of configs ids + payload["evaluators_configs"] = list_of_configs_ids + + # Make request to create evaluation + response = await test_client.post( + f"{BACKEND_API_HOST}/evaluations/", json=payload, timeout=timeout + ) + + assert response.status_code == 500 + assert ( + response.json()["detail"] + == "OpenAI API key is required to run one or more of the specified evaluators." 
+ ) + + @pytest.mark.asyncio async def test_create_evaluation_auto_exact_match(): await create_evaluation_with_evaluator("auto_exact_match_evaluator_config") From c499a192a3cb21044e00cb72526ae865dae814fb Mon Sep 17 00:00:00 2001 From: Abram Date: Tue, 20 Aug 2024 10:33:16 +0100 Subject: [PATCH 08/13] refactor (backend): added - configurable setting to evaluators requiring llm api keys - update fixture to make use of centralized evaluators --- .../resources/evaluators/evaluators.py | 139 +++++++++++++++++- .../agenta_backend/services/helpers.py | 10 +- .../tests/variants_main_router/conftest.py | 12 +- 3 files changed, 150 insertions(+), 11 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py index 55e1105e16..c902bf025a 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py @@ -1,4 +1,12 @@ rag_evaluator_settings_template = { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": True, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "question_key": { "label": "Question Key", "default": "", @@ -30,6 +38,14 @@ "key": "auto_exact_match", "direct_use": True, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -46,7 +62,16 @@ "name": "Contains Json", "key": "auto_contains_json", "direct_use": True, - "settings_template": {}, + "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, + }, "description": "Contains Json evaluator checks if the output contains the specified JSON structure.", "oss": True, }, @@ -55,6 +80,14 @@ "key": "auto_similarity_match", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "similarity_threshold": { "label": "Similarity Threshold", "type": "number", @@ -82,6 +115,14 @@ "direct_use": False, "description": "Semantic Similarity Match evaluator measures the similarity between two pieces of text by analyzing their meaning and context. It compares the semantic content, providing a score that reflects how closely the texts match in terms of meaning, rather than just exact word matches.", "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "advanced": True, + "default": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -99,6 +140,14 @@ "direct_use": False, "description": "Regex Test evaluator checks if the generated answer matches a regular expression pattern. 
You need to provide the regex expression and specify whether an answer is correct if it matches or does not match the regex.", "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "regex_pattern": { "label": "Regex Pattern", "type": "regex", @@ -120,6 +169,14 @@ "key": "field_match_test", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "json_field": { "label": "JSON Field", "type": "string", @@ -145,6 +202,14 @@ "direct_use": False, "description": "Compares the generated JSON output to a ground truth JSON and returns a normalized score between 0 and 1 based on their differences.", "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "compare_schema_only": { "label": "Compare Schema Only", "type": "boolean", @@ -182,6 +247,14 @@ "key": "auto_ai_critique", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": True, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "prompt_template": { "label": "Prompt Template", "type": "text", @@ -206,6 +279,14 @@ "key": "auto_custom_code_run", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "code": { "label": "Evaluation Code", "type": "code", @@ -230,6 +311,14 @@ "key": "auto_webhook_test", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "webhook_url": { "label": "Webhook URL", "type": "string", @@ -253,6 +342,14 @@ "key": "auto_starts_with", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "prefix": { "label": "prefix", "type": "string", @@ -274,6 +371,14 @@ "key": "auto_ends_with", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -295,6 +400,14 @@ "key": "auto_contains", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + 
"default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -316,6 +429,14 @@ "key": "auto_contains_any", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -337,6 +458,14 @@ "key": "auto_contains_all", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -358,6 +487,14 @@ "key": "auto_levenshtein_distance", "direct_use": False, "settings_template": { + "requires_llm_api_keys": { + "label": "Requires LLM API Key(s)", + "type": "boolean", + "required": True, + "default": False, + "advanced": True, + "description": "Indicates whether the evaluation requires LLM API key(s) to function.", + }, "threshold": { "label": "Threshold", "type": "number", diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 04208291f3..1731dfe64f 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -4,6 +4,7 @@ from agenta_backend.services import db_manager from agenta_backend.models.api.evaluation_model import LMProvidersEnum +from agenta_backend.resources.evaluators.evaluators import get_all_evaluators def format_inputs(list_of_dictionaries: List[Dict[str, Any]]) -> Dict: @@ -120,12 +121,11 @@ async def ensure_required_llm_keys_exist( """ evaluators_requiring_llm_keys = [ - "rag_context_relevancy", - "rag_faithfulness", - "auto_ai_critique", - "auto_semantic_similarity", + evaluator["key"] + for evaluator in get_all_evaluators() + if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) + is True ] - evaluators_found = ( await db_manager.check_if_evaluators_exist_in_list_of_evaluators_configs( evaluator_configs, evaluators_requiring_llm_keys diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index d636052b93..f0bb764814 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -13,6 +13,7 @@ ImageDB, AppVariantDB, ) +from agenta_backend.resources.evaluators.evaluators import get_all_evaluators import httpx from sqlalchemy.future import select @@ -219,12 +220,13 @@ def app_variant_parameters_updated(): @pytest.fixture() def evaluators_requiring_llm_keys(): - return [ - "rag_context_relevancy", - "rag_faithfulness", - "auto_ai_critique", - "auto_semantic_similarity", + evaluators_requiring_llm_keys = [ + evaluator["key"] + for evaluator in get_all_evaluators() + if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) + is True ] + return evaluators_requiring_llm_keys @pytest.fixture() From 33e6e170267061c1f9df73d6eca8974f1caa59bd Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 22 Aug 2024 01:04:21 
+0100 Subject: [PATCH 09/13] refactor (backend): clean up LLM key checks in evaluators - Removed `requires_llm_api_keys` from evaluators that don't require LLM API keys - Ensured evaluators requiring LLM keys have `requires_llm_api_keys` set to `True` by default --- .../resources/evaluators/evaluators.py | 127 +----------------- .../agenta_backend/services/helpers.py | 8 +- .../tests/variants_main_router/conftest.py | 8 +- 3 files changed, 17 insertions(+), 126 deletions(-) diff --git a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py index 99cd1006c2..1a8f6f5b77 100644 --- a/agenta-backend/agenta_backend/resources/evaluators/evaluators.py +++ b/agenta-backend/agenta_backend/resources/evaluators/evaluators.py @@ -1,12 +1,4 @@ rag_evaluator_settings_template = { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": True, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "question_key": { "label": "Question Key", "default": "", @@ -38,14 +30,6 @@ "key": "auto_exact_match", "direct_use": True, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -62,16 +46,7 @@ "name": "Contains JSON", "key": "auto_contains_json", "direct_use": True, - "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, - }, + "settings_template": {}, "description": "'Contains JSON' evaluator checks if the output contains the a valid JSON.", "oss": True, }, @@ -80,14 +55,6 @@ "key": "auto_similarity_match", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "similarity_threshold": { "label": "Similarity Threshold", "type": "number", @@ -113,16 +80,9 @@ "name": "Semantic Similarity Match", "key": "auto_semantic_similarity", "direct_use": False, + "requires_llm_api_keys": True, "description": "Semantic Similarity Match evaluator measures the similarity between two pieces of text by analyzing their meaning and context. It compares the semantic content, providing a score that reflects how closely the texts match in terms of meaning, rather than just exact word matches.", "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "advanced": True, - "default": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "correct_answer_key": { "label": "Expected Answer Column", "default": "correct_answer", @@ -140,14 +100,6 @@ "direct_use": False, "description": "Regex Test evaluator checks if the generated answer matches a regular expression pattern. 
You need to provide the regex expression and specify whether an answer is correct if it matches or does not match the regex.", "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "regex_pattern": { "label": "Regex Pattern", "type": "regex", @@ -169,14 +121,6 @@ "key": "field_match_test", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "json_field": { "label": "JSON Field", "type": "string", @@ -202,14 +146,6 @@ "direct_use": False, "description": "Compares the generated JSON output to a ground truth JSON and returns a normalized score between 0 and 1 based on their differences.", "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "compare_schema_only": { "label": "Compare Schema Only", "type": "boolean", @@ -246,15 +182,8 @@ "name": "LLM-as-a-judge", "key": "auto_ai_critique", "direct_use": False, + "requires_llm_api_keys": True, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": True, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "prompt_template": { "label": "Prompt Template", "type": "text", @@ -342,14 +271,6 @@ "key": "auto_starts_with", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "prefix": { "label": "prefix", "type": "string", @@ -371,14 +292,6 @@ "key": "auto_ends_with", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -400,14 +313,6 @@ "key": "auto_contains", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -429,14 +334,6 @@ "key": "auto_contains_any", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -458,14 +355,6 @@ "key": "auto_contains_all", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - 
"label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "case_sensitive": { "label": "Case Sensitive", "type": "boolean", @@ -487,14 +376,6 @@ "key": "auto_levenshtein_distance", "direct_use": False, "settings_template": { - "requires_llm_api_keys": { - "label": "Requires LLM API Key(s)", - "type": "boolean", - "required": True, - "default": False, - "advanced": True, - "description": "Indicates whether the evaluation requires LLM API key(s) to function.", - }, "threshold": { "label": "Threshold", "type": "number", @@ -517,6 +398,7 @@ "name": "RAG Faithfulness", "key": "rag_faithfulness", "direct_use": False, + "requires_llm_api_keys": True, "settings_template": rag_evaluator_settings_template, "description": "RAG Faithfulness evaluator assesses the accuracy and reliability of responses generated by Retrieval-Augmented Generation (RAG) models. It evaluates how faithfully the responses adhere to the retrieved documents or sources, ensuring that the generated text accurately reflects the information from the original sources.", }, @@ -524,6 +406,7 @@ "name": "RAG Context Relevancy", "key": "rag_context_relevancy", "direct_use": False, + "requires_llm_api_keys": True, "settings_template": rag_evaluator_settings_template, "description": "RAG Context Relevancy evaluator measures how relevant the retrieved documents or contexts are to the given question or prompt. It ensures that the selected documents provide the necessary information for generating accurate and meaningful responses, improving the overall quality of the RAG model's output.", }, diff --git a/agenta-backend/agenta_backend/services/helpers.py b/agenta-backend/agenta_backend/services/helpers.py index 1731dfe64f..18951ad6f7 100644 --- a/agenta-backend/agenta_backend/services/helpers.py +++ b/agenta-backend/agenta_backend/services/helpers.py @@ -123,8 +123,12 @@ async def ensure_required_llm_keys_exist( evaluators_requiring_llm_keys = [ evaluator["key"] for evaluator in get_all_evaluators() - if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) - is True + if evaluator.get("requires_llm_api_keys", False) + or ( + evaluator.get("settings_template", {}) + .get("requires_llm_api_keys", {}) + .get("default", False) + ) ] evaluators_found = ( await db_manager.check_if_evaluators_exist_in_list_of_evaluators_configs( diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py index 4fec2e75d6..5356ad0e9c 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/conftest.py @@ -227,8 +227,12 @@ def evaluators_requiring_llm_keys(): evaluators_requiring_llm_keys = [ evaluator["key"] for evaluator in get_all_evaluators() - if evaluator["settings_template"]["requires_llm_api_keys"].get("default", False) - is True + if evaluator.get("requires_llm_api_keys", False) + or ( + evaluator.get("settings_template", {}) + .get("requires_llm_api_keys", {}) + .get("default", False) + ) ] return evaluators_requiring_llm_keys From 7c28f6d14878d009a2eb38cd5924dafee76d745e Mon Sep 17 00:00:00 2001 From: Abram Date: Thu, 22 Aug 2024 01:05:45 +0100 Subject: [PATCH 10/13] chore (tests): add '@pytest.mark.asyncio' to test cases in test_user_profile --- 
.../variants_main_router/test_variant_evaluators_router.py | 1 - .../tests/variants_user_profile_router/test_user_profile.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py index dcbea2d10d..a2067da77e 100644 --- a/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py +++ b/agenta-backend/agenta_backend/tests/variants_main_router/test_variant_evaluators_router.py @@ -176,7 +176,6 @@ async def fetch_evaluation_results(evaluation_id): f"{BACKEND_API_HOST}/evaluations/{evaluation_id}/results/", timeout=timeout ) response_data = response.json() - print("Response Data: ", response_data) assert response.status_code == 200 assert response_data["evaluation_id"] == evaluation_id diff --git a/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py b/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py index 1fd8a4aec6..d7fd237994 100644 --- a/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py +++ b/agenta-backend/agenta_backend/tests/variants_user_profile_router/test_user_profile.py @@ -48,6 +48,7 @@ async def test_fetch_user_profile_without_user_id(): assert response.json()["username"] == user_db_dict["username"] +@pytest.mark.asyncio async def test_fetch_user_profile_with_valid_user_id(): async with db_engine.get_session() as session: result = await session.execute(select(UserDB).filter_by(uid="0")) @@ -75,6 +76,7 @@ async def test_fetch_user_profile_with_valid_user_id(): assert response.json()["username"] == user_db_dict["username"] +@pytest.mark.asyncio async def test_fetch_user_profile_with_non_existent_user_id_error(): user_non_existent_id = str(uuid4()) response = await test_client.get( From 91d23d8f9e87cb4ec7eac6f6dcfffdbdf8c0522b Mon Sep 17 00:00:00 2001 From: Juan Pablo Vega Date: Fri, 23 Aug 2024 14:14:37 +0200 Subject: [PATCH 11/13] fix ai critique --- .../services/evaluators_service.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 392fee8b85..2328184ae6 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -337,13 +337,20 @@ async def auto_ai_critique( try: correct_answer = get_correct_answer(data_point, settings_values) inputs = { - "prompt_user": app_params.get("prompt_user", ""), + "prompt_user": app_params.get("prompt_user", "").format(**data_point), "prediction": output, "ground_truth": correct_answer, } + settings = { + "prompt_template": settings_values.get("prompt_template", ""), + } response = await ai_critique( input=EvaluatorInputInterface( - **{"inputs": inputs, "credentials": lm_providers_keys} + **{ + "inputs": inputs, + "settings": settings, + "credentials": lm_providers_keys, + } ) ) return Result(type="text", value=response["outputs"]["score"]) @@ -374,12 +381,14 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac for key, value in input.inputs.items(): chain_run_args[key] = value - prompt_template = input.settings["prompt_template"] + prompt_system = input.settings.get("prompt_system", "") messages = [ - {"role": "system", "content": prompt_template}, 
+ {"role": "system", "content": prompt_system}, {"role": "user", "content": str(chain_run_args)}, ] + print(input) + client = AsyncOpenAI(api_key=openai_api_key) response = await client.chat.completions.create( model="gpt-3.5-turbo", messages=messages, temperature=0.8 From ca81cea88ace482dabaa832f0714483cfc26bb17 Mon Sep 17 00:00:00 2001 From: Abram Date: Fri, 23 Aug 2024 14:32:10 +0100 Subject: [PATCH 12/13] minor refactor (backend): resolve ValueError when casting string to float for ai critique evaluator --- agenta-backend/agenta_backend/services/evaluators_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 2328184ae6..bfb5861589 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -394,7 +394,7 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac model="gpt-3.5-turbo", messages=messages, temperature=0.8 ) evaluation_output = response.choices[0].message.content.strip() - return {"outputs": {"score": float(evaluation_output)}} + return {"outputs": {"score": evaluation_output}} async def auto_starts_with( From cc33a662a4283c4f4738809645ddc1885dd6ff21 Mon Sep 17 00:00:00 2001 From: jp-agenta Date: Mon, 26 Aug 2024 15:13:28 +0200 Subject: [PATCH 13/13] Update evaluators_service.py --- agenta-backend/agenta_backend/services/evaluators_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py index 985e9e321b..d316db702d 100644 --- a/agenta-backend/agenta_backend/services/evaluators_service.py +++ b/agenta-backend/agenta_backend/services/evaluators_service.py @@ -442,9 +442,9 @@ async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterfac for key, value in input.inputs.items(): chain_run_args[key] = value - prompt_system = input.settings.get("prompt_system", "") + prompt_template = input.settings.get("prompt_template", "") messages = [ - {"role": "system", "content": prompt_system}, + {"role": "system", "content": prompt_template}, {"role": "user", "content": str(chain_run_args)}, ]
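
For reference, the sketch below illustrates the gating logic this series converges on (patches 03, 05, 08 and 09): evaluator definitions flagged with `requires_llm_api_keys` are matched against the selected evaluator configurations, and the evaluation request is rejected when no OpenAI key is supplied. The in-memory registry and the synchronous signatures are simplifications for illustration only — the patches implement this as the async, database-backed `check_if_evaluators_exist_in_list_of_evaluators_configs` and `ensure_required_llm_keys_exist` functions, and resolve configuration IDs rather than evaluator keys. Only the control flow is meant to match.

    from typing import Dict, List

    # Simplified stand-in for get_all_evaluators(): only the fields used here.
    EVALUATORS = [
        {"key": "auto_exact_match"},                                 # no LLM key needed
        {"key": "auto_ai_critique", "requires_llm_api_keys": True},  # LLM-as-a-judge
        {"key": "rag_faithfulness", "requires_llm_api_keys": True},  # RAG evaluator
    ]

    def evaluators_requiring_llm_keys() -> List[str]:
        """Collect the keys of evaluators flagged as needing an LLM API key."""
        return [e["key"] for e in EVALUATORS if e.get("requires_llm_api_keys", False)]

    def ensure_required_llm_keys_exist(
        selected_evaluator_keys: List[str], llm_provider_keys: Dict[str, str]
    ) -> None:
        """Raise when a selected evaluator needs an LLM key but no OpenAI key is set."""
        required = set(evaluators_requiring_llm_keys())
        if required.intersection(selected_evaluator_keys) and "OPENAI_API_KEY" not in llm_provider_keys:
            raise ValueError(
                "OpenAI API key is required to run one or more of the specified evaluators."
            )

    if __name__ == "__main__":
        ensure_required_llm_keys_exist(["auto_exact_match"], {})  # passes silently
        try:
            ensure_required_llm_keys_exist(
                ["auto_ai_critique"], {"MISTRAL_API_KEY": "..."}
            )
        except ValueError as exc:
            print(exc)  # -> OpenAI API key is required to run one or more of the ...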