From d8a1bbdc265805c62373e1197af0a78fa2c0ccf6 Mon Sep 17 00:00:00 2001
From: Abram
Date: Wed, 14 Aug 2024 08:00:10 +0100
Subject: [PATCH] feat (tests): add test case for auto_ai_critique and
 evaluators requiring OpenAI API key

---
 .../tests/unit/test_evaluators.py           | 107 ++++++++++++++++--
 1 file changed, 99 insertions(+), 8 deletions(-)

diff --git a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
index c0bfbfade8..7fa391ccad 100644
--- a/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
+++ b/agenta-backend/agenta_backend/tests/unit/test_evaluators.py
@@ -5,6 +5,7 @@
 
 from agenta_backend.services.evaluators_service import (
     auto_levenshtein_distance,
+    auto_ai_critique,
     auto_starts_with,
     auto_ends_with,
     auto_contains,
@@ -18,6 +19,53 @@
 )
 
 
+@pytest.mark.parametrize(
+    "ground_truth, output, settings_values, openai_api_key, expected_min, expected_max",
+    [
+        (
+            {"correct_answer": "The capital of Kiribati is Tarawa."},
+            "The capital of Kiribati is South Tarawa.",
+            {
+                "prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
+                "correct_answer_key": "correct_answer",
+            },
+            os.environ.get("OPENAI_API_KEY"),
+            0,
+            10,
+        ),
+        (
+            {"correct_answer": "The capital of Kiribati is Tarawa."},
+            "The capital of Kiribati is South Tarawa.",
+            {
+                "prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
+                "correct_answer_key": "correct_answer",
+            },
+            None,
+            None,
+            None,
+        ),
+    ],
+)
+@pytest.mark.asyncio
+async def test_auto_ai_critique_evaluator(
+    ground_truth, output, settings_values, openai_api_key, expected_min, expected_max
+):
+    result = await auto_ai_critique(
+        {},
+        output,
+        ground_truth,
+        {},
+        settings_values,
+        {"OPENAI_API_KEY": openai_api_key},
+    )
+    try:
+        assert expected_min <= round(result.value, 1) <= expected_max
+    except TypeError as error:
+        # exceptions
+        # - raised by evaluator (agenta) -> TypeError
+        assert not isinstance(result.value, float) or not isinstance(result.value, int)
+
+
 @pytest.mark.parametrize(
     "output, settings_values, expected",
     [
@@ -287,6 +335,15 @@ async def test_auto_json_diff(
             0.0,
             1.0,
         ),
+        (
+            {"correct_answer": "The capital of Namibia is Windhoek."},
+            "Windhoek is the capital of Namibia.",
+            {
+                "correct_answer_key": "correct_answer",
+            },
+            None,
+            None,
+        ),
     ],
 )
 @pytest.mark.asyncio
@@ -301,7 +358,12 @@ async def test_auto_semantic_similarity_match(
         settings_values,
         {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
     )
-    assert expected_min <= round(result.value, 3) <= expected_max
+    try:
+        assert expected_min <= round(result.value, 1) <= expected_max
+    except TypeError as error:
+        # exceptions
+        # - raised by evaluator (agenta) -> TypeError
+        assert not isinstance(result.value, float) or not isinstance(result.value, int)
 
 
 @pytest.mark.parametrize(
@@ -359,7 +421,7 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex
 
 
 @pytest.mark.parametrize(
-    "settings_values, expected_min, expected_max",
+    "settings_values, expected_min, openai_api_key, expected_max",
     [
         (
             {
@@ -367,28 +429,46 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex
                 "answer_key": "rag.reporter.outputs.report",
                 "contexts_key": "rag.retriever.outputs.movies",
             },
+            os.environ.get("OPENAI_API_KEY"),
             0.0,
             1.0,
         ),
+        (
+            {
+                "question_key": "rag.retriever.internals.prompt",
+                "answer_key": "rag.reporter.outputs.report",
+                "contexts_key": "rag.retriever.outputs.movies",
+            },
+            None,
+            None,
+            None,
+        ),
         # add more use cases
     ],
 )
 @pytest.mark.asyncio
-async def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max):
+async def test_rag_faithfulness_evaluator(
+    settings_values, expected_min, openai_api_key, expected_max
+):
     result = await rag_faithfulness(
         {},
         simple_rag_trace,
         {},
         {},
         settings_values,
-        {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
+        {"OPENAI_API_KEY": openai_api_key},
     )
-    assert expected_min <= round(result.value, 1) <= expected_max
+    try:
+        assert expected_min <= round(result.value, 1) <= expected_max
+    except TypeError as error:
+        # exceptions
+        # - raised by evaluator (agenta) -> TypeError
+        assert not isinstance(result.value, float) or not isinstance(result.value, int)
 
 
 @pytest.mark.parametrize(
-    "settings_values, expected_min, expected_max",
+    "settings_values, expected_min, openai_api_key, expected_max",
     [
         (
             {
@@ -396,15 +476,26 @@ async def test_rag_faithfulness_evaluator(settings_values, expecte
                 "answer_key": "rag.reporter.outputs.report",
                 "contexts_key": "rag.retriever.outputs.movies",
             },
+            os.environ.get("OPENAI_API_KEY"),
             0.0,
             1.0,
        ),
+        (
+            {
+                "question_key": "rag.retriever.internals.prompt",
+                "answer_key": "rag.reporter.outputs.report",
+                "contexts_key": "rag.retriever.outputs.movies",
+            },
+            None,
+            None,
+            None,
+        ),
         # add more use cases
     ],
 )
 @pytest.mark.asyncio
 async def test_rag_context_relevancy_evaluator(
-    settings_values, expected_min, expected_max
+    settings_values, expected_min, openai_api_key, expected_max
 ):
     result = await rag_context_relevancy(
         {},
@@ -412,7 +503,7 @@ async def test_rag_context_relevancy_evaluator(
         {},
         {},
         settings_values,
-        {"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
+        {"OPENAI_API_KEY": openai_api_key},
     )
     try: