Fix two likely causes of infinitely running evaluations #1840

Merged (10 commits) on Jul 5, 2024
@@ -33,6 +33,7 @@ class EvaluationStatusEnum(str, Enum):
EVALUATION_FINISHED = "EVALUATION_FINISHED"
EVALUATION_FINISHED_WITH_ERRORS = "EVALUATION_FINISHED_WITH_ERRORS"
EVALUATION_FAILED = "EVALUATION_FAILED"
EVALUATION_AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED"


class EvaluationScenarioStatusEnum(str, Enum):
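This hunk adds a fourth terminal state, `EVALUATION_AGGREGATION_FAILED`, so an evaluation that fails during result aggregation can be marked as failed instead of appearing to run forever. A minimal sketch of how a consumer might treat it — the helper below is an illustration assumed for this note, not code from the PR:

```python
# Hypothetical sketch (not part of this PR): treat the new status as terminal
# so polling clients stop waiting instead of spinning indefinitely.
TERMINAL_STATUSES = {
    "EVALUATION_FINISHED",
    "EVALUATION_FINISHED_WITH_ERRORS",
    "EVALUATION_FAILED",
    "EVALUATION_AGGREGATION_FAILED",
}

def is_terminal(status: str) -> bool:
    return status in TERMINAL_STATUSES
```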
49 changes: 28 additions & 21 deletions agenta-backend/agenta_backend/services/aggregation_service.py
@@ -15,26 +15,33 @@ def aggregate_ai_critique(results: List[Result]) -> Result:
Result: aggregated result
"""

numeric_scores = []
for result in results:
# Extract the first number found in the result value
match = re.search(r"\d+", result.value)
if match:
try:
score = int(match.group())
numeric_scores.append(score)
except ValueError:
# Ignore if the extracted value is not an integer
continue

# Calculate the average of numeric scores if any are present
average_value = (
sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
)
return Result(
type="number",
value=average_value,
)
try:
numeric_scores = []
for result in results:
# Extract the first number found in the result value
match = re.search(r"\d+", result.value)
if match:
try:
score = int(match.group())
numeric_scores.append(score)
except ValueError:
# Ignore if the extracted value is not an integer
continue

# Calculate the average of numeric scores if any are present
average_value = (
sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
)
return Result(
type="number",
value=average_value,
)
except Exception as exc:
return Result(
type="error",
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


def aggregate_binary(results: List[Result]) -> Result:
@@ -71,7 +78,7 @@ def aggregate_float(results: List[Result]) -> Result:
return Result(
type="error",
value=None,
error=Error(message="Failed", stacktrace=str(traceback.format_exc())),
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


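Both hunks in this file follow the same idea: if anything goes wrong while aggregating AI-critique scores (for example a non-string `result.value` reaching `re.search`), the function now returns an error `Result` instead of letting the exception escape the Celery task, and `aggregate_float` now reports the actual exception text rather than a generic "Failed". A standalone sketch of the pattern, with simplified stand-ins for the backend's `Result`/`Error` models (assumed shapes, not the real Pydantic models):

```python
import re
import traceback
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Error:            # simplified stand-in for the backend model
    message: str
    stacktrace: str

@dataclass
class Result:           # simplified stand-in for the backend model
    type: str
    value: object = None
    error: Optional[Error] = None

def aggregate_ai_critique(results: List[Result]) -> Result:
    try:
        scores = []
        for result in results:
            match = re.search(r"\d+", result.value)  # TypeError if value is None
            if match:
                scores.append(int(match.group()))
        average = sum(scores) / len(scores) if scores else None
        return Result(type="number", value=average)
    except Exception as exc:
        # The failure is captured as data instead of propagating, so the
        # surrounding task can still finish and record a final status.
        return Result(
            type="error",
            value=None,
            error=Error(message=str(exc), stacktrace=traceback.format_exc()),
        )

# An error-typed input (value=None) no longer crashes the aggregation step:
print(aggregate_ai_critique([Result(type="error", value=None)]))
```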
46 changes: 32 additions & 14 deletions agenta-backend/agenta_backend/services/evaluators_service.py
@@ -1,6 +1,7 @@
import json
import logging
import re
import traceback
from typing import Any, Dict, List, Tuple

import httpx
@@ -79,7 +80,8 @@ def auto_exact_match(
type="error",
value=None,
error=Error(
message="Error during Auto Exact Match evaluation", stacktrace=str(e)
message="Error during Auto Exact Match evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -103,7 +105,8 @@ def auto_regex_test(
type="error",
value=None,
error=Error(
message="Error during Auto Regex evaluation", stacktrace=str(e)
message="Error during Auto Regex evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -186,15 +189,16 @@ def auto_webhook_test(
value=None,
error=Error(
message="Error during Auto Webhook evaluation; An HTTP error occurred",
stacktrace=str(e),
stacktrace=str(traceback.format_exc()),
),
)
except Exception as e: # pylint: disable=broad-except
return Result(
type="error",
value=None,
error=Error(
message="Error during Auto Webhook evaluation", stacktrace=str(e)
message="Error during Auto Webhook evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -224,7 +228,8 @@ def auto_custom_code_run(
type="error",
value=None,
error=Error(
message="Error during Auto Custom Code Evaluation", stacktrace=str(e)
message="Error during Auto Custom Code Evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -281,7 +286,10 @@ def auto_ai_critique(
return Result(
type="error",
value=None,
error=Error(message="Error during Auto AI Critique", stacktrace=str(e)),
error=Error(
message="Error during Auto AI Critique",
stacktrace=str(traceback.format_exc()),
),
)


@@ -308,7 +316,8 @@ def auto_starts_with(
type="error",
value=None,
error=Error(
message="Error during Starts With evaluation", stacktrace=str(e)
message="Error during Starts With evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -335,7 +344,10 @@ def auto_ends_with(
return Result(
type="error",
value=None,
error=Error(message="Error during Ends With evaluation", stacktrace=str(e)),
error=Error(
message="Error during Ends With evaluation",
stacktrace=str(traceback.format_exc()),
),
)


@@ -361,7 +373,10 @@ def auto_contains(
return Result(
type="error",
value=None,
error=Error(message="Error during Contains evaluation", stacktrace=str(e)),
error=Error(
message="Error during Contains evaluation",
stacktrace=str(traceback.format_exc()),
),
)


@@ -391,7 +406,8 @@ def auto_contains_any(
type="error",
value=None,
error=Error(
message="Error during Contains Any evaluation", stacktrace=str(e)
message="Error during Contains Any evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -422,7 +438,8 @@ def auto_contains_all(
type="error",
value=None,
error=Error(
message="Error during Contains All evaluation", stacktrace=str(e)
message="Error during Contains All evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -452,7 +469,8 @@ def auto_contains_json(
type="error",
value=None,
error=Error(
message="Error during Contains JSON evaluation", stacktrace=str(e)
message="Error during Contains JSON evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -511,7 +529,7 @@ def auto_levenshtein_distance(
value=None,
error=Error(
message="Error during Levenshtein threshold evaluation",
stacktrace=str(e),
stacktrace=str(traceback.format_exc()),
),
)

@@ -552,7 +570,7 @@ def auto_similarity_match(
value=None,
error=Error(
message="Error during Auto Similarity Match evaluation",
stacktrace=str(e),
stacktrace=str(traceback.format_exc()),
),
)

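Apart from the new `import traceback`, every hunk in this file makes the same substitution: the evaluators' `except` blocks previously stored `str(e)` in the `stacktrace` field, which is only the exception message, not a stack trace. `traceback.format_exc()` captures the file, line, and call chain, which is what you actually need when debugging a failing evaluator. A small standalone illustration of the difference (not code from the repo):

```python
import traceback

def parse_score(raw: str) -> int:
    return int(raw)  # raises ValueError for non-numeric input

try:
    parse_score("not-a-number")
except Exception as e:
    message_only = str(e)                # "invalid literal for int() with base 10: ..."
    full_trace = traceback.format_exc()  # includes the Traceback header, file, line, call chain
    print(message_only)
    print(full_trace)
```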
85 changes: 58 additions & 27 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -249,12 +249,14 @@ def evaluate(
evaluators_results.append(result_object)

all_correct_answers = [
CorrectAnswer(
key=ground_truth_column_name,
value=data_point[ground_truth_column_name],
(
CorrectAnswer(
key=ground_truth_column_name,
value=data_point[ground_truth_column_name],
)
if ground_truth_column_name in data_point
else CorrectAnswer(key=ground_truth_column_name, value="")
)
if ground_truth_column_name in data_point
else CorrectAnswer(key=ground_truth_column_name, value="")
for ground_truth_column_name in ground_truth_column_names
]
# 4. We save the result of the eval scenario in the db
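The hunk above is formatting only: the conditional expression already fell back to an empty `CorrectAnswer` when a ground-truth column was missing from the data point, and it is now simply wrapped in parentheses (Black style) so the comprehension reads as one expression per item. A simplified sketch of the behaviour, with plain dicts standing in for the `CorrectAnswer` model (assumed shapes):

```python
# Ground-truth columns that are absent from the data point fall back to "".
ground_truth_column_names = ["correct_answer", "reference"]
data_point = {"correct_answer": "42"}  # "reference" is missing

all_correct_answers = [
    (
        {"key": column, "value": data_point[column]}
        if column in data_point
        else {"key": column, "value": ""}
    )
    for column in ground_truth_column_names
]
print(all_correct_answers)
# [{'key': 'correct_answer', 'value': '42'}, {'key': 'reference', 'value': ''}]
```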
@@ -313,43 +315,72 @@ def evaluate(
"status": Result(
type="status",
value="EVALUATION_FAILED",
error=Error(message="Evaluation Failed", stacktrace=str(e)),
error=Error(
message="Evaluation Failed",
stacktrace=str(traceback.format_exc()),
),
)
},
)
)
self.update_state(state=states.FAILURE)
return

aggregated_results = loop.run_until_complete(
aggregate_evaluator_results(app, evaluators_aggregated_data)
)
loop.run_until_complete(
update_evaluation_with_aggregated_results(
new_evaluation_db.id, aggregated_results
try:
aggregated_results = loop.run_until_complete(
aggregate_evaluator_results(app, evaluators_aggregated_data)
)
)

failed_evaluation_scenarios = loop.run_until_complete(
check_if_evaluation_contains_failed_evaluation_scenarios(new_evaluation_db.id)
)
loop.run_until_complete(
update_evaluation_with_aggregated_results(
new_evaluation_db.id, aggregated_results
)
)

evaluation_status = Result(
type="status", value=EvaluationStatusEnum.EVALUATION_FINISHED, error=None
)
failed_evaluation_scenarios = loop.run_until_complete(
check_if_evaluation_contains_failed_evaluation_scenarios(
new_evaluation_db.id
)
)

if failed_evaluation_scenarios:
evaluation_status = Result(
type="status",
value=EvaluationStatusEnum.EVALUATION_FINISHED_WITH_ERRORS,
error=None,
type="status", value=EvaluationStatusEnum.EVALUATION_FINISHED, error=None
)

loop.run_until_complete(
update_evaluation(
evaluation_id=new_evaluation_db.id, updates={"status": evaluation_status}
if failed_evaluation_scenarios:
evaluation_status = Result(
type="status",
value=EvaluationStatusEnum.EVALUATION_FINISHED_WITH_ERRORS,
error=None,
)

loop.run_until_complete(
update_evaluation(
evaluation_id=new_evaluation_db.id,
updates={"status": evaluation_status},
)
)
)

except Exception as e:
logger.error(f"An error occurred during evaluation aggregation: {e}")
traceback.print_exc()
loop.run_until_complete(
update_evaluation(
evaluation_id,
{
"status": Result(
type="status",
value="EVALUATION_AGGREGATION_FAILED",
error=Error(
message="Evaluation Aggregation Failed",
stacktrace=str(traceback.format_exc()),
),
)
},
)
)
self.update_state(state=states.FAILURE)
return


async def aggregate_evaluator_results(
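The hunk above is the core of the fix: the aggregation phase (aggregate evaluator results, persist them, check for failed scenarios, set the final status) now runs inside its own `try/except`. If anything in that phase raises, the evaluation is marked `EVALUATION_AGGREGATION_FAILED` and the Celery task state is set to `FAILURE`, instead of the exception escaping and leaving the evaluation "running" forever. A simplified, synchronous sketch of that control flow (the real task uses Celery, an asyncio event loop, and the backend's `update_evaluation` service):

```python
from typing import Callable, Iterable, List

def run_evaluation(
    scenarios: Iterable[str],
    evaluate_one: Callable[[str], float],
    aggregate: Callable[[List[float]], float],
    update_status: Callable[[str], None],
) -> None:
    """Two-phase flow: per-scenario evaluation, then aggregation."""
    try:
        results = [evaluate_one(s) for s in scenarios]
    except Exception:
        update_status("EVALUATION_FAILED")  # phase-1 failure, already handled before this PR
        return
    try:
        aggregate(results)
    except Exception:
        # Before this PR an exception here escaped the task and the evaluation
        # never reached a terminal status; now it ends in a dedicated failure state.
        update_status("EVALUATION_AGGREGATION_FAILED")
        return
    update_status("EVALUATION_FINISHED")

# Example: aggregation raises, so the status ends up EVALUATION_AGGREGATION_FAILED.
run_evaluation(["a", "b"], lambda s: 1.0, lambda rs: 1 / 0, print)
```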