Agenta-AI · aakrem · Jul 7, 2024 · May 26, 2024 · May 26, 2024 · May 26, 2024
diff --git a/.all-contributorsrc b/.all-contributorsrc
@@ -437,6 +437,16 @@
         "bug",
         "code"
       ]
+    },
+    {
+      "login": "jp-agenta",
+      "name": "jp-agenta",
+      "avatar_url": "https://avatars.githubusercontent.com/u/174311389?v=4",
+      "profile": "https://github.com/jp-agenta",
+      "contributions": [
+        "code",
+        "bug"
+      ]
     }
   ],
   "contributorsPerLine": 7,

diff --git a/README.md b/README.md
@@ -171,9 +171,7 @@ Check out our [Contributing Guide](https://docs.agenta.ai/misc/contributing/gett
 ## Contributors ✨
 
 <!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
-
-[![All Contributors](https://img.shields.io/badge/all_contributors-46-orange.svg?style=flat-square)](#contributors-)
-
+[![All Contributors](https://img.shields.io/badge/all_contributors-47-orange.svg?style=flat-square)](#contributors-)
 <!-- ALL-CONTRIBUTORS-BADGE:END -->
 
 Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
@@ -194,7 +192,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
     </tr>
     <tr>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/Pajko97"><img src="https://avatars.githubusercontent.com/u/25198892?v=4?s=100" width="100px;" alt="Pavle Janjusevic"/><br /><sub><b>Pavle Janjusevic</b></sub></a><br /><a href="#infra-Pajko97" title="Infrastructure (Hosting, Build-Tools, etc)">🚇</a></td>
-      <td align="center" valign="top" width="14.28%"><a href="http://kaosiso-ezealigo.netlify.app"><img src="https://avatars.githubusercontent.com/u/99529776?v=4?s=100" width="100px;" alt="Kaosiso Ezealigo"/><br /><sub><b>Kaosiso Ezealigo</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Abekossy" title="Bug reports">🐛</a> <a href="https://github.com/Agenta-AI/agenta/commits?author=bekossy" title="Code">💻</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="http://kaosiso-ezealigo.netlify.app"><img src="https://avatars.githubusercontent.com/u/99529776?v=4?s=100" width="100px;" alt="Kaosi Ezealigo"/><br /><sub><b>Kaosi Ezealigo</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Abekossy" title="Bug reports">🐛</a> <a href="https://github.com/Agenta-AI/agenta/commits?author=bekossy" title="Code">💻</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/albnunes"><img src="https://avatars.githubusercontent.com/u/46302915?v=4?s=100" width="100px;" alt="Alberto Nunes"/><br /><sub><b>Alberto Nunes</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Aalbnunes" title="Bug reports">🐛</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://www.linkedin.com/in/mohammed-maaz-6290b0116/"><img src="https://avatars.githubusercontent.com/u/17180132?v=4?s=100" width="100px;" alt="Maaz Bin Khawar"/><br /><sub><b>Maaz Bin Khawar</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=MohammedMaaz" title="Code">💻</a> <a href="https://github.com/Agenta-AI/agenta/pulls?q=is%3Apr+reviewed-by%3AMohammedMaaz" title="Reviewed Pull Requests">👀</a> <a href="#mentoring-MohammedMaaz" title="Mentoring">🧑‍🏫</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/devgenix"><img src="https://avatars.githubusercontent.com/u/56418363?v=4?s=100" width="100px;" alt="Nehemiah Onyekachukwu Emmanuel"/><br /><sub><b>Nehemiah Onyekachukwu Emmanuel</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=devgenix" title="Code">💻</a> <a href="#example-devgenix" title="Examples">💡</a> <a href="https://github.com/Agenta-AI/agenta/commits?author=devgenix" title="Documentation">📖</a></td>
@@ -242,6 +240,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/youcefs21"><img src="https://avatars.githubusercontent.com/u/34604972?v=4?s=100" width="100px;" alt="Youcef Boumar"/><br /><sub><b>Youcef Boumar</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=youcefs21" title="Documentation">📖</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://github.com/LucasTrg"><img src="https://avatars.githubusercontent.com/u/47852577?v=4?s=100" width="100px;" alt="LucasTrg"/><br /><sub><b>LucasTrg</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=LucasTrg" title="Code">💻</a> <a href="https://github.com/Agenta-AI/agenta/issues?q=author%3ALucasTrg" title="Bug reports">🐛</a></td>
       <td align="center" valign="top" width="14.28%"><a href="https://ashrafchowdury.me"><img src="https://avatars.githubusercontent.com/u/87828904?v=4?s=100" width="100px;" alt="Ashraf Chowdury"/><br /><sub><b>Ashraf Chowdury</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Aashrafchowdury" title="Bug reports">🐛</a> <a href="https://github.com/Agenta-AI/agenta/commits?author=ashrafchowdury" title="Code">💻</a></td>
+      <td align="center" valign="top" width="14.28%"><a href="https://github.com/jp-agenta"><img src="https://avatars.githubusercontent.com/u/174311389?v=4?s=100" width="100px;" alt="jp-agenta"/><br /><sub><b>jp-agenta</b></sub></a><br /><a href="https://github.com/Agenta-AI/agenta/commits?author=jp-agenta" title="Code">💻</a> <a href="https://github.com/Agenta-AI/agenta/issues?q=author%3Ajp-agenta" title="Bug reports">🐛</a></td>
     </tr>
   </tbody>
 </table>

diff --git a/agenta-backend/agenta_backend/celery_config.py b/agenta-backend/agenta_backend/celery_config.py
@@ -7,6 +7,7 @@
 CELERY_ACCEPT_CONTENT = ["json"]
 CELERY_RESULT_SERIALIZER = "json"
 CELERY_TIMEZONE = "UTC"
+CELERY_TASK_TRACK_STARTED = True
 
 CELERY_QUEUES = (
     Queue(

diff --git a/agenta-backend/agenta_backend/models/api/evaluation_model.py b/agenta-backend/agenta_backend/models/api/evaluation_model.py
@@ -33,6 +33,7 @@ class EvaluationStatusEnum(str, Enum):
     EVALUATION_FINISHED = "EVALUATION_FINISHED"
     EVALUATION_FINISHED_WITH_ERRORS = "EVALUATION_FINISHED_WITH_ERRORS"
     EVALUATION_FAILED = "EVALUATION_FAILED"
+    EVALUATION_AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED"
 
 
 class EvaluationScenarioStatusEnum(str, Enum):

diff --git a/agenta-backend/agenta_backend/services/aggregation_service.py b/agenta-backend/agenta_backend/services/aggregation_service.py
@@ -15,28 +15,33 @@ def aggregate_ai_critique(results: List[Result]) -> Result:
         Result: aggregated result
     """
 
-    numeric_scores = []
-    for result in results:
-        try:
+    try:
+        numeric_scores = []
+        for result in results:
             # Extract the first number found in the result value
-            match = re.search(r"\d+", result.value)  # type: ignore
-            if not match:
-                continue
-
-            score = int(match.group())
-            numeric_scores.append(score)
-        except (TypeError, ValueError):
-            # Ignore if the extracted value is not an integer or is None
-            continue
-
-    # Calculate the average of numeric scores if any are present
-    average_value = (
-        sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
-    )
-    return Result(
-        type="number",
-        value=average_value,
-    )
+            match = re.search(r"\d+", result.value)
+            if match:
+                try:
+                    score = int(match.group())
+                    numeric_scores.append(score)
+                except ValueError:
+                    # Ignore if the extracted value is not an integer
+                    continue
+
+        # Calculate the average of numeric scores if any are present
+        average_value = (
+            sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
+        )
+        return Result(
+            type="number",
+            value=average_value,
+        )
+    except Exception as exc:
+        return Result(
+            type="error",
+            value=None,
+            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
+        )
 
 
 def aggregate_binary(results: List[Result]) -> Result:
@@ -73,7 +78,7 @@ def aggregate_float(results: List[Result]) -> Result:
         return Result(
             type="error",
             value=None,
-            error=Error(message="Failed", stacktrace=str(traceback.format_exc())),
+            error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
         )
 
 

diff --git a/agenta-backend/agenta_backend/services/evaluation_service.py b/agenta-backend/agenta_backend/services/evaluation_service.py
@@ -436,7 +436,7 @@ async def create_new_evaluation(
         user_id=str(app.user_id),
         testset=testset,
         status=Result(
-            value=EvaluationStatusEnum.EVALUATION_STARTED, type="status", error=None
+            value=EvaluationStatusEnum.EVALUATION_INITIALIZED, type="status", error=None
         ),
         variant=variant_id,
         variant_revision=str(variant_revision.id),

diff --git a/agenta-backend/agenta_backend/services/evaluators_service.py b/agenta-backend/agenta_backend/services/evaluators_service.py
@@ -2,13 +2,13 @@
 import json
 import logging
 import traceback
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict
 
 import httpx
 from openai import OpenAI
 
-from agenta_backend.models.shared_models import Error, Result
 from agenta_backend.services.security import sandbox
+from agenta_backend.models.shared_models import Error, Result
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -80,7 +80,8 @@ def auto_exact_match(
             type="error",
             value=None,
             error=Error(
-                message="Error during Auto Exact Match evaluation", stacktrace=str(e)
+                message="Error during Auto Exact Match evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -104,7 +105,8 @@ def auto_regex_test(
             type="error",
             value=None,
             error=Error(
-                message="Error during Auto Regex evaluation", stacktrace=str(e)
+                message="Error during Auto Regex evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -187,15 +189,16 @@ def auto_webhook_test(
             value=None,
             error=Error(
                 message="Error during Auto Webhook evaluation; An HTTP error occurred",
-                stacktrace=str(e),
+                stacktrace=str(traceback.format_exc()),
             ),
         )
     except Exception as e:  # pylint: disable=broad-except
         return Result(
             type="error",
             value=None,
             error=Error(
-                message="Error during Auto Webhook evaluation", stacktrace=str(e)
+                message="Error during Auto Webhook evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -225,7 +228,8 @@ def auto_custom_code_run(
             type="error",
             value=None,
             error=Error(
-                message="Error during Auto Custom Code Evaluation", stacktrace=str(e)
+                message="Error during Auto Custom Code Evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -284,7 +288,7 @@ def auto_ai_critique(
             value=None,
             error=Error(
                 message="Error during Auto AI Critique",
-                stacktrace=traceback.format_exc(),
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -312,7 +316,8 @@ def auto_starts_with(
             type="error",
             value=None,
             error=Error(
-                message="Error during Starts With evaluation", stacktrace=str(e)
+                message="Error during Starts With evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -339,7 +344,10 @@ def auto_ends_with(
         return Result(
             type="error",
             value=None,
-            error=Error(message="Error during Ends With evaluation", stacktrace=str(e)),
+            error=Error(
+                message="Error during Ends With evaluation",
+                stacktrace=str(traceback.format_exc()),
+            ),
         )
 
 
@@ -365,7 +373,10 @@ def auto_contains(
         return Result(
             type="error",
             value=None,
-            error=Error(message="Error during Contains evaluation", stacktrace=str(e)),
+            error=Error(
+                message="Error during Contains evaluation",
+                stacktrace=str(traceback.format_exc()),
+            ),
         )
 
 
@@ -395,7 +406,8 @@ def auto_contains_any(
             type="error",
             value=None,
             error=Error(
-                message="Error during Contains Any evaluation", stacktrace=str(e)
+                message="Error during Contains Any evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -426,7 +438,8 @@ def auto_contains_all(
             type="error",
             value=None,
             error=Error(
-                message="Error during Contains All evaluation", stacktrace=str(e)
+                message="Error during Contains All evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -456,7 +469,8 @@ def auto_contains_json(
             type="error",
             value=None,
             error=Error(
-                message="Error during Contains JSON evaluation", stacktrace=str(e)
+                message="Error during Contains JSON evaluation",
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -515,7 +529,7 @@ def auto_levenshtein_distance(
             value=None,
             error=Error(
                 message="Error during Levenshtein threshold evaluation",
-                stacktrace=str(e),
+                stacktrace=str(traceback.format_exc()),
             ),
         )
 
@@ -556,7 +570,7 @@ def auto_similarity_match(
             value=None,
             error=Error(
                 message="Error during Auto Similarity Match evaluation",
-                stacktrace=str(e),
+                stacktrace=str(traceback.format_exc()),
             ),
         )