Fix two likely causes of infinitely running evaluations #1840

Merged (10 commits) on Jul 5, 2024
@@ -33,6 +33,7 @@ class EvaluationStatusEnum(str, Enum):
EVALUATION_FINISHED = "EVALUATION_FINISHED"
EVALUATION_FINISHED_WITH_ERRORS = "EVALUATION_FINISHED_WITH_ERRORS"
EVALUATION_FAILED = "EVALUATION_FAILED"
EVALUATION_AGGREGATION_FAILED = "EVALUATION_AGGREGATION_FAILED"


class EvaluationScenarioStatusEnum(str, Enum):
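This hunk adds a fourth terminal state, `EVALUATION_AGGREGATION_FAILED`, so an evaluation that fails during result aggregation can be marked as failed instead of appearing to run forever. A minimal sketch of how a consumer might treat it — the helper below is an illustration assumed for this note, not code from the PR:

```python
# Hypothetical sketch (not part of this PR): treat the new status as terminal
# so polling clients stop waiting instead of spinning indefinitely.
TERMINAL_STATUSES = {
    "EVALUATION_FINISHED",
    "EVALUATION_FINISHED_WITH_ERRORS",
    "EVALUATION_FAILED",
    "EVALUATION_AGGREGATION_FAILED",
}

def is_terminal(status: str) -> bool:
    return status in TERMINAL_STATUSES
```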
49 changes: 28 additions & 21 deletions agenta-backend/agenta_backend/services/aggregation_service.py
@@ -15,26 +15,33 @@ def aggregate_ai_critique(results: List[Result]) -> Result:
Result: aggregated result
"""

numeric_scores = []
for result in results:
# Extract the first number found in the result value
match = re.search(r"\d+", result.value)
if match:
try:
score = int(match.group())
numeric_scores.append(score)
except ValueError:
# Ignore if the extracted value is not an integer
continue

# Calculate the average of numeric scores if any are present
average_value = (
sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
)
return Result(
type="number",
value=average_value,
)
try:
numeric_scores = []
for result in results:
# Extract the first number found in the result value
match = re.search(r"\d+", result.value)
if match:
try:
score = int(match.group())
numeric_scores.append(score)
except ValueError:
# Ignore if the extracted value is not an integer
continue

# Calculate the average of numeric scores if any are present
average_value = (
sum(numeric_scores) / len(numeric_scores) if numeric_scores else None
)
return Result(
type="number",
value=average_value,
)
except Exception as exc:
return Result(
type="error",
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


def aggregate_binary(results: List[Result]) -> Result:
@@ -71,7 +78,7 @@ def aggregate_float(results: List[Result]) -> Result:
return Result(
type="error",
value=None,
error=Error(message="Failed", stacktrace=str(traceback.format_exc())),
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
)


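Both hunks in this file follow the same idea: if anything goes wrong while aggregating AI-critique scores (for example a non-string `result.value` reaching `re.search`), the function now returns an error `Result` instead of letting the exception escape the Celery task, and `aggregate_float` now reports the actual exception text rather than a generic "Failed". A standalone sketch of the pattern, with simplified stand-ins for the backend's `Result`/`Error` models (assumed shapes, not the real Pydantic models):

```python
import re
import traceback
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class Error:            # simplified stand-in for the backend model
    message: str
    stacktrace: str

@dataclass
class Result:           # simplified stand-in for the backend model
    type: str
    value: object = None
    error: Optional[Error] = None

def aggregate_ai_critique(results: List[Result]) -> Result:
    try:
        scores = []
        for result in results:
            match = re.search(r"\d+", result.value)  # TypeError if value is None
            if match:
                scores.append(int(match.group()))
        average = sum(scores) / len(scores) if scores else None
        return Result(type="number", value=average)
    except Exception as exc:
        # The failure is captured as data instead of propagating, so the
        # surrounding task can still finish and record a final status.
        return Result(
            type="error",
            value=None,
            error=Error(message=str(exc), stacktrace=traceback.format_exc()),
        )

# An error-typed input (value=None) no longer crashes the aggregation step:
print(aggregate_ai_critique([Result(type="error", value=None)]))
```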
46 changes: 32 additions & 14 deletions agenta-backend/agenta_backend/services/evaluators_service.py
@@ -1,6 +1,7 @@
import json
import logging
import re
import traceback
from typing import Any, Dict, List, Tuple

import httpx
@@ -79,7 +80,8 @@ def auto_exact_match(
type="error",
value=None,
error=Error(
message="Error during Auto Exact Match evaluation", stacktrace=str(e)
message="Error during Auto Exact Match evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -103,7 +105,8 @@ def auto_regex_test(
type="error",
value=None,
error=Error(
message="Error during Auto Regex evaluation", stacktrace=str(e)
message="Error during Auto Regex evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -186,15 +189,16 @@ def auto_webhook_test(
value=None,
error=Error(
message="Error during Auto Webhook evaluation; An HTTP error occurred",
stacktrace=str(e),
stacktrace=str(traceback.format_exc()),
),
)
except Exception as e: # pylint: disable=broad-except
return Result(
type="error",
value=None,
error=Error(
message="Error during Auto Webhook evaluation", stacktrace=str(e)
message="Error during Auto Webhook evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -224,7 +228,8 @@ def auto_custom_code_run(
type="error",
value=None,
error=Error(
message="Error during Auto Custom Code Evaluation", stacktrace=str(e)
message="Error during Auto Custom Code Evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -281,7 +286,10 @@ def auto_ai_critique(
return Result(
type="error",
value=None,
error=Error(message="Error during Auto AI Critique", stacktrace=str(e)),
error=Error(
message="Error during Auto AI Critique",
stacktrace=str(traceback.format_exc()),
),
)


@@ -308,7 +316,8 @@ def auto_starts_with(
type="error",
value=None,
error=Error(
message="Error during Starts With evaluation", stacktrace=str(e)
message="Error during Starts With evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -335,7 +344,10 @@ def auto_ends_with(
return Result(
type="error",
value=None,
error=Error(message="Error during Ends With evaluation", stacktrace=str(e)),
error=Error(
message="Error during Ends With evaluation",
stacktrace=str(traceback.format_exc()),
),
)


@@ -361,7 +373,10 @@ def auto_contains(
return Result(
type="error",
value=None,
error=Error(message="Error during Contains evaluation", stacktrace=str(e)),
error=Error(
message="Error during Contains evaluation",
stacktrace=str(traceback.format_exc()),
),
)


@@ -391,7 +406,8 @@ def auto_contains_any(
type="error",
value=None,
error=Error(
message="Error during Contains Any evaluation", stacktrace=str(e)
message="Error during Contains Any evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -422,7 +438,8 @@ def auto_contains_all(
type="error",
value=None,
error=Error(
message="Error during Contains All evaluation", stacktrace=str(e)
message="Error during Contains All evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -452,7 +469,8 @@ def auto_contains_json(
type="error",
value=None,
error=Error(
message="Error during Contains JSON evaluation", stacktrace=str(e)
message="Error during Contains JSON evaluation",
stacktrace=str(traceback.format_exc()),
),
)

@@ -511,7 +529,7 @@ def auto_levenshtein_distance(
value=None,
error=Error(
message="Error during Levenshtein threshold evaluation",
stacktrace=str(e),
stacktrace=str(traceback.format_exc()),
),
)

@@ -552,7 +570,7 @@ def auto_similarity_match(
value=None,
error=Error(
message="Error during Auto Similarity Match evaluation",
stacktrace=str(e),
stacktrace=str(traceback.format_exc()),
),
)

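Apart from the new `import traceback`, every hunk in this file makes the same substitution: the evaluators' `except` blocks previously stored `str(e)` in the `stacktrace` field, which is only the exception message, not a stack trace. `traceback.format_exc()` captures the file, line, and call chain, which is what you actually need when debugging a failing evaluator. A small standalone illustration of the difference (not code from the repo):

```python
import traceback

def parse_score(raw: str) -> int:
    return int(raw)  # raises ValueError for non-numeric input

try:
    parse_score("not-a-number")
except Exception as e:
    message_only = str(e)                # "invalid literal for int() with base 10: ..."
    full_trace = traceback.format_exc()  # includes the Traceback header, file, line, call chain
    print(message_only)
    print(full_trace)
```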
85 changes: 58 additions & 27 deletions agenta-backend/agenta_backend/tasks/evaluations.py
@@ -249,12 +249,14 @@ def evaluate(
evaluators_results.append(result_object)

all_correct_answers = [
CorrectAnswer(
key=ground_truth_column_name,
value=data_point[ground_truth_column_name],
(
CorrectAnswer(
key=ground_truth_column_name,
value=data_point[ground_truth_column_name],
)
if ground_truth_column_name in data_point
else CorrectAnswer(key=ground_truth_column_name, value="")
)
if ground_truth_column_name in data_point
else CorrectAnswer(key=ground_truth_column_name, value="")
for ground_truth_column_name in ground_truth_column_names
]
# 4. We save the result of the eval scenario in the db
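The hunk above is formatting only: the conditional expression already fell back to an empty `CorrectAnswer` when a ground-truth column was missing from the data point, and it is now simply wrapped in parentheses (Black style) so the comprehension reads as one expression per item. A simplified sketch of the behaviour, with plain dicts standing in for the `CorrectAnswer` model (assumed shapes):

```python
# Ground-truth columns that are absent from the data point fall back to "".
ground_truth_column_names = ["correct_answer", "reference"]
data_point = {"correct_answer": "42"}  # "reference" is missing

all_correct_answers = [
    (
        {"key": column, "value": data_point[column]}
        if column in data_point
        else {"key": column, "value": ""}
    )
    for column in ground_truth_column_names
]
print(all_correct_answers)
# [{'key': 'correct_answer', 'value': '42'}, {'key': 'reference', 'value': ''}]
```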
@@ -313,43 +315,72 @@ def evaluate(
"status": Result(
type="status",
value="EVALUATION_FAILED",
error=Error(message="Evaluation Failed", stacktrace=str(e)),
error=Error(
message="Evaluation Failed",
stacktrace=str(traceback.format_exc()),
),
)
},
)
)
self.update_state(state=states.FAILURE)
return

aggregated_results = loop.run_until_complete(
aggregate_evaluator_results(app, evaluators_aggregated_data)
)
loop.run_until_complete(
update_evaluation_with_aggregated_results(
new_evaluation_db.id, aggregated_results
try:
aggregated_results = loop.run_until_complete(
aggregate_evaluator_results(app, evaluators_aggregated_data)
)
)

failed_evaluation_scenarios = loop.run_until_complete(
check_if_evaluation_contains_failed_evaluation_scenarios(new_evaluation_db.id)
)
loop.run_until_complete(
update_evaluation_with_aggregated_results(
new_evaluation_db.id, aggregated_results
)
)

evaluation_status = Result(
type="status", value=EvaluationStatusEnum.EVALUATION_FINISHED, error=None
)
failed_evaluation_scenarios = loop.run_until_complete(
check_if_evaluation_contains_failed_evaluation_scenarios(
new_evaluation_db.id
)
)

if failed_evaluation_scenarios:
evaluation_status = Result(
type="status",
value=EvaluationStatusEnum.EVALUATION_FINISHED_WITH_ERRORS,
error=None,
type="status", value=EvaluationStatusEnum.EVALUATION_FINISHED, error=None
)

loop.run_until_complete(
update_evaluation(
evaluation_id=new_evaluation_db.id, updates={"status": evaluation_status}
if failed_evaluation_scenarios:
evaluation_status = Result(
type="status",
value=EvaluationStatusEnum.EVALUATION_FINISHED_WITH_ERRORS,
error=None,
)

loop.run_until_complete(
update_evaluation(
evaluation_id=new_evaluation_db.id,
updates={"status": evaluation_status},
)
)
)

except Exception as e:
logger.error(f"An error occurred during evaluation aggregation: {e}")
traceback.print_exc()
loop.run_until_complete(
update_evaluation(
evaluation_id,
{
"status": Result(
type="status",
value="EVALUATION_AGGREGATION_FAILED",
error=Error(
message="Evaluation Aggregation Failed",
stacktrace=str(traceback.format_exc()),
),
)
},
)
)
self.update_state(state=states.FAILURE)
return


async def aggregate_evaluator_results(
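The hunk above is the core of the fix: the aggregation phase (aggregate evaluator results, persist them, check for failed scenarios, set the final status) now runs inside its own `try/except`. If anything in that phase raises, the evaluation is marked `EVALUATION_AGGREGATION_FAILED` and the Celery task state is set to `FAILURE`, instead of the exception escaping and leaving the evaluation "running" forever. A simplified, synchronous sketch of that control flow (the real task uses Celery, an asyncio event loop, and the backend's `update_evaluation` service):

```python
from typing import Callable, Iterable, List

def run_evaluation(
    scenarios: Iterable[str],
    evaluate_one: Callable[[str], float],
    aggregate: Callable[[List[float]], float],
    update_status: Callable[[str], None],
) -> None:
    """Two-phase flow: per-scenario evaluation, then aggregation."""
    try:
        results = [evaluate_one(s) for s in scenarios]
    except Exception:
        update_status("EVALUATION_FAILED")  # phase-1 failure, already handled before this PR
        return
    try:
        aggregate(results)
    except Exception:
        # Before this PR an exception here escaped the task and the evaluation
        # never reached a terminal status; now it ends in a dedicated failure state.
        update_status("EVALUATION_AGGREGATION_FAILED")
        return
    update_status("EVALUATION_FINISHED")

# Example: aggregation raises, so the status ends up EVALUATION_AGGREGATION_FAILED.
run_evaluation(["a", "b"], lambda s: 1.0, lambda rs: 1 / 0, print)
```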