Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix/code-evaluators #1750

Merged
merged 5 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field
from beanie import Document, Link, PydanticObjectId, iterative_migration

from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field
from beanie import Document, Link, PydanticObjectId


class UserDB(Document):
    """User document, stored in the ``users`` collection."""

    uid: str = Field(default="0", unique=True, index=True)
    username: str = Field(default="agenta")
    email: str = Field(default="demo@agenta.ai", unique=True)
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "users"


class ImageDB(Document):
    """Defines the info needed to get an image and connect it to the app variant"""

    type: Optional[str] = Field(default="image")
    template_uri: Optional[str]
    docker_id: Optional[str] = Field(index=True)
    tags: Optional[str]
    deletable: bool = Field(default=True)
    user: Link[UserDB]
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "docker_images"


class AppDB(Document):
    """Application document, stored in the ``app_db`` collection."""

    app_name: str
    user: Link[UserDB]
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "app_db"


class DeploymentDB(Document):
    """Deployment document, stored in the ``deployments`` collection."""

    app: Link[AppDB]
    user: Link[UserDB]
    container_name: Optional[str]
    container_id: Optional[str]
    uri: Optional[str]
    status: str
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "deployments"


class VariantBaseDB(Document):
    """Variant base document, stored in the ``bases`` collection."""

    app: Link[AppDB]
    user: Link[UserDB]
    base_name: str
    image: Link[ImageDB]
    deployment: Optional[PydanticObjectId]  # Link to deployment
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "bases"


class ConfigDB(BaseModel):
    """Embedded model holding a named configuration and its parameters."""

    config_name: str
    # default_factory gives each instance its own empty dict.
    parameters: Dict[str, Any] = Field(default_factory=dict)


class AppVariantDB(Document):
    """App variant document, stored in the ``app_variants`` collection."""

    app: Link[AppDB]
    variant_name: str
    revision: int
    image: Link[ImageDB]
    user: Link[UserDB]
    modified_by: Link[UserDB]
    # default_factory, not ``default=dict``: the latter makes the default value
    # the ``dict`` type object itself instead of a fresh empty dict.
    parameters: Dict[str, Any] = Field(
        default_factory=dict
    )  # TODO: deprecated. remove
    previous_variant_name: Optional[str]  # TODO: deprecated. remove
    base_name: Optional[str]
    base: Link[VariantBaseDB]
    config_name: Optional[str]
    config: ConfigDB
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    is_deleted: bool = Field(  # TODO: deprecated. remove
        default=False
    )  # soft deletion for using the template variants

    class Settings:
        name = "app_variants"


class AppVariantRevisionsDB(Document):
    """App variant revision document, stored in ``app_variant_revisions``."""

    variant: Link[AppVariantDB]
    revision: int
    modified_by: Link[UserDB]
    base: Link[VariantBaseDB]
    config: ConfigDB
    created_at: datetime  # required: no default is provided
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "app_variant_revisions"


class AppEnvironmentDB(Document):
    """App environment document, stored in the ``environments`` collection."""

    app: Link[AppDB]
    name: str
    user: Link[UserDB]
    revision: int
    deployed_app_variant: Optional[PydanticObjectId]
    deployed_app_variant_revision: Optional[Link[AppVariantRevisionsDB]]
    deployment: Optional[PydanticObjectId]  # reference to deployment
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "environments"


class AppEnvironmentRevisionDB(Document):
    """Environment revision document, stored in ``environments_revisions``."""

    environment: Link[AppEnvironmentDB]
    revision: int
    modified_by: Link[UserDB]
    deployed_app_variant_revision: Optional[PydanticObjectId]
    deployment: Optional[PydanticObjectId]  # reference to deployment
    created_at: datetime  # required: no default is provided

    class Settings:
        name = "environments_revisions"


class TemplateDB(Document):
    """Template document, stored in the ``templates`` collection."""

    type: Optional[str] = Field(default="image")
    template_uri: Optional[str]
    tag_id: Optional[int]
    name: str = Field(unique=True)  # tag name of image
    repo_name: Optional[str]
    title: str
    description: str
    size: Optional[int]
    digest: Optional[str]  # sha256 hash of image digest
    last_pushed: Optional[datetime]

    class Settings:
        name = "templates"


class TestSetDB(Document):
    """Test set document, stored in the ``testsets`` collection."""

    name: str
    app: Link[AppDB]
    csvdata: List[Dict[str, str]]  # one string-to-string mapping per entry
    user: Link[UserDB]
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "testsets"


class EvaluatorConfigDB(Document):
    """Evaluator configuration document, stored in ``evaluators_configs``."""

    app: Link[AppDB]
    user: Link[UserDB]
    name: str
    evaluator_key: str
    # default_factory, not ``default=dict``: the latter makes the default value
    # the ``dict`` type object itself instead of a fresh empty dict.
    settings_values: Dict[str, Any] = Field(default_factory=dict)
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "evaluators_configs"


class Error(BaseModel):
    """Error details: a message and an optional stack trace."""

    message: str
    stacktrace: Optional[str] = None


class Result(BaseModel):
    """Typed result: a type tag, an optional value and an optional error."""

    type: str
    value: Optional[Any] = None
    error: Optional[Error] = None


class InvokationResult(BaseModel):
    """A ``Result`` together with optional cost and latency figures."""

    result: Result
    cost: Optional[float] = None
    latency: Optional[float] = None


class EvaluationScenarioResult(BaseModel):
    """Pairs an evaluator config id with the ``Result`` recorded for it."""

    evaluator_config: PydanticObjectId
    result: Result


class AggregatedResult(BaseModel):
    """Pairs an evaluator config id with a ``Result``.

    NOTE(review): presumably the aggregate over all scenarios of an
    evaluation -- confirm against the code that populates it.
    """

    evaluator_config: PydanticObjectId
    result: Result


class EvaluationScenarioInputDB(BaseModel):
    """One scenario input: its name, type and value, all stored as strings."""

    name: str
    type: str
    value: str


class EvaluationScenarioOutputDB(BaseModel):
    """One scenario output: a ``Result`` with optional cost and latency."""

    result: Result
    cost: Optional[float] = None
    latency: Optional[float] = None


class HumanEvaluationScenarioInput(BaseModel):
    """One human-evaluation scenario input as a name/value pair of strings."""

    input_name: str
    input_value: str


class HumanEvaluationScenarioOutput(BaseModel):
    """One human-evaluation scenario output: a variant id and its output."""

    variant_id: str
    variant_output: str


class HumanEvaluationDB(Document):
    """Human evaluation document, stored in ``human_evaluations``."""

    app: Link[AppDB]
    user: Link[UserDB]
    status: str
    evaluation_type: str
    variants: List[PydanticObjectId]
    variants_revisions: List[PydanticObjectId]
    testset: Link[TestSetDB]
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "human_evaluations"


class HumanEvaluationScenarioDB(Document):
    """Human evaluation scenario, stored in ``human_evaluations_scenarios``."""

    user: Link[UserDB]
    evaluation: Link[HumanEvaluationDB]
    inputs: List[HumanEvaluationScenarioInput]
    outputs: List[HumanEvaluationScenarioOutput]
    vote: Optional[str]
    score: Optional[Any]
    correct_answer: Optional[str]
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: Optional[datetime] = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    is_pinned: Optional[bool]
    note: Optional[str]

    class Settings:
        name = "human_evaluations_scenarios"


class EvaluationDB(Document):
    """Evaluation document, stored in the ``new_evaluations`` collection."""

    app: Link[AppDB]
    user: Link[UserDB]
    status: Result
    testset: Link[TestSetDB]
    variant: PydanticObjectId
    variant_revision: PydanticObjectId
    evaluators_configs: List[PydanticObjectId]
    aggregated_results: List[AggregatedResult]
    average_cost: Optional[Result] = None
    total_cost: Optional[Result] = None
    average_latency: Optional[Result] = None
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "new_evaluations"


class CorrectAnswer(BaseModel):
    """A key/value pair of strings describing one correct answer."""

    key: str
    value: str


class EvaluationScenarioDB(Document):
    """Evaluation scenario, stored in ``new_evaluation_scenarios``."""

    user: Link[UserDB]
    evaluation: Link[EvaluationDB]
    variant_id: PydanticObjectId
    inputs: List[EvaluationScenarioInputDB]
    outputs: List[EvaluationScenarioOutputDB]
    correct_answers: Optional[List[CorrectAnswer]]
    is_pinned: Optional[bool]
    note: Optional[str]
    evaluators_configs: List[PydanticObjectId]
    results: List[EvaluationScenarioResult]
    # NOTE(review): typed as int here while EvaluationScenarioOutputDB uses
    # float for cost/latency -- confirm whether fractional values are expected.
    latency: Optional[int] = None
    cost: Optional[int] = None
    # default_factory: a plain ``default=datetime.now(...)`` is evaluated once
    # at import time, so every document would share the same timestamp.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )

    class Settings:
        name = "new_evaluation_scenarios"


class Forward:
    """Forward migration: backfill ``correct_answer_key`` in evaluator configs."""

    @iterative_migration()
    async def migrate_settings_values(
        self,
        input_document: EvaluatorConfigDB,
        output_document: EvaluatorConfigDB,
    ):
        """Set the output document's ``settings_values`` from the input document.

        When the evaluator key is ``auto_custom_code_run`` and the settings do
        not yet contain ``correct_answer_key``, that key is added with the
        value ``"correct_answer"``. Otherwise the settings are carried over
        unchanged.
        """
        evaluators_using_correct_answer = [
            "auto_custom_code_run",
        ]

        if (
            input_document.evaluator_key in evaluators_using_correct_answer
            and "correct_answer_key" not in input_document.settings_values
        ):
            # Build a new dict rather than adding the key to the input
            # document's dict, so the input document is left untouched.
            output_document.settings_values = {
                **input_document.settings_values,
                "correct_answer_key": "correct_answer",
            }
        else:
            output_document.settings_values = input_document.settings_values


class Backward:
    # No backward migration steps are defined; the class body is a placeholder.
    ...
12 changes: 10 additions & 2 deletions agenta-backend/agenta_backend/resources/evaluators/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,18 @@
"code": {
"label": "Evaluation Code",
"type": "code",
"default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: str,\n correct_answer: str\n) -> float:\n # ...\n return 0.75 # Replace with your calculated score",
"default": "from typing import Dict\n\ndef evaluate(\n app_params: Dict[str, str],\n inputs: Dict[str, str],\n output: str, # output of the llm app\n datapoint: Dict[str, str] # contains the testset row \n) -> float:\n if output in datapoint.get('correct_answer', None):\n return 1.0\n else:\n return 0.0\n",
"description": "Code for evaluating submissions",
"required": True,
}
},
"correct_answer_key": {
"label": "Expected Answer Column",
"default": "correct_answer",
"type": "string",
"advanced": True, # Tells the frontend that this setting is advanced and should be hidden by default
"ground_truth_key": True, # Tells the frontend that this is the name of the column in the test set that should be shown as a ground truth to the user
"description": "The name of the column in the test data that contains the correct answer. This will be shown in the results page.",
},
},
"description": "Code Evaluation allows you to write your own evaluator in Python. You need to provide the Python code for the evaluator.",
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def aggregate_float(results: List[Result]) -> Result:
return Result(
type="error",
value=None,
error=Error(message=str(exc), stacktrace=str(traceback.format_exc())),
error=Error(message="Failed", stacktrace=str(traceback.format_exc())),
)


Expand Down
5 changes: 4 additions & 1 deletion agenta-backend/agenta_backend/services/evaluators_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,11 @@ def auto_custom_code_run(
app_params=app_params,
inputs=inputs,
output=output,
data_point=data_point,
correct_answer=data_point.get(
"correct_answer", None
), # for backward compatibility
code=settings_values["code"],
datapoint=data_point,
)
return Result(type="number", value=result)
except Exception as e: # pylint: disable=broad-except
Expand Down
Loading
Loading