[OPIK-101] sdk improve error logs in evaluate method (#307)
* Draft implementation of catching situations where score arguments are missing

* Fix lint errors

* Refactor raise if score arguments are missing

* Add unit test, fix lint errors

* Add one more unit test for evaluate function

* Remove unused fixture from the test

* Update error message
alexkuzmik authored Sep 24, 2024
1 parent 28399a4 commit 347fd21
Showing 6 changed files with 132 additions and 12 deletions.
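To illustrate the behavior this commit adds: when the evaluation task returns a dictionary that lacks a key required by a metric's score() method, evaluate now fails fast with ScoreMethodMissingArguments instead of an opaque scoring failure. A minimal sketch, adapted from the new unit test further down in this diff (the dataset is mocked and all names are illustrative):

import mock

from opik import evaluation, exceptions
from opik.api_objects.dataset import dataset_item
from opik.evaluation import metrics

# A one-item dataset; only the dataset is mocked here, as in the unit test.
mock_dataset = mock.Mock()
mock_dataset.name = "demo-dataset"
mock_dataset.get_all_items.return_value = [
    dataset_item.DatasetItem(
        id="item-1",
        input={"input": "say hello"},
        expected_output={"output": "hello"},
    ),
]

def demo_task(item: dataset_item.DatasetItem):
    # The returned dict has no "output" key, which metrics.Equals().score() expects.
    return {"result": "hello", "reference": item.expected_output["output"]}

try:
    evaluation.evaluate(
        dataset=mock_dataset,
        task=demo_task,
        experiment_name="demo-experiment",
        scoring_metrics=[metrics.Equals()],
        task_threads=1,
    )
except exceptions.ScoreMethodMissingArguments as e:
    # The message names the missing arguments and lists the keys the task returned.
    print(e)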
31 changes: 31 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/arguments_helpers.py
@@ -0,0 +1,31 @@
from typing import List, Callable, Dict, Any
import inspect
from opik import exceptions


def raise_if_score_arguments_are_missing(
score_function: Callable, score_name: str, kwargs: Dict[str, Any]
) -> None:
signature = inspect.signature(score_function)

parameters = signature.parameters

missing_required_arguments: List[str] = []

for name, param in parameters.items():
if name == "self":
continue

if param.default == inspect.Parameter.empty and param.kind in (
inspect.Parameter.POSITIONAL_OR_KEYWORD,
inspect.Parameter.KEYWORD_ONLY,
):
if name not in kwargs:
missing_required_arguments.append(name)

if len(missing_required_arguments) > 0:
raise exceptions.ScoreMethodMissingArguments(
f"The scoring object {score_name} is missing arguments: {missing_required_arguments}. "
f"These keys were not present in the dictionary returned by the evaluation task. "
f"Evaluation task dictionary keys found: {list(kwargs.keys())}."
)
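For reference, a standalone call to the new helper might look like the sketch below; the metric class here is hypothetical, and the call mirrors how tasks_scorer.py uses the helper later in this diff.

from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, base_metric

class HypotheticalMetric(base_metric.BaseMetric):
    def score(self, output, reference, **ignored_kwargs):
        pass

metric = HypotheticalMetric(name="hypothetical-metric")

# Passes silently: every required score() parameter is present.
arguments_helpers.raise_if_score_arguments_are_missing(
    score_function=metric.score,
    score_name=metric.name,
    kwargs={"output": "hello", "reference": "hello"},
)

try:
    arguments_helpers.raise_if_score_arguments_are_missing(
        score_function=metric.score,
        score_name=metric.name,
        kwargs={"output": "hello"},  # "reference" is missing
    )
except exceptions.ScoreMethodMissingArguments as e:
    print(e)  # names the missing argument and the keys that were provided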
13 changes: 6 additions & 7 deletions sdks/python/src/opik/evaluation/task_output.py
@@ -1,15 +1,14 @@
from typing import Any

import pydantic


class TaskOutput(pydantic.BaseModel):
# Keys that are already used by our metrics.
input: Any = None
output: Any = None
expected: Any = None
context: Any = None
metadata: Any = None
#
# input
# output
# expected
# context
# metadata

# Model config allows providing custom fields.
# It might be especially relevant for custom metrics.
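The rest of the model is collapsed in this view. Judging by the comment above, it is the model config that lets arbitrary task keys through; in pydantic v2 that would typically be something like model_config = pydantic.ConfigDict(extra="allow"), though the exact setting is not shown in this diff. A rough sketch of the intended flow, under that assumption:

import pydantic

class TaskOutputSketch(pydantic.BaseModel):
    # Assumed config; the real TaskOutput's config is collapsed in the diff above.
    model_config = pydantic.ConfigDict(extra="allow")

out = TaskOutputSketch(output="hello", reference="hello", custom_key=42)

# model_dump() includes the custom keys, and this dict is exactly what
# tasks_scorer passes to metric.score(**kwargs) and to the new
# raise_if_score_arguments_are_missing check.
print(out.model_dump(exclude_none=False))  # {'output': 'hello', 'reference': 'hello', 'custom_key': 42}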
23 changes: 19 additions & 4 deletions sdks/python/src/opik/evaluation/tasks_scorer.py
@@ -1,14 +1,17 @@
import tqdm
import logging
from concurrent import futures

from typing import List
from .types import LLMTask
from opik.api_objects.dataset import dataset, dataset_item
from opik.api_objects import opik_client, trace
from opik import context_storage, opik_context
from opik import context_storage, opik_context, exceptions

from . import task_output, test_case, test_result
from .metrics import score_result, base_metric
from .metrics import arguments_helpers, score_result, base_metric

LOGGER = logging.getLogger(__name__)


def _score_test_case(
@@ -17,15 +20,27 @@ def _score_test_case(
score_results = []
for metric in scoring_metrics:
try:
result = metric.score(
**test_case_.task_output.model_dump(exclude_none=False)
score_kwargs = test_case_.task_output.model_dump(exclude_none=False)
arguments_helpers.raise_if_score_arguments_are_missing(
score_function=metric.score,
score_name=metric.name,
kwargs=score_kwargs,
)
result = metric.score(**score_kwargs)
if isinstance(result, list):
score_results += result
else:
score_results.append(result)
except exceptions.ScoreMethodMissingArguments:
raise
except Exception as e:
# This can be problematic if the metric returns a list of score results, as we will not know the names of the metrics that have failed
LOGGER.error(
"Failed to compute metric %s. Score result will be marked as failed.",
metric.name,
exc_info=True,
)

score_results.append(
score_result.ScoreResult(
name=metric.name, value=0.0, reason=str(e), scoring_failed=True
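The hunk is truncated here, but the control flow above is the core of the change: a missing-argument problem aborts the evaluation immediately, while any other exception raised by a metric is logged and converted into a failed score. A simplified sketch of the two paths (not the exact implementation):

from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, score_result

def score_with_fallback(metric, score_kwargs):
    # Simplified version of the logic in _score_test_case above.
    try:
        arguments_helpers.raise_if_score_arguments_are_missing(
            score_function=metric.score,
            score_name=metric.name,
            kwargs=score_kwargs,
        )
        return metric.score(**score_kwargs)
    except exceptions.ScoreMethodMissingArguments:
        raise  # a configuration problem: fail the whole evaluation loudly
    except Exception as e:
        # A runtime problem inside the metric: record a failed score instead.
        return score_result.ScoreResult(
            name=metric.name, value=0.0, reason=str(e), scoring_failed=True
        )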
4 changes: 4 additions & 0 deletions sdks/python/src/opik/exceptions.py
@@ -12,3 +12,7 @@ class ContextExtractorNotSet(OpikException):

class ConfigurationError(OpikException):
pass


class ScoreMethodMissingArguments(OpikException):
pass
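Since the new exception subclasses OpikException, existing handlers written against the base class still catch it, while callers can also target it specifically. A tiny self-contained sketch:

from opik import exceptions

assert issubclass(exceptions.ScoreMethodMissingArguments, exceptions.OpikException)

try:
    raise exceptions.ScoreMethodMissingArguments("illustrative message: the scoring object is missing arguments")
except exceptions.ScoreMethodMissingArguments as e:
    # Callers can catch the specific error (as tasks_scorer does to re-raise it)
    # or fall back to the broader OpikException.
    print(e)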
@@ -0,0 +1,35 @@
from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, base_metric

import pytest


@pytest.mark.parametrize(
argnames="score_kwargs, should_raise",
argvalues=[
({"a": 1, "b": 2}, False),
({"a": 1, "b": 2, "c": 3}, False),
({"a": 1, "c": 3}, True),
({}, True),
],
)
def test_raise_if_score_arguments_are_missing(score_kwargs, should_raise):
class SomeMetric(base_metric.BaseMetric):
def score(self, a, b, **ignored_kwargs):
pass

some_metric = SomeMetric(name="some-metric")

if should_raise:
with pytest.raises(exceptions.ScoreMethodMissingArguments):
arguments_helpers.raise_if_score_arguments_are_missing(
score_function=some_metric.score,
score_name=some_metric.name,
kwargs=score_kwargs,
)
else:
arguments_helpers.raise_if_score_arguments_are_missing(
score_function=some_metric.score,
score_name=some_metric.name,
kwargs=score_kwargs,
)
38 changes: 37 additions & 1 deletion sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -1,7 +1,9 @@
import mock
import pytest

from opik.api_objects.dataset import dataset_item
from opik.api_objects import opik_client
from opik import evaluation
from opik import evaluation, exceptions
from opik.evaluation import metrics
from ...testlib import backend_emulator_message_processor, ANY_BUT_NONE, assert_equal
from ...testlib.models import (
@@ -123,3 +125,37 @@ def say_task(dataset_item: dataset_item.DatasetItem):
EXPECTED_TRACE_TREES, fake_message_processor_.trace_trees
):
assert_equal(expected_trace, actual_trace)


def test_evaluate___output_key_is_missing_in_task_output_dict__equals_metric_misses_output_argument__exception_raised():
# The dataset is the only thing mocked in this test because evaluate
# should raise an exception right after the first attempt to compute
# the Equals metric score.
mock_dataset = mock.Mock()
mock_dataset.name = "the-dataset-name"
mock_dataset.get_all_items.return_value = [
dataset_item.DatasetItem(
id="dataset-item-id-1",
input={"input": "say hello"},
expected_output={"output": "hello"},
),
]

def say_task(dataset_item: dataset_item.DatasetItem):
if dataset_item.input["input"] == "say hello":
return {
"the-key-that-is-not-named-output": "hello",
"reference": dataset_item.expected_output["output"],
}
raise Exception

with pytest.raises(exceptions.ScoreMethodMissingArguments):
evaluation.evaluate(
dataset=mock_dataset,
task=say_task,
experiment_name="the-experiment-name",
scoring_metrics=[metrics.Equals()],
task_threads=1,
)

mock_dataset.get_all_items.assert_called_once()
