From 347fd210a0b2bd38fced1e0c733e468c7808a7d3 Mon Sep 17 00:00:00 2001
From: Aliaksandr Kuzmik <98702584+alexkuzmik@users.noreply.github.com>
Date: Tue, 24 Sep 2024 16:57:25 +0200
Subject: [PATCH] [OPIK-101] sdk improve errors logs in evaluate method (#307)

* Draft implementation of catching missing score arguments situations

* Fix lint errors

* Refactor raise if score arguments are missing

* Add unit test, fix lint errors

* Add one more unit test for evaluate function

* Remove unused fixture from the test

* Update error message
---
 .../evaluation/metrics/arguments_helpers.py   | 31 +++++++++++++++
 .../python/src/opik/evaluation/task_output.py | 13 +++----
 .../src/opik/evaluation/tasks_scorer.py       | 23 +++++++++--
 sdks/python/src/opik/exceptions.py            |  4 ++
 .../metrics/test_arguments_helpers.py         | 35 +++++++++++++++++
 .../tests/unit/evaluation/test_evaluate.py    | 38 ++++++++++++++++++-
 6 files changed, 132 insertions(+), 12 deletions(-)
 create mode 100644 sdks/python/src/opik/evaluation/metrics/arguments_helpers.py
 create mode 100644 sdks/python/tests/unit/evaluation/metrics/test_arguments_helpers.py

diff --git a/sdks/python/src/opik/evaluation/metrics/arguments_helpers.py b/sdks/python/src/opik/evaluation/metrics/arguments_helpers.py
new file mode 100644
index 000000000..d77de865d
--- /dev/null
+++ b/sdks/python/src/opik/evaluation/metrics/arguments_helpers.py
@@ -0,0 +1,31 @@
+from typing import List, Callable, Dict, Any
+import inspect
+from opik import exceptions
+
+
+def raise_if_score_arguments_are_missing(
+    score_function: Callable, score_name: str, kwargs: Dict[str, Any]
+) -> None:
+    signature = inspect.signature(score_function)
+
+    parameters = signature.parameters
+
+    missing_required_arguments: List[str] = []
+
+    for name, param in parameters.items():
+        if name == "self":
+            continue
+
+        if param.default == inspect.Parameter.empty and param.kind in (
+            inspect.Parameter.POSITIONAL_OR_KEYWORD,
+            inspect.Parameter.KEYWORD_ONLY,
+        ):
+            if name not in kwargs:
+                missing_required_arguments.append(name)
+
+    if len(missing_required_arguments) > 0:
+        raise exceptions.ScoreMethodMissingArguments(
+            f"The scoring object {score_name} is missing arguments: {missing_required_arguments}. "
+            f"These keys were not present in the dictionary returned by the evaluation task. "
+            f"Evaluation task dictionary keys found: {list(kwargs.keys())}."
+        )
diff --git a/sdks/python/src/opik/evaluation/task_output.py b/sdks/python/src/opik/evaluation/task_output.py
index 99c43d70e..b8abace09 100644
--- a/sdks/python/src/opik/evaluation/task_output.py
+++ b/sdks/python/src/opik/evaluation/task_output.py
@@ -1,15 +1,14 @@
-from typing import Any
-
 import pydantic
 
 
 class TaskOutput(pydantic.BaseModel):
     # Keys that are already used by our metrics.
-    input: Any = None
-    output: Any = None
-    expected: Any = None
-    context: Any = None
-    metadata: Any = None
+    #
+    # input
+    # output
+    # expected
+    # context
+    # metadata
 
     # Model config allows to provide custom fields.
     # It might be especially relevant for custom metrics.
diff --git a/sdks/python/src/opik/evaluation/tasks_scorer.py b/sdks/python/src/opik/evaluation/tasks_scorer.py
index cdabb6328..1f9c3f68b 100644
--- a/sdks/python/src/opik/evaluation/tasks_scorer.py
+++ b/sdks/python/src/opik/evaluation/tasks_scorer.py
@@ -1,14 +1,17 @@
 import tqdm
+import logging
 from concurrent import futures
 from typing import List
 
 from .types import LLMTask
 from opik.api_objects.dataset import dataset, dataset_item
 from opik.api_objects import opik_client, trace
-from opik import context_storage, opik_context
+from opik import context_storage, opik_context, exceptions
 
 from . import task_output, test_case, test_result
-from .metrics import score_result, base_metric
+from .metrics import arguments_helpers, score_result, base_metric
+
+LOGGER = logging.getLogger(__name__)
 
 
 def _score_test_case(
@@ -17,15 +20,27 @@ def _score_test_case(
     score_results = []
     for metric in scoring_metrics:
         try:
-            result = metric.score(
-                **test_case_.task_output.model_dump(exclude_none=False)
+            score_kwargs = test_case_.task_output.model_dump(exclude_none=False)
+            arguments_helpers.raise_if_score_arguments_are_missing(
+                score_function=metric.score,
+                score_name=metric.name,
+                kwargs=score_kwargs,
             )
+            result = metric.score(**score_kwargs)
             if isinstance(result, list):
                 score_results += result
             else:
                 score_results.append(result)
+        except exceptions.ScoreMethodMissingArguments:
+            raise
         except Exception as e:
             # This can be problematic if the metric returns a list of strings as we will not know the name of the metrics that have failed
+            LOGGER.error(
+                "Failed to compute metric %s. Score result will be marked as failed.",
+                metric.name,
+                exc_info=True,
+            )
+
             score_results.append(
                 score_result.ScoreResult(
                     name=metric.name, value=0.0, reason=str(e), scoring_failed=True
diff --git a/sdks/python/src/opik/exceptions.py b/sdks/python/src/opik/exceptions.py
index a47c6ab66..ce8d0c7b9 100644
--- a/sdks/python/src/opik/exceptions.py
+++ b/sdks/python/src/opik/exceptions.py
@@ -12,3 +12,7 @@ class ContextExtractorNotSet(OpikException):
 
 class ConfigurationError(OpikException):
     pass
+
+
+class ScoreMethodMissingArguments(OpikException):
+    pass
diff --git a/sdks/python/tests/unit/evaluation/metrics/test_arguments_helpers.py b/sdks/python/tests/unit/evaluation/metrics/test_arguments_helpers.py
new file mode 100644
index 000000000..d1e938fc8
--- /dev/null
+++ b/sdks/python/tests/unit/evaluation/metrics/test_arguments_helpers.py
@@ -0,0 +1,35 @@
+from opik import exceptions
+from opik.evaluation.metrics import arguments_helpers, base_metric
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    argnames="score_kwargs, should_raise",
+    argvalues=[
+        ({"a": 1, "b": 2}, False),
+        ({"a": 1, "b": 2, "c": 3}, False),
+        ({"a": 1, "c": 3}, True),
+        ({}, True),
+    ],
+)
+def test_raise_if_score_arguments_are_missing(score_kwargs, should_raise):
+    class SomeMetric(base_metric.BaseMetric):
+        def score(self, a, b, **ignored_kwargs):
+            pass
+
+    some_metric = SomeMetric(name="some-metric")
+
+    if should_raise:
+        with pytest.raises(exceptions.ScoreMethodMissingArguments):
+            arguments_helpers.raise_if_score_arguments_are_missing(
+                score_function=some_metric.score,
+                score_name=some_metric.name,
+                kwargs=score_kwargs,
+            )
+    else:
+        arguments_helpers.raise_if_score_arguments_are_missing(
+            score_function=some_metric.score,
+            score_name=some_metric.name,
+            kwargs=score_kwargs,
+        )
diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py b/sdks/python/tests/unit/evaluation/test_evaluate.py
index 7a18c2037..28c636342 100644
--- a/sdks/python/tests/unit/evaluation/test_evaluate.py
+++ b/sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -1,7 +1,9 @@
 import mock
+import pytest
+
 from opik.api_objects.dataset import dataset_item
 from opik.api_objects import opik_client
-from opik import evaluation
+from opik import evaluation, exceptions
 from opik.evaluation import metrics
 from ...testlib import backend_emulator_message_processor, ANY_BUT_NONE, assert_equal
 from ...testlib.models import (
@@ -123,3 +125,37 @@ def say_task(dataset_item: dataset_item.DatasetItem):
         EXPECTED_TRACE_TREES, fake_message_processor_.trace_trees
     ):
         assert_equal(expected_trace, actual_trace)
+
+
+def test_evaluate___output_key_is_missing_in_task_output_dict__equals_metric_misses_output_argument__exception_raised():
+    # Dataset is the only thing which is mocked for this test because
+    # evaluate should raise an exception right after the first attempt
+    # to compute Equals metric score.
+    mock_dataset = mock.Mock()
+    mock_dataset.name = "the-dataset-name"
+    mock_dataset.get_all_items.return_value = [
+        dataset_item.DatasetItem(
+            id="dataset-item-id-1",
+            input={"input": "say hello"},
+            expected_output={"output": "hello"},
+        ),
+    ]
+
+    def say_task(dataset_item: dataset_item.DatasetItem):
+        if dataset_item.input["input"] == "say hello":
+            return {
+                "the-key-that-is-not-named-output": "hello",
+                "reference": dataset_item.expected_output["output"],
+            }
+        raise Exception
+
+    with pytest.raises(exceptions.ScoreMethodMissingArguments):
+        evaluation.evaluate(
+            dataset=mock_dataset,
+            task=say_task,
+            experiment_name="the-experiment-name",
+            scoring_metrics=[metrics.Equals()],
+            task_threads=1,
+        )
+
+    mock_dataset.get_all_items.assert_called_once()
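
Reviewer note (not part of the patch): below is a minimal sketch of how the new error path behaves once this change is installed. It calls the helper added in arguments_helpers.py directly with a kwargs dict that is missing the required "output" key. The RequiresOutput metric and the literal dict values are made up for illustration; base_metric.BaseMetric, arguments_helpers.raise_if_score_arguments_are_missing, and exceptions.ScoreMethodMissingArguments come from this patch.

# Sketch only: assumes an opik installation that includes this patch.
from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, base_metric


class RequiresOutput(base_metric.BaseMetric):
    # Hypothetical metric whose score() requires "output" and "reference",
    # similar in shape to the metrics shipped with the SDK.
    def score(self, output, reference, **ignored_kwargs):
        pass


metric = RequiresOutput(name="requires-output")

# The evaluation task returned a dict without the "output" key, so the helper
# raises ScoreMethodMissingArguments before metric.score() is ever called,
# instead of letting it fail later with a generic TypeError.
try:
    arguments_helpers.raise_if_score_arguments_are_missing(
        score_function=metric.score,
        score_name=metric.name,
        kwargs={"not-output": "hello", "reference": "hello"},
    )
except exceptions.ScoreMethodMissingArguments as err:
    print(err)  # names the missing argument(s) and the keys the task did return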