[OPIK-101] sdk improve error logs in evaluate method (#307)
* Draft implementation of catching situations where score arguments are missing

* Fix lint errors

* Refactor raise if score arguments are missing

* Add unit test, fix lint errors

* Add one more unit test for evaluate function

* Remove unused fixture from the test

* Update error message
alexkuzmik authored Sep 24, 2024
1 parent 28399a4 commit 347fd21
Showing 6 changed files with 132 additions and 12 deletions.
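To illustrate the behavior this commit adds: when the evaluation task returns a dictionary that lacks a key required by a metric's score() method, evaluate now fails fast with ScoreMethodMissingArguments instead of an opaque scoring failure. A minimal sketch, adapted from the new unit test further down in this diff (the dataset is mocked and all names are illustrative):

import mock

from opik import evaluation, exceptions
from opik.api_objects.dataset import dataset_item
from opik.evaluation import metrics

# A one-item dataset; only the dataset is mocked here, as in the unit test.
mock_dataset = mock.Mock()
mock_dataset.name = "demo-dataset"
mock_dataset.get_all_items.return_value = [
    dataset_item.DatasetItem(
        id="item-1",
        input={"input": "say hello"},
        expected_output={"output": "hello"},
    ),
]

def demo_task(item: dataset_item.DatasetItem):
    # The returned dict has no "output" key, which metrics.Equals().score() expects.
    return {"result": "hello", "reference": item.expected_output["output"]}

try:
    evaluation.evaluate(
        dataset=mock_dataset,
        task=demo_task,
        experiment_name="demo-experiment",
        scoring_metrics=[metrics.Equals()],
        task_threads=1,
    )
except exceptions.ScoreMethodMissingArguments as e:
    # The message names the missing arguments and lists the keys the task returned.
    print(e)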
31 changes: 31 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/arguments_helpers.py
@@ -0,0 +1,31 @@
from typing import List, Callable, Dict, Any
import inspect
from opik import exceptions


def raise_if_score_arguments_are_missing(
score_function: Callable, score_name: str, kwargs: Dict[str, Any]
) -> None:
signature = inspect.signature(score_function)

parameters = signature.parameters

missing_required_arguments: List[str] = []

for name, param in parameters.items():
if name == "self":
continue

if param.default == inspect.Parameter.empty and param.kind in (
inspect.Parameter.POSITIONAL_OR_KEYWORD,
inspect.Parameter.KEYWORD_ONLY,
):
if name not in kwargs:
missing_required_arguments.append(name)

if len(missing_required_arguments) > 0:
raise exceptions.ScoreMethodMissingArguments(
f"The scoring object {score_name} is missing arguments: {missing_required_arguments}. "
f"These keys were not present in the dictionary returned by the evaluation task. "
f"Evaluation task dictionary keys found: {list(kwargs.keys())}."
)
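For reference, a standalone call to the new helper might look like the sketch below; the metric class here is hypothetical, and the call mirrors how tasks_scorer.py uses the helper later in this diff.

from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, base_metric

class HypotheticalMetric(base_metric.BaseMetric):
    def score(self, output, reference, **ignored_kwargs):
        pass

metric = HypotheticalMetric(name="hypothetical-metric")

# Passes silently: every required score() parameter is present.
arguments_helpers.raise_if_score_arguments_are_missing(
    score_function=metric.score,
    score_name=metric.name,
    kwargs={"output": "hello", "reference": "hello"},
)

try:
    arguments_helpers.raise_if_score_arguments_are_missing(
        score_function=metric.score,
        score_name=metric.name,
        kwargs={"output": "hello"},  # "reference" is missing
    )
except exceptions.ScoreMethodMissingArguments as e:
    print(e)  # names the missing argument and the keys that were provided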
13 changes: 6 additions & 7 deletions sdks/python/src/opik/evaluation/task_output.py
@@ -1,15 +1,14 @@
from typing import Any

import pydantic


class TaskOutput(pydantic.BaseModel):
# Keys that are already used by our metrics.
input: Any = None
output: Any = None
expected: Any = None
context: Any = None
metadata: Any = None
#
# input
# output
# expected
# context
# metadata

# Model config allows providing custom fields.
# It might be especially relevant for custom metrics.
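The rest of the model is collapsed in this view. Judging by the comment above, it is the model config that lets arbitrary task keys through; in pydantic v2 that would typically be something like model_config = pydantic.ConfigDict(extra="allow"), though the exact setting is not shown in this diff. A rough sketch of the intended flow, under that assumption:

import pydantic

class TaskOutputSketch(pydantic.BaseModel):
    # Assumed config; the real TaskOutput's config is collapsed in the diff above.
    model_config = pydantic.ConfigDict(extra="allow")

out = TaskOutputSketch(output="hello", reference="hello", custom_key=42)

# model_dump() includes the custom keys, and this dict is exactly what
# tasks_scorer passes to metric.score(**kwargs) and to the new
# raise_if_score_arguments_are_missing check.
print(out.model_dump(exclude_none=False))  # {'output': 'hello', 'reference': 'hello', 'custom_key': 42}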
23 changes: 19 additions & 4 deletions sdks/python/src/opik/evaluation/tasks_scorer.py
@@ -1,14 +1,17 @@
import tqdm
import logging
from concurrent import futures

from typing import List
from .types import LLMTask
from opik.api_objects.dataset import dataset, dataset_item
from opik.api_objects import opik_client, trace
from opik import context_storage, opik_context
from opik import context_storage, opik_context, exceptions

from . import task_output, test_case, test_result
from .metrics import score_result, base_metric
from .metrics import arguments_helpers, score_result, base_metric

LOGGER = logging.getLogger(__name__)


def _score_test_case(
@@ -17,15 +20,27 @@ def _score_test_case(
score_results = []
for metric in scoring_metrics:
try:
result = metric.score(
**test_case_.task_output.model_dump(exclude_none=False)
score_kwargs = test_case_.task_output.model_dump(exclude_none=False)
arguments_helpers.raise_if_score_arguments_are_missing(
score_function=metric.score,
score_name=metric.name,
kwargs=score_kwargs,
)
result = metric.score(**score_kwargs)
if isinstance(result, list):
score_results += result
else:
score_results.append(result)
except exceptions.ScoreMethodMissingArguments:
raise
except Exception as e:
# This can be problematic if the metric returns a list of score results, as we will not know the names of the metrics that have failed
LOGGER.error(
"Failed to compute metric %s. Score result will be marked as failed.",
metric.name,
exc_info=True,
)

score_results.append(
score_result.ScoreResult(
name=metric.name, value=0.0, reason=str(e), scoring_failed=True
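The hunk is truncated here, but the control flow above is the core of the change: a missing-argument problem aborts the evaluation immediately, while any other exception raised by a metric is logged and converted into a failed score. A simplified sketch of the two paths (not the exact implementation):

from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, score_result

def score_with_fallback(metric, score_kwargs):
    # Simplified version of the logic in _score_test_case above.
    try:
        arguments_helpers.raise_if_score_arguments_are_missing(
            score_function=metric.score,
            score_name=metric.name,
            kwargs=score_kwargs,
        )
        return metric.score(**score_kwargs)
    except exceptions.ScoreMethodMissingArguments:
        raise  # a configuration problem: fail the whole evaluation loudly
    except Exception as e:
        # A runtime problem inside the metric: record a failed score instead.
        return score_result.ScoreResult(
            name=metric.name, value=0.0, reason=str(e), scoring_failed=True
        )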
4 changes: 4 additions & 0 deletions sdks/python/src/opik/exceptions.py
@@ -12,3 +12,7 @@ class ContextExtractorNotSet(OpikException):

class ConfigurationError(OpikException):
pass


class ScoreMethodMissingArguments(OpikException):
pass
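Since the new exception subclasses OpikException, existing handlers written against the base class still catch it, while callers can also target it specifically. A tiny self-contained sketch:

from opik import exceptions

assert issubclass(exceptions.ScoreMethodMissingArguments, exceptions.OpikException)

try:
    raise exceptions.ScoreMethodMissingArguments("illustrative message: the scoring object is missing arguments")
except exceptions.ScoreMethodMissingArguments as e:
    # Callers can catch the specific error (as tasks_scorer does to re-raise it)
    # or fall back to the broader OpikException.
    print(e)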
@@ -0,0 +1,35 @@
from opik import exceptions
from opik.evaluation.metrics import arguments_helpers, base_metric

import pytest


@pytest.mark.parametrize(
argnames="score_kwargs, should_raise",
argvalues=[
({"a": 1, "b": 2}, False),
({"a": 1, "b": 2, "c": 3}, False),
({"a": 1, "c": 3}, True),
({}, True),
],
)
def test_raise_if_score_arguments_are_missing(score_kwargs, should_raise):
class SomeMetric(base_metric.BaseMetric):
def score(self, a, b, **ignored_kwargs):
pass

some_metric = SomeMetric(name="some-metric")

if should_raise:
with pytest.raises(exceptions.ScoreMethodMissingArguments):
arguments_helpers.raise_if_score_arguments_are_missing(
score_function=some_metric.score,
score_name=some_metric.name,
kwargs=score_kwargs,
)
else:
arguments_helpers.raise_if_score_arguments_are_missing(
score_function=some_metric.score,
score_name=some_metric.name,
kwargs=score_kwargs,
)
38 changes: 37 additions & 1 deletion sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -1,7 +1,9 @@
import mock
import pytest

from opik.api_objects.dataset import dataset_item
from opik.api_objects import opik_client
from opik import evaluation
from opik import evaluation, exceptions
from opik.evaluation import metrics
from ...testlib import backend_emulator_message_processor, ANY_BUT_NONE, assert_equal
from ...testlib.models import (
@@ -123,3 +125,37 @@ def say_task(dataset_item: dataset_item.DatasetItem):
EXPECTED_TRACE_TREES, fake_message_processor_.trace_trees
):
assert_equal(expected_trace, actual_trace)


def test_evaluate___output_key_is_missing_in_task_output_dict__equals_metric_misses_output_argument__exception_raised():
# The dataset is the only thing mocked in this test because evaluate
# should raise an exception right after the first attempt to compute
# the Equals metric score.
mock_dataset = mock.Mock()
mock_dataset.name = "the-dataset-name"
mock_dataset.get_all_items.return_value = [
dataset_item.DatasetItem(
id="dataset-item-id-1",
input={"input": "say hello"},
expected_output={"output": "hello"},
),
]

def say_task(dataset_item: dataset_item.DatasetItem):
if dataset_item.input["input"] == "say hello":
return {
"the-key-that-is-not-named-output": "hello",
"reference": dataset_item.expected_output["output"],
}
raise Exception

with pytest.raises(exceptions.ScoreMethodMissingArguments):
evaluation.evaluate(
dataset=mock_dataset,
task=say_task,
experiment_name="the-experiment-name",
scoring_metrics=[metrics.Equals()],
task_threads=1,
)

mock_dataset.get_all_items.assert_called_once()
