Added nb_samples parameter in evaluate function #355

Merged (2 commits) on Oct 7, 2024
sdks/python/examples/evaluation_example.py (12 changes: 7 additions & 5 deletions)
@@ -1,7 +1,6 @@
 from typing import Dict, Any
 
 from opik.evaluation.metrics import (
-    Contains,
     IsJson,
     Hallucination,
 )
@@ -16,13 +15,15 @@
 
 openai_client = track_openai(openai.OpenAI())
 
-contains_hello = Contains(searched_value="hello", name="ContainsHello")
-contains_bye = Contains(searched_value="bye", name="ContainsBye")
+# contains_hello = Contains(searched_value="hello", name="ContainsHello")
+# contains_bye = Contains(searched_value="bye", name="ContainsBye")
 is_json = IsJson()
 hallucination = Hallucination()
 
 client = Opik()
-dataset = client.create_dataset(name="My 42 dataset", description="For storing stuff")
+dataset = client.get_or_create_dataset(
+    name="My 42 dataset", description="For storing stuff"
+)
 # dataset = client.get_dataset(name="My 42 dataset")
 
 json = """
@@ -69,5 +70,6 @@ def llm_task(item: DatasetItem) -> Dict[str, Any]:
     experiment_name="My experiment",
     dataset=dataset,
     task=llm_task,
-    scoring_metrics=[contains_hello, contains_bye, is_json, hallucination],
+    nb_samples=2,
+    scoring_metrics=[is_json, hallucination],
 )
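Aside on the example above: nb_samples=2 means the evaluation streams and scores at most the first two items of "My 42 dataset"; the Contains metrics are commented out rather than deleted. A minimal, hypothetical sketch of the same limit expressed directly against the dataset object, using the get_items method this PR adds to Dataset:

# Illustration only, not part of the example file.
first_two = dataset.get_items(nb_samples=2)  # at most two DatasetItem objects
print(len(first_two))  # 2, or fewer if the dataset is smaller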
sdks/python/src/opik/api_objects/dataset/dataset.py (36 changes: 28 additions & 8 deletions)
@@ -179,12 +179,17 @@ def to_json(self) -> str:
 
         return converters.to_json(dataset_items, keys_mapping={})
 
-    def get_all_items(self) -> List[dataset_item.DatasetItem]:
+    def get_items(
+        self, nb_samples: Optional[int] = None
+    ) -> List[dataset_item.DatasetItem]:
         """
-        Retrieve all items from the dataset.
+        Retrieve a fixed number of dataset items.
 
+        Args:
+            nb_samples: The number of samples to retrieve.
+
         Returns:
-            A list of DatasetItem objects representing all items in the dataset.
+            A list of DatasetItem objects representing the samples.
         """
         results: List[dataset_item.DatasetItem] = []
 
@@ -194,8 +199,11 @@ def get_all_items(self) -> List[dataset_item.DatasetItem]:
                 last_retrieved_id=results[-1].id if len(results) > 0 else None,
             )
 
+            previous_results_size = len(results)
+            if nb_samples is not None and len(results) == nb_samples:
+                break
+
             item_bytes = b"".join(stream)
-            stream_results: List[dataset_item.DatasetItem] = []
             for line in item_bytes.split(b"\n"):
                 if len(line) == 0:
                     continue
@@ -212,15 +220,27 @@
                     source=item_content.get("source"),  # type: ignore
                 )
 
-                stream_results.append(item)
+                results.append(item)
 
-            if len(stream_results) == 0:
-                break
+            # Break the loop if we have enough samples
+            if nb_samples is not None and len(results) == nb_samples:
+                break
 
-            results.extend(stream_results)
+            # Break the loop if we have not received any new samples
+            if len(results) == previous_results_size:
+                break
 
         return results
 
+    def get_all_items(self) -> List[dataset_item.DatasetItem]:
+        """
+        Retrieve all items from the dataset.
+
+        Returns:
+            A list of DatasetItem objects representing all items in the dataset.
+        """
+        return self.get_items()
+
     def insert_from_json(
         self,
         json_array: str,
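A hedged usage sketch of the reworked item-retrieval API; it assumes a workspace where "My 42 dataset" exists (or can be created) and that Opik is importable from the top-level opik package, as in the example script:

from opik import Opik  # assumed import path, mirroring the example script

client = Opik()
dataset = client.get_or_create_dataset(
    name="My 42 dataset", description="For storing stuff"
)

sample = dataset.get_items(nb_samples=5)  # stops streaming once 5 items are collected
everything = dataset.get_all_items()      # unchanged API: delegates to get_items()

Because the streaming loop now breaks as soon as len(results) reaches nb_samples (and also when a page returns no new items), fetching a small sample no longer requires paging through the entire dataset.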
sdks/python/src/opik/evaluation/evaluator.py (4 changes: 4 additions & 0 deletions)
@@ -17,6 +17,7 @@ def evaluate(
     experiment_name: Optional[str] = None,
     experiment_config: Optional[Dict[str, Any]] = None,
     verbose: int = 1,
+    nb_samples: Optional[int] = None,
     task_threads: int = 16,
 ) -> evaluation_result.EvaluationResult:
     """
@@ -42,6 +43,8 @@
         verbose: an integer value that controls evaluation output logs such as summary and tqdm progress bar.
             0 - no outputs, 1 - outputs are enabled (default).
 
+        nb_samples: number of samples to evaluate. If no value is provided, all samples in the dataset will be evaluated.
+
         task_threads: amount of thread workers to run tasks. If set to 1, no additional
             threads are created, all tasks are executed sequentially in the
             current thread.
@@ -55,6 +58,7 @@
         dataset_=dataset,
         task=task,
         scoring_metrics=scoring_metrics,
+        nb_samples=nb_samples,
         workers=task_threads,
         verbose=verbose,
     )
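For reference, a hedged sketch of calling the updated evaluate(); dataset, llm_task, is_json, and hallucination are assumed to be defined as in the example script at the top of this PR, and the import path is an assumption:

from opik.evaluation import evaluate  # assumed import path

result = evaluate(
    experiment_name="My experiment",
    dataset=dataset,                           # an opik Dataset object
    task=llm_task,                             # the LLM task callable
    scoring_metrics=[is_json, hallucination],
    nb_samples=10,                             # score only the first 10 dataset items
    task_threads=4,                            # optional; the default is 16
)

Leaving nb_samples unset keeps the previous behaviour and evaluates every item in the dataset.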
sdks/python/src/opik/evaluation/tasks_scorer.py (5 changes: 3 additions & 2 deletions)
@@ -2,7 +2,7 @@
 import logging
 from concurrent import futures
 
-from typing import List
+from typing import List, Optional
 from .types import LLMTask
 from opik.api_objects.dataset import dataset, dataset_item
 from opik.api_objects import opik_client, trace
@@ -96,9 +96,10 @@ def run(
     task: LLMTask,
     scoring_metrics: List[base_metric.BaseMetric],
     workers: int,
+    nb_samples: Optional[int],
     verbose: int,
 ) -> List[test_result.TestResult]:
-    dataset_items = dataset_.get_all_items()
+    dataset_items = dataset_.get_items(nb_samples=nb_samples)
     test_cases: List[test_result.TestResult]
 
     if workers == 1:
sdks/python/tests/unit/evaluation/test_evaluate.py (6 changes: 3 additions & 3 deletions)
@@ -21,7 +21,7 @@ def test_evaluate_happyflow(fake_streamer):
 
     mock_dataset = mock.Mock()
     mock_dataset.name = "the-dataset-name"
-    mock_dataset.get_all_items.return_value = [
+    mock_dataset.get_items.return_value = [
         dataset_item.DatasetItem(
             id="dataset-item-id-1",
             input={"input": "say hello"},
@@ -133,7 +133,7 @@ def test_evaluate___output_key_is_missing_in_task_output_dict__equals_metric_mis
     # to compute Equals metric score.
     mock_dataset = mock.Mock()
     mock_dataset.name = "the-dataset-name"
-    mock_dataset.get_all_items.return_value = [
+    mock_dataset.get_items.return_value = [
         dataset_item.DatasetItem(
             id="dataset-item-id-1",
             input={"input": "say hello"},
@@ -158,4 +158,4 @@ def say_task(dataset_item: dataset_item.DatasetItem):
         task_threads=1,
     )
 
-    mock_dataset.get_all_items.assert_called_once()
+    mock_dataset.get_items.assert_called_once()
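The updated tests only assert that get_items was called. Since evaluate() defaults nb_samples to None and tasks_scorer.run forwards it verbatim, a slightly stricter assertion is possible; a hedged sketch, reusing the mock_dataset from the tests above:

# Hypothetical tightening (not part of this PR): the default nb_samples=None
# should be forwarded unchanged from evaluate() down to Dataset.get_items().
mock_dataset.get_items.assert_called_once_with(nb_samples=None)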