
Commit

Merge branch 'confident-ai:main' into addlength
j-space-b authored Sep 24, 2023
2 parents 148278c + 8b08ca1 commit dd0236b
Showing 10 changed files with 513 additions and 45 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           python -c "import sys; print(sys.version)"
           python -m pip install --upgrade pip
-          python -m pip install -r requirements.txt
+          python -m pip install -r requirements.txt ragas
           python -m pip install . pytest-rerunfailures pytest-asyncio
       - name: Run Unit Tests (pytest)
15 changes: 15 additions & 0 deletions README.md
@@ -207,3 +207,18 @@ Built by the Confident AI Team. For any questions/business enquiries - please co
 howpublished = {\url{https://github.com/confident-ai/deepeval}},
 }
 ```
+
+# Contributors
+
+<!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
+<!-- prettier-ignore-start -->
+<!-- markdownlint-disable -->
+
+<!-- markdownlint-restore -->
+<!-- prettier-ignore-end -->
+
+<!-- ALL-CONTRIBUTORS-LIST:END -->
+
+<a href="https://github.com/confident-ai/deepeval/graphs/contributors">
+<img src="https://contrib.rocks/image?repo=confident-ai/deepeval" />
+</a>
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.16.3"
+__version__: str = "0.17.1"
71 changes: 42 additions & 29 deletions deepeval/api.py
@@ -4,6 +4,7 @@
 import requests
 import json
 import warnings
+from collections import defaultdict

 from typing import Any, Optional
 from pydantic import BaseModel, Field
@@ -70,29 +71,45 @@ class TestRun(BaseModel):
     def add_llm_test_case(
         self, test_case: LLMTestCase, metrics: List[Metric], run_duration: float
     ):
-        self.metric_scores.extend([MetricScore.from_metric(m) for m in metrics])
+        metric_dict = defaultdict(list)
+        for metric in metrics:
+            metric_dict[metric.__name__].extend(
+                [metric.score]
+                + [
+                    ms.score
+                    for ms in self.metric_scores
+                    if ms.metric == metric.__name__
+                ]
+            )
+        self.metric_scores = [
+            MetricScore(metric=metric_name, score=sum(scores) / len(scores))
+            for metric_name, scores in metric_dict.items()
+        ]
         # Check if test case with the same ID already exists
         existing_test_case: APITestCase = next(
             (tc for tc in self.test_cases if tc.name == test_case.__name__),
             None,
         )
+        metric_dict = defaultdict(list)
+        for metric in metrics:
+            metric_dict[metric.__name__].append(metric.score)
+        metrics_metadata = [
+            MetricsMetadata(
+                metric=metric_name,
+                score=sum(scores) / len(scores),
+                minimumScore=min(scores),
+            )
+            for metric_name, scores in metric_dict.items()
+        ]
+        success = all([metric.is_successful() for metric in metrics])
+        threshold = metrics[0].minimum_score

         if existing_test_case:
             # If it exists, append the metrics to the existing test case
-            existing_test_case.metricsMetadata.extend(
-                [
-                    MetricsMetadata(
-                        metric=metric.__name__,
-                        score=metric.score,
-                        minimumScore=metric.minimum_score,
-                    )
-                    for metric in metrics
-                ]
-            )
+            existing_test_case.metricsMetadata.extend(metrics_metadata)
             # Update the success status and threshold
-            existing_test_case.success = all(
-                [metric.is_successful() for metric in metrics]
-            )
-            existing_test_case.threshold = metrics[0].minimum_score
+            existing_test_case.success = success
+            existing_test_case.threshold = threshold
         else:
             # If it doesn't exist, create a new test case
             name = "Test " + str(len(self.test_cases) + 1)
@@ -102,16 +119,9 @@ def add_llm_test_case(
                     input=test_case.query,
                     actualOutput=test_case.output,
                     expectedOutput=test_case.expected_output,
-                    success=all([metric.is_successful() for metric in metrics]),
-                    metricsMetadata=[
-                        MetricsMetadata(
-                            metric=metric.__name__,
-                            score=metric.score,
-                            minimumScore=metric.minimum_score,
-                        )
-                        for metric in metrics
-                    ],
-                    threshold=metrics[0].minimum_score,
+                    success=success,
+                    metricsMetadata=metrics_metadata,
+                    threshold=threshold,
                     runDuration=run_duration,
                 )
             )
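The rewritten `add_llm_test_case` above now groups scores by metric name and keeps one averaged entry per metric instead of appending raw scores. A minimal, standalone sketch of that averaging pattern (the `MetricScore` dataclass and the metric names and values below are illustrative stand-ins, not deepeval's real models):

```python
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class MetricScore:  # stand-in for the real pydantic model of the same name
    metric: str
    score: float


def average_by_metric(results: List[Tuple[str, float]]) -> List[MetricScore]:
    """Group (metric_name, score) pairs and keep one averaged entry per metric."""
    metric_dict = defaultdict(list)
    for name, score in results:
        metric_dict[name].append(score)
    return [
        MetricScore(metric=name, score=sum(scores) / len(scores))
        for name, scores in metric_dict.items()
    ]


# Two runs of the same metric collapse into a single averaged entry (0.8 and 0.6 -> ~0.7).
print(average_by_metric([("factual_consistency", 0.8), ("factual_consistency", 0.6), ("bias", 0.1)]))
```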
@@ -124,8 +134,6 @@ def save(self, file_path: Optional[str] = None):
             return
         elif not file_path.endswith(".json"):
             file_path = f"{file_path}.json"
-        print({"save_filepath", file_path})
-
         with open(file_path, "w") as f:
             json.dump(self.dict(by_alias=True, exclude_none=True), f)

@@ -140,7 +148,6 @@ def load(cls, file_path: Optional[str] = None):
             return
         elif not file_path.endswith(".json"):
             file_path = f"{file_path}.json"
-        print({"load_filepath", file_path})
         with open(file_path, "r") as f:
             return cls(**json.load(f))

@@ -461,7 +468,13 @@ def list_implementations(self):

     def post_test_run(self, test_run: TestRun):
         """Post a test run"""
+        try:
+            body = test_run.model_dump(by_alias=True)
+        except AttributeError:
+            # Pydantic version below 2.0
+            body = test_run.dict(by_alias=True)
+
         return self.post_request(
             endpoint="/v1/test-run",
-            body=test_run.model_dump(by_alias=True),
+            body=body,
         )
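The `try`/`except AttributeError` above is a Pydantic compatibility shim: `model_dump` only exists from Pydantic 2.0 on, while `.dict()` is the v1 API. A small sketch of the same idea against a toy model (the `ToyTestRun` class and its field are invented for illustration):

```python
from pydantic import BaseModel


class ToyTestRun(BaseModel):  # illustrative stand-in, not deepeval's TestRun
    test_file: str = "test_sample.py"


def serialize(model: BaseModel) -> dict:
    """Serialize a model on either major Pydantic version."""
    try:
        return model.model_dump(by_alias=True)  # Pydantic >= 2.0
    except AttributeError:
        return model.dict(by_alias=True)  # Pydantic < 2.0


print(serialize(ToyTestRun()))  # {'test_file': 'test_sample.py'}
```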
28 changes: 18 additions & 10 deletions deepeval/cli/test.py
@@ -78,18 +78,25 @@ def sample():
     pass


-def check_if_legit_file(test_file: str):
-    if test_file.endswith(".py"):
-        if not test_file.startswith("test_"):
-            raise ValueError(
-                "Test will not run. Please ensure the `test_` prefix."
-            )
+def check_if_legit_file(test_file_or_directory: str):
+    if os.path.isfile(test_file_or_directory):
+        if test_file_or_directory.endswith(".py"):
+            if not os.path.basename(test_file_or_directory).startswith("test_"):
+                raise ValueError(
+                    "Test will not run. Please ensure the file starts with `test_` prefix."
+                )
+    elif os.path.isdir(test_file_or_directory):
+        return
+    else:
+        raise ValueError(
+            "Provided path is neither a valid file nor a directory."
+        )


 @app.command()
 def run(
     test_file_or_directory: str,
-    verbose: bool = False,
+    verbose: bool = True,
     color: str = "yes",
     durations: int = 10,
     pdb: bool = False,
@@ -98,7 +105,8 @@ ] = False,
     ] = False,
 ):
     """Run a test"""
-    pytest_args = ["-k", test_file_or_directory]
+    check_if_legit_file(test_file_or_directory)
+    pytest_args = [test_file_or_directory]
     if exit_on_first_failure:
         pytest_args.insert(0, "-x")

@@ -111,9 +119,10 @@ def run(
             "--verbose" if verbose else "--quiet",
             f"--color={color}",
             f"--durations={durations}",
-            "--pdb" if pdb else "",
         ]
     )
+    if pdb:
+        pytest_args.append("--pdb")
     # Add the deepeval plugin file to pytest arguments
     pytest_args.extend(["-p", "plugins"])

@@ -122,7 +131,6 @@ def run(
         TextColumn("[progress.description]{task.description}"),
         transient=True,
     ) as progress:
-        # progress.add_task(description="Preparing tests...", total=None)
         progress.add_task(
             description="Downloading models (may take up to 2 minutes if running for the first time)...",
             total=None,
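Two behavioural changes in `run` are easy to miss: the target is now passed to pytest as a path rather than a `-k` keyword filter, and `--pdb` is appended only when requested instead of being passed as an empty string. A trimmed re-creation of the argument assembly (this helper and the example path are hypothetical, written only to mirror the logic in the diff):

```python
from typing import List


def build_pytest_args(
    test_file_or_directory: str,
    verbose: bool = True,
    color: str = "yes",
    durations: int = 10,
    pdb: bool = False,
    exit_on_first_failure: bool = False,
) -> List[str]:
    """Assemble pytest CLI arguments the way the updated `run` command does."""
    args = [test_file_or_directory]  # a real path, no longer a -k expression
    if exit_on_first_failure:
        args.insert(0, "-x")
    args.extend(
        [
            "--verbose" if verbose else "--quiet",
            f"--color={color}",
            f"--durations={durations}",
        ]
    )
    if pdb:
        # Append --pdb only when asked; the old code passed an empty "" argument otherwise.
        args.append("--pdb")
    return args


print(build_pytest_args("tests/test_example.py", pdb=True))
```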
48 changes: 48 additions & 0 deletions deepeval/dataset.py
@@ -76,6 +76,54 @@ def from_csv(
     def from_test_cases(self, test_cases: list):
         self.data = test_cases

+    @classmethod
+    def from_hf_dataset(
+        cls,
+        dataset_name: str,
+        split: str,
+        query_column: str,
+        expected_output_column: str,
+        context_column: str = None,
+        output_column: str = None,
+        id_column: str = None,
+    ):
+        """
+        Load test cases from a HuggingFace dataset.
+
+        Args:
+            dataset_name (str): The name of the HuggingFace dataset to load.
+            split (str): The split of the dataset to load (e.g., 'train', 'test').
+            query_column (str): The column in the dataset corresponding to the query.
+            expected_output_column (str): The column in the dataset corresponding to the expected output.
+            context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None.
+            output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None.
+            id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None.
+
+        Returns:
+            EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
+        """
+        try:
+            from datasets import load_dataset
+        except ImportError:
+            raise ImportError(
+                "The 'datasets' library is missing. Please install it using pip: pip install datasets"
+            )
+
+        hf_dataset = load_dataset(dataset_name, split=split)
+        test_cases = []
+
+        for i, row in enumerate(hf_dataset):
+            test_cases.append(
+                LLMTestCase(
+                    query=row[query_column],
+                    expected_output=row[expected_output_column],
+                    context=row[context_column] if context_column else None,
+                    output=row[output_column] if output_column else None,
+                    id=row[id_column] if id_column else None,
+                )
+            )
+        return cls(test_cases)
+
     @classmethod
     def from_json(
         cls,
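A hedged usage sketch of the new loader: the dataset id and column names below are illustrative choices (they follow the SQuAD schema) and are not taken from this commit; the call also requires the optional `datasets` package.

```python
# pip install datasets
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset.from_hf_dataset(
    dataset_name="squad",            # any HuggingFace dataset id
    split="validation",
    query_column="question",
    expected_output_column="answers",
    context_column="context",
)
```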
4 changes: 3 additions & 1 deletion deepeval/metrics/bias_classifier.py
@@ -25,7 +25,7 @@ def __call__(self, output, expected_output, query: Optional[str] = "-"):
         success = score >= self.minimum_score
         return score

-    def measure(self, test_case: LLMTestCase):
+    def measure(self, test_case: LLMTestCase, return_all_scores: bool = False):
         if test_case.output is None:
             raise ValueError("Required attributes for test_case cannot be None")

@@ -49,6 +49,8 @@ def measure(self, test_case: LLMTestCase):
             self.success = True

         self.score = v
+        if return_all_scores:
+            return results
         return v

     def is_successful(self):
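The new `return_all_scores` flag makes `measure` hand back the classifier's full per-label output instead of only the headline score. A self-contained toy illustration of that pattern (this class and the fake results are placeholders, not the real bias metric or real model output):

```python
from typing import List, Union


class ToyMetric:  # placeholder; the real metric wraps a bias classifier
    def __init__(self, minimum_score: float = 0.5):
        self.minimum_score = minimum_score

    def measure(
        self, results: List[dict], return_all_scores: bool = False
    ) -> Union[float, List[dict]]:
        v = results[0]["score"]  # headline score, like `v` in the diff
        self.success = v >= self.minimum_score
        self.score = v
        if return_all_scores:
            return results  # full per-label breakdown
        return v


fake_results = [{"label": "LABEL_0", "score": 0.8}, {"label": "LABEL_1", "score": 0.2}]
metric = ToyMetric()
print(metric.measure(fake_results))                          # 0.8
print(metric.measure(fake_results, return_all_scores=True))  # the whole list
```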