
Commit

Merge branch 'confident-ai:main' into addlength
j-space-b authored Sep 24, 2023
2 parents 148278c + 8b08ca1 commit dd0236b
Showing 10 changed files with 513 additions and 45 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           python -c "import sys; print(sys.version)"
           python -m pip install --upgrade pip
-          python -m pip install -r requirements.txt
+          python -m pip install -r requirements.txt ragas
           python -m pip install . pytest-rerunfailures pytest-asyncio
       - name: Run Unit Tests (pytest)
15 changes: 15 additions & 0 deletions README.md
@@ -207,3 +207,18 @@ Built by the Confident AI Team. For any questions/business enquiries - please co
 howpublished = {\url{https://github.com/confident-ai/deepeval}},
 }
 ```
+
+# Contributors
+
+<!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
+<!-- prettier-ignore-start -->
+<!-- markdownlint-disable -->
+
+<!-- markdownlint-restore -->
+<!-- prettier-ignore-end -->
+
+<!-- ALL-CONTRIBUTORS-LIST:END -->
+
+<a href="https://github.com/confident-ai/deepeval/graphs/contributors">
+<img src="https://contrib.rocks/image?repo=confident-ai/deepeval" />
+</a>
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.16.3"
+__version__: str = "0.17.1"
71 changes: 42 additions & 29 deletions deepeval/api.py
@@ -4,6 +4,7 @@
 import requests
 import json
 import warnings
+from collections import defaultdict

 from typing import Any, Optional
 from pydantic import BaseModel, Field
@@ -70,29 +71,45 @@ class TestRun(BaseModel):
     def add_llm_test_case(
         self, test_case: LLMTestCase, metrics: List[Metric], run_duration: float
     ):
-        self.metric_scores.extend([MetricScore.from_metric(m) for m in metrics])
+        metric_dict = defaultdict(list)
+        for metric in metrics:
+            metric_dict[metric.__name__].extend(
+                [metric.score]
+                + [
+                    ms.score
+                    for ms in self.metric_scores
+                    if ms.metric == metric.__name__
+                ]
+            )
+        self.metric_scores = [
+            MetricScore(metric=metric_name, score=sum(scores) / len(scores))
+            for metric_name, scores in metric_dict.items()
+        ]
         # Check if test case with the same ID already exists
         existing_test_case: APITestCase = next(
             (tc for tc in self.test_cases if tc.name == test_case.__name__),
             None,
         )
+        metric_dict = defaultdict(list)
+        for metric in metrics:
+            metric_dict[metric.__name__].append(metric.score)
+        metrics_metadata = [
+            MetricsMetadata(
+                metric=metric_name,
+                score=sum(scores) / len(scores),
+                minimumScore=min(scores),
+            )
+            for metric_name, scores in metric_dict.items()
+        ]
+        success = all([metric.is_successful() for metric in metrics])
+        threshold = metrics[0].minimum_score

         if existing_test_case:
             # If it exists, append the metrics to the existing test case
-            existing_test_case.metricsMetadata.extend(
-                [
-                    MetricsMetadata(
-                        metric=metric.__name__,
-                        score=metric.score,
-                        minimumScore=metric.minimum_score,
-                    )
-                    for metric in metrics
-                ]
-            )
+            existing_test_case.metricsMetadata.extend(metrics_metadata)
             # Update the success status and threshold
-            existing_test_case.success = all(
-                [metric.is_successful() for metric in metrics]
-            )
-            existing_test_case.threshold = metrics[0].minimum_score
+            existing_test_case.success = success
+            existing_test_case.threshold = threshold
         else:
             # If it doesn't exist, create a new test case
             name = "Test " + str(len(self.test_cases) + 1)
@@ -102,16 +119,9 @@ def add_llm_test_case(
                     input=test_case.query,
                     actualOutput=test_case.output,
                     expectedOutput=test_case.expected_output,
-                    success=all([metric.is_successful() for metric in metrics]),
-                    metricsMetadata=[
-                        MetricsMetadata(
-                            metric=metric.__name__,
-                            score=metric.score,
-                            minimumScore=metric.minimum_score,
-                        )
-                        for metric in metrics
-                    ],
-                    threshold=metrics[0].minimum_score,
+                    success=success,
+                    metricsMetadata=metrics_metadata,
+                    threshold=threshold,
                     runDuration=run_duration,
                 )
             )
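The rewritten `add_llm_test_case` above now groups scores by metric name and keeps one averaged entry per metric instead of appending raw scores. A minimal, standalone sketch of that averaging pattern (the `MetricScore` dataclass and the metric names and values below are illustrative stand-ins, not deepeval's real models):

```python
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class MetricScore:  # stand-in for the real pydantic model of the same name
    metric: str
    score: float


def average_by_metric(results: List[Tuple[str, float]]) -> List[MetricScore]:
    """Group (metric_name, score) pairs and keep one averaged entry per metric."""
    metric_dict = defaultdict(list)
    for name, score in results:
        metric_dict[name].append(score)
    return [
        MetricScore(metric=name, score=sum(scores) / len(scores))
        for name, scores in metric_dict.items()
    ]


# Two runs of the same metric collapse into a single averaged entry (0.8 and 0.6 -> ~0.7).
print(average_by_metric([("factual_consistency", 0.8), ("factual_consistency", 0.6), ("bias", 0.1)]))
```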
@@ -124,8 +134,6 @@ def save(self, file_path: Optional[str] = None):
             return
         elif not file_path.endswith(".json"):
             file_path = f"{file_path}.json"
-        print({"save_filepath", file_path})
-
         with open(file_path, "w") as f:
             json.dump(self.dict(by_alias=True, exclude_none=True), f)

@@ -140,7 +148,6 @@ def load(cls, file_path: Optional[str] = None):
             return
         elif not file_path.endswith(".json"):
             file_path = f"{file_path}.json"
-        print({"load_filepath", file_path})
         with open(file_path, "r") as f:
             return cls(**json.load(f))

@@ -461,7 +468,13 @@ def list_implementations(self):

     def post_test_run(self, test_run: TestRun):
         """Post a test run"""
+        try:
+            body = test_run.model_dump(by_alias=True)
+        except AttributeError:
+            # Pydantic version below 2.0
+            body = test_run.dict(by_alias=True)
+
         return self.post_request(
             endpoint="/v1/test-run",
-            body=test_run.model_dump(by_alias=True),
+            body=body,
         )
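The `try`/`except AttributeError` above is a Pydantic compatibility shim: `model_dump` only exists from Pydantic 2.0 on, while `.dict()` is the v1 API. A small sketch of the same idea against a toy model (the `ToyTestRun` class and its field are invented for illustration):

```python
from pydantic import BaseModel


class ToyTestRun(BaseModel):  # illustrative stand-in, not deepeval's TestRun
    test_file: str = "test_sample.py"


def serialize(model: BaseModel) -> dict:
    """Serialize a model on either major Pydantic version."""
    try:
        return model.model_dump(by_alias=True)  # Pydantic >= 2.0
    except AttributeError:
        return model.dict(by_alias=True)  # Pydantic < 2.0


print(serialize(ToyTestRun()))  # {'test_file': 'test_sample.py'}
```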
28 changes: 18 additions & 10 deletions deepeval/cli/test.py
@@ -78,18 +78,25 @@ def sample():
     pass


-def check_if_legit_file(test_file: str):
-    if test_file.endswith(".py"):
-        if not test_file.startswith("test_"):
-            raise ValueError(
-                "Test will not run. Please ensure the `test_` prefix."
-            )
+def check_if_legit_file(test_file_or_directory: str):
+    if os.path.isfile(test_file_or_directory):
+        if test_file_or_directory.endswith(".py"):
+            if not os.path.basename(test_file_or_directory).startswith("test_"):
+                raise ValueError(
+                    "Test will not run. Please ensure the file starts with `test_` prefix."
+                )
+    elif os.path.isdir(test_file_or_directory):
+        return
+    else:
+        raise ValueError(
+            "Provided path is neither a valid file nor a directory."
+        )


 @app.command()
 def run(
     test_file_or_directory: str,
-    verbose: bool = False,
+    verbose: bool = True,
     color: str = "yes",
     durations: int = 10,
     pdb: bool = False,
@@ -98,7 +105,8 @@ ] = False,
     ] = False,
 ):
     """Run a test"""
-    pytest_args = ["-k", test_file_or_directory]
+    check_if_legit_file(test_file_or_directory)
+    pytest_args = [test_file_or_directory]
     if exit_on_first_failure:
         pytest_args.insert(0, "-x")

@@ -111,9 +119,10 @@ def run(
             "--verbose" if verbose else "--quiet",
             f"--color={color}",
             f"--durations={durations}",
-            "--pdb" if pdb else "",
         ]
     )
+    if pdb:
+        pytest_args.append("--pdb")
     # Add the deepeval plugin file to pytest arguments
     pytest_args.extend(["-p", "plugins"])

@@ -122,7 +131,6 @@ def run(
         TextColumn("[progress.description]{task.description}"),
         transient=True,
     ) as progress:
-        # progress.add_task(description="Preparing tests...", total=None)
         progress.add_task(
             description="Downloading models (may take up to 2 minutes if running for the first time)...",
             total=None,
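Two behavioural changes in `run` are easy to miss: the target is now passed to pytest as a path rather than a `-k` keyword filter, and `--pdb` is appended only when requested instead of being passed as an empty string. A trimmed re-creation of the argument assembly (this helper and the example path are hypothetical, written only to mirror the logic in the diff):

```python
from typing import List


def build_pytest_args(
    test_file_or_directory: str,
    verbose: bool = True,
    color: str = "yes",
    durations: int = 10,
    pdb: bool = False,
    exit_on_first_failure: bool = False,
) -> List[str]:
    """Assemble pytest CLI arguments the way the updated `run` command does."""
    args = [test_file_or_directory]  # a real path, no longer a -k expression
    if exit_on_first_failure:
        args.insert(0, "-x")
    args.extend(
        [
            "--verbose" if verbose else "--quiet",
            f"--color={color}",
            f"--durations={durations}",
        ]
    )
    if pdb:
        # Append --pdb only when asked; the old code passed an empty "" argument otherwise.
        args.append("--pdb")
    return args


print(build_pytest_args("tests/test_example.py", pdb=True))
```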
48 changes: 48 additions & 0 deletions deepeval/dataset.py
@@ -76,6 +76,54 @@ def from_csv(
     def from_test_cases(self, test_cases: list):
         self.data = test_cases

+    @classmethod
+    def from_hf_dataset(
+        cls,
+        dataset_name: str,
+        split: str,
+        query_column: str,
+        expected_output_column: str,
+        context_column: str = None,
+        output_column: str = None,
+        id_column: str = None,
+    ):
+        """
+        Load test cases from a HuggingFace dataset.
+
+        Args:
+            dataset_name (str): The name of the HuggingFace dataset to load.
+            split (str): The split of the dataset to load (e.g., 'train', 'test').
+            query_column (str): The column in the dataset corresponding to the query.
+            expected_output_column (str): The column in the dataset corresponding to the expected output.
+            context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None.
+            output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None.
+            id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None.
+
+        Returns:
+            EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
+        """
+        try:
+            from datasets import load_dataset
+        except ImportError:
+            raise ImportError(
+                "The 'datasets' library is missing. Please install it using pip: pip install datasets"
+            )
+
+        hf_dataset = load_dataset(dataset_name, split=split)
+        test_cases = []
+
+        for i, row in enumerate(hf_dataset):
+            test_cases.append(
+                LLMTestCase(
+                    query=row[query_column],
+                    expected_output=row[expected_output_column],
+                    context=row[context_column] if context_column else None,
+                    output=row[output_column] if output_column else None,
+                    id=row[id_column] if id_column else None,
+                )
+            )
+        return cls(test_cases)
+
     @classmethod
     def from_json(
         cls,
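A hedged usage sketch of the new loader: the dataset id and column names below are illustrative choices (they follow the SQuAD schema) and are not taken from this commit; the call also requires the optional `datasets` package.

```python
# pip install datasets
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset.from_hf_dataset(
    dataset_name="squad",            # any HuggingFace dataset id
    split="validation",
    query_column="question",
    expected_output_column="answers",
    context_column="context",
)
```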
4 changes: 3 additions & 1 deletion deepeval/metrics/bias_classifier.py
@@ -25,7 +25,7 @@ def __call__(self, output, expected_output, query: Optional[str] = "-"):
         success = score >= self.minimum_score
         return score

-    def measure(self, test_case: LLMTestCase):
+    def measure(self, test_case: LLMTestCase, return_all_scores: bool = False):
         if test_case.output is None:
             raise ValueError("Required attributes for test_case cannot be None")

@@ -49,6 +49,8 @@ def measure(self, test_case: LLMTestCase):
             self.success = True

         self.score = v
+        if return_all_scores:
+            return results
         return v

     def is_successful(self):
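The new `return_all_scores` flag makes `measure` hand back the classifier's full per-label output instead of only the headline score. A self-contained toy illustration of that pattern (this class and the fake results are placeholders, not the real bias metric or real model output):

```python
from typing import List, Union


class ToyMetric:  # placeholder; the real metric wraps a bias classifier
    def __init__(self, minimum_score: float = 0.5):
        self.minimum_score = minimum_score

    def measure(
        self, results: List[dict], return_all_scores: bool = False
    ) -> Union[float, List[dict]]:
        v = results[0]["score"]  # headline score, like `v` in the diff
        self.success = v >= self.minimum_score
        self.score = v
        if return_all_scores:
            return results  # full per-label breakdown
        return v


fake_results = [{"label": "LABEL_0", "score": 0.8}, {"label": "LABEL_1", "score": 0.2}]
metric = ToyMetric()
print(metric.measure(fake_results))                          # 0.8
print(metric.measure(fake_results, return_all_scores=True))  # the whole list
```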