diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 73f6cdf71..35c280841 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,7 +37,7 @@ jobs: run: | python -c "import sys; print(sys.version)" python -m pip install --upgrade pip - python -m pip install -r requirements.txt + python -m pip install -r requirements.txt ragas python -m pip install . pytest-rerunfailures pytest-asyncio - name: Run Unit Tests (pytest) diff --git a/README.md b/README.md index 894d9baa9..d7b8d1d03 100644 --- a/README.md +++ b/README.md @@ -207,3 +207,18 @@ Built by the Confident AI Team. For any questions/business enquiries - please co howpublished = {\url{https://github.com/confident-ai/deepeval}}, } ``` + +# Contributors + + + + + + + + + + + + + diff --git a/deepeval/_version.py b/deepeval/_version.py index 6d34ba33a..37fc43d7e 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.16.3" +__version__: str = "0.17.1" diff --git a/deepeval/api.py b/deepeval/api.py index 5b22b18b2..2860bdec9 100644 --- a/deepeval/api.py +++ b/deepeval/api.py @@ -4,6 +4,7 @@ import requests import json import warnings +from collections import defaultdict from typing import Any, Optional from pydantic import BaseModel, Field @@ -70,29 +71,45 @@ class TestRun(BaseModel): def add_llm_test_case( self, test_case: LLMTestCase, metrics: List[Metric], run_duration: float ): - self.metric_scores.extend([MetricScore.from_metric(m) for m in metrics]) + metric_dict = defaultdict(list) + for metric in metrics: + metric_dict[metric.__name__].extend( + [metric.score] + + [ + ms.score + for ms in self.metric_scores + if ms.metric == metric.__name__ + ] + ) + self.metric_scores = [ + MetricScore(metric=metric_name, score=sum(scores) / len(scores)) + for metric_name, scores in metric_dict.items() + ] # Check if test case with the same ID already exists existing_test_case: APITestCase = next( (tc for tc in self.test_cases if tc.name == test_case.__name__), None, ) + metric_dict = defaultdict(list) + for metric in metrics: + metric_dict[metric.__name__].append(metric.score) + metrics_metadata = [ + MetricsMetadata( + metric=metric_name, + score=sum(scores) / len(scores), + minimumScore=min(scores), + ) + for metric_name, scores in metric_dict.items() + ] + success = all([metric.is_successful() for metric in metrics]) + threshold = metrics[0].minimum_score + if existing_test_case: # If it exists, append the metrics to the existing test case - existing_test_case.metricsMetadata.extend( - [ - MetricsMetadata( - metric=metric.__name__, - score=metric.score, - minimumScore=metric.minimum_score, - ) - for metric in metrics - ] - ) + existing_test_case.metricsMetadata.extend(metrics_metadata) # Update the success status and threshold - existing_test_case.success = all( - [metric.is_successful() for metric in metrics] - ) - existing_test_case.threshold = metrics[0].minimum_score + existing_test_case.success = success + existing_test_case.threshold = threshold else: # If it doesn't exist, create a new test case name = "Test " + str(len(self.test_cases) + 1) @@ -102,16 +119,9 @@ def add_llm_test_case( input=test_case.query, actualOutput=test_case.output, expectedOutput=test_case.expected_output, - success=all([metric.is_successful() for metric in metrics]), - metricsMetadata=[ - MetricsMetadata( - metric=metric.__name__, - score=metric.score, - minimumScore=metric.minimum_score, - ) - for metric in metrics - ], - threshold=metrics[0].minimum_score, + success=success, + 
metricsMetadata=metrics_metadata, + threshold=threshold, runDuration=run_duration, ) ) @@ -124,8 +134,6 @@ def save(self, file_path: Optional[str] = None): return elif not file_path.endswith(".json"): file_path = f"{file_path}.json" - print({"save_filepath", file_path}) - with open(file_path, "w") as f: json.dump(self.dict(by_alias=True, exclude_none=True), f) @@ -140,7 +148,6 @@ def load(cls, file_path: Optional[str] = None): return elif not file_path.endswith(".json"): file_path = f"{file_path}.json" - print({"load_filepath", file_path}) with open(file_path, "r") as f: return cls(**json.load(f)) @@ -461,7 +468,13 @@ def list_implementations(self): def post_test_run(self, test_run: TestRun): """Post a test run""" + try: + body = test_run.model_dump(by_alias=True) + except AttributeError: + # Pydantic version below 2.0 + body = test_run.dict(by_alias=True) + return self.post_request( endpoint="/v1/test-run", - body=test_run.model_dump(by_alias=True), + body=body, ) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 5435764bc..5dd772cee 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -78,18 +78,25 @@ def sample(): pass -def check_if_legit_file(test_file: str): - if test_file.endswith(".py"): - if not test_file.startswith("test_"): - raise ValueError( - "Test will not run. Please ensure the `test_` prefix." - ) +def check_if_legit_file(test_file_or_directory: str): + if os.path.isfile(test_file_or_directory): + if test_file_or_directory.endswith(".py"): + if not os.path.basename(test_file_or_directory).startswith("test_"): + raise ValueError( + "Test will not run. Please ensure the file starts with `test_` prefix." + ) + elif os.path.isdir(test_file_or_directory): + return + else: + raise ValueError( + "Provided path is neither a valid file nor a directory." + ) @app.command() def run( test_file_or_directory: str, - verbose: bool = False, + verbose: bool = True, color: str = "yes", durations: int = 10, pdb: bool = False, @@ -98,7 +105,8 @@ def run( ] = False, ): """Run a test""" - pytest_args = ["-k", test_file_or_directory] + check_if_legit_file(test_file_or_directory) + pytest_args = [test_file_or_directory] if exit_on_first_failure: pytest_args.insert(0, "-x") @@ -111,9 +119,10 @@ def run( "--verbose" if verbose else "--quiet", f"--color={color}", f"--durations={durations}", - "--pdb" if pdb else "", ] ) + if pdb: + pytest_args.append("--pdb") # Add the deepeval plugin file to pytest arguments pytest_args.extend(["-p", "plugins"]) @@ -122,7 +131,6 @@ def run( TextColumn("[progress.description]{task.description}"), transient=True, ) as progress: - # progress.add_task(description="Preparing tests...", total=None) progress.add_task( description="Downloading models (may take up to 2 minutes if running for the first time)...", total=None, diff --git a/deepeval/dataset.py b/deepeval/dataset.py index 7057c5b7c..994a6114f 100644 --- a/deepeval/dataset.py +++ b/deepeval/dataset.py @@ -76,6 +76,54 @@ def from_csv( def from_test_cases(self, test_cases: list): self.data = test_cases + @classmethod + def from_hf_dataset( + cls, + dataset_name: str, + split: str, + query_column: str, + expected_output_column: str, + context_column: str = None, + output_column: str = None, + id_column: str = None, + ): + """ + Load test cases from a HuggingFace dataset. + + Args: + dataset_name (str): The name of the HuggingFace dataset to load. + split (str): The split of the dataset to load (e.g., 'train', 'test'). + query_column (str): The column in the dataset corresponding to the query. 
+ expected_output_column (str): The column in the dataset corresponding to the expected output. + context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None. + output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None. + id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None. + + Returns: + EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases. + """ + try: + from datasets import load_dataset + except ImportError: + raise ImportError( + "The 'datasets' library is missing. Please install it using pip: pip install datasets" + ) + + hf_dataset = load_dataset(dataset_name, split=split) + test_cases = [] + + for i, row in enumerate(hf_dataset): + test_cases.append( + LLMTestCase( + query=row[query_column], + expected_output=row[expected_output_column], + context=row[context_column] if context_column else None, + output=row[output_column] if output_column else None, + id=row[id_column] if id_column else None, + ) + ) + return cls(test_cases) + @classmethod def from_json( cls, diff --git a/deepeval/metrics/bias_classifier.py b/deepeval/metrics/bias_classifier.py index fbfab5e41..7acc81776 100644 --- a/deepeval/metrics/bias_classifier.py +++ b/deepeval/metrics/bias_classifier.py @@ -25,7 +25,7 @@ def __call__(self, output, expected_output, query: Optional[str] = "-"): success = score >= self.minimum_score return score - def measure(self, test_case: LLMTestCase): + def measure(self, test_case: LLMTestCase, return_all_scores: bool = False): if test_case.output is None: raise ValueError("Required attributes for test_case cannot be None") @@ -49,6 +49,8 @@ def measure(self, test_case: LLMTestCase): self.success = True self.score = v + if return_all_scores: + return results return v def is_successful(self): diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py new file mode 100644 index 000000000..4ba3cdd34 --- /dev/null +++ b/deepeval/metrics/ragas_metric.py @@ -0,0 +1,357 @@ +"""An implementation of the Ragas metric +""" +import os +import numpy as np +from deepeval.metrics.metric import Metric +from deepeval.test_case import LLMTestCase +from typing import List + + +class ContextualRelevancyRagasMetric(Metric): + """This metric checks the contextual relevancy using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + # Adding a list of metrics + from ragas.metrics import context_relevancy + + self.metrics = [context_relevancy] + + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." 
+ ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + + # Evaluate the dataset using Ragas + scores = evaluate(dataset, metrics=self.metrics) + + # Ragas only does dataset-level comparisons + context_relevancy_score = scores["context_relevancy"] + self.success = context_relevancy_score >= self.minimum_score + self.score = context_relevancy_score + return context_relevancy_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Contextual Relevancy Ragas Score" + + +class AnswerRelevancyRagasMetric(Metric): + """This metric checks the answer relevancy using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics import answer_relevancy + + self.metrics = [answer_relevancy] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." + ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + answer_relevancy_score = scores["answer_relevancy"] + self.success = answer_relevancy_score >= self.minimum_score + self.score = answer_relevancy_score + return answer_relevancy_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Answer Relevancy Ragas Score" + + +class FaithfulnessRagasMetric(Metric): + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics import faithfulness + + self.metrics = [faithfulness] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." 
+ ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + faithfulness_score = scores["faithfulness"] + self.success = faithfulness_score >= self.minimum_score + self.score = faithfulness_score + return faithfulness_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Faithfulness Ragas Score" + + +class ContextRecallRagasMetric(Metric): + """This metric checks the context recall using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics import context_recall + + self.metrics = [context_recall] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." + ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + context_recall_score = scores["context_recall"] + self.success = context_recall_score >= self.minimum_score + self.score = context_recall_score + return context_recall_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Context Recall Ragas Score" + + +class HarmfulnessRagasMetric(Metric): + """This metric checks the harmfulness using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics.critique import harmfulness + + self.metrics = [harmfulness] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." 
+ ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + harmfulness_score = scores["harmfulness"] + self.success = harmfulness_score >= self.minimum_score + self.score = harmfulness_score + return harmfulness_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Harmfulness Ragas Score" + + +class RagasMetric(Metric): + """This metric checks if the output is more than 3 letters""" + + def __init__( + self, + metrics: List[Metric] = None, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + if metrics is None: + self.metrics = [ + HarmfulnessRagasMetric, + ContextRecallRagasMetric, + FaithfulnessRagasMetric, + AnswerRelevancyRagasMetric, + ContextualRelevancyRagasMetric, + ] + else: + self.metrics = metrics + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." + ) + + try: + # How do i make sure this isn't just huggingface dataset + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + # Convert the LLMTestCase to a format compatible with Dataset + scores = [] + for metric in self.metrics: + m = metric() + score = m.measure(test_case) + scores.append(score) + + # ragas score is harmonic mean of all the scores + if len(scores) > 0: + ragas_score = len(scores) / sum( + 1.0 / score for score in scores if score != 0 + ) + else: + ragas_score = 0 + + # Ragas only does dataset-level comparisons + # >>> print(result["ragas_score"]) + # {'ragas_score': 0.860, 'context_relevancy': 0.817, 'faithfulness': 0.892, + # 'answer_relevancy': 0.874} + self.success = ragas_score >= self.minimum_score + self.score = ragas_score + return ragas_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Ragas Score" + + +def assert_ragas( + test_case: LLMTestCase, + metrics: List[str] = None, + minimum_score: float = 0.3, +): + """Asserts if the Ragas score is above the minimum score""" + metric = RagasMetric(metrics, minimum_score) + score = metric.measure(test_case) + assert ( + score >= metric.minimum_score + ), f"Ragas score {score} is below the minimum score {metric.minimum_score}" diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 56edac02a..a4f31860b 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -19,18 +19,18 @@ def pytest_sessionstart(session): @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_sessionfinish(session, exitstatus): # Code before yield will run before the test teardown - api: Api = Api() # yield control back to pytest for the actual teardown yield # Code after yield will run after the test teardown - if os.getenv(PYTEST_RUN_ENV_VAR): + if os.getenv(PYTEST_RUN_ENV_VAR) and os.path.exists(".deepeval"): + api: Api = Api() test_run = TestRun.load(test_filename) result = api.post_test_run(test_run) run_id = result["id"] print( "✅ Tests finished! 
View results on " - + f"https://app.confident-ai.com/unit-tests/{run_id}/test-cases" + + f"https://app.confident-ai.com/unit-tests/{run_id}" ) os.remove(test_filename) diff --git a/tests/test_ragas.py b/tests/test_ragas.py new file mode 100644 index 000000000..f00a36306 --- /dev/null +++ b/tests/test_ragas.py @@ -0,0 +1,25 @@ +import pytest +from deepeval.test_case import LLMTestCase +from deepeval.metrics.ragas_metric import RagasMetric +from deepeval.run_test import assert_test + + +query = "Who won the FIFA World Cup in 2018?" +output = "Winners of the FIFA world cup were the French national football team" +expected_output = "French national football team" +context = "The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship." + + +@pytest.mark.skip(reason="openai is expensive") +def test_overall_score(): + test_case = LLMTestCase( + query=query, + output=output, + expected_output=expected_output, + context=context, + ) + metric = RagasMetric() + assert_test( + test_cases=[test_case], + metrics=[metric], + )