From 7c64751af719dfcf1a3860d4000edee91c0959da Mon Sep 17 00:00:00 2001
From: Jon Bennion
Date: Fri, 22 Sep 2023 00:05:30 -0700
Subject: [PATCH 01/14] Added answer_length.py

---
 .idea/.gitignore                               |  8 +++++++
 .idea/deepeval.iml                             | 15 +++++++++++++
 .idea/inspectionProfiles/Project_Default.xml   | 16 ++++++++++++++
 .../inspectionProfiles/profiles_settings.xml   |  6 ++++++
 .idea/misc.xml                                 |  4 ++++
 .idea/modules.xml                              |  8 +++++++
 .idea/vcs.xml                                  |  6 ++++++
 deepeval/metrics/answer_length.py              | 21 +++++++++++++++++++
 8 files changed, 84 insertions(+)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/deepeval.iml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 deepeval/metrics/answer_length.py

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 000000000..13566b81b
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/deepeval.iml b/.idea/deepeval.iml
new file mode 100644
index 000000000..5fdd65ba2
--- /dev/null
+++ b/.idea/deepeval.iml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 000000000..370714195
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,16 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 000000000..105ce2da2
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 000000000..66d59f811
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 000000000..56171818d
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000..35eb1ddfb
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/deepeval/metrics/answer_length.py b/deepeval/metrics/answer_length.py
new file mode 100644
index 000000000..d59ca7bc6
--- /dev/null
+++ b/deepeval/metrics/answer_length.py
@@ -0,0 +1,21 @@
+from .metric import Metric
+from deepeval.test_case import LLMTestCase
+
+class LengthMetric(Metric):
+    """This metric checks if the output is more than 1 char"""
+    def __init__(self, minimum_length: int=1):
+        self.minimum_length = minimum_length
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        score = len(test_case.output)
+        self.success = score > self.minimum_length
+        return score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "AnswerLength"
+
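A minimal usage sketch for the LengthMetric added in this patch (illustrative only; the LLMTestCase fields mirror the ones used by the tests later in this series, and the strings are made up):

    from deepeval.test_case import LLMTestCase
    from deepeval.metrics.answer_length import LengthMetric

    # Build a test case the same way the repo's tests do and score its output length.
    test_case = LLMTestCase(
        query="What is deepeval?",
        output="deepeval is a framework for unit testing LLM outputs.",
        expected_output="A framework for evaluating LLM outputs.",
    )
    metric = LengthMetric(minimum_length=10)
    score = metric.measure(test_case)  # returns len(test_case.output)
    assert metric.is_successful(), f"Output too short: {score} characters"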
From ed17ef4a9ff6b039f80a9739514384caa79b5177 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 22 Sep 2023 17:11:59 -0700
Subject: [PATCH 02/14] update ragas metric and dataset

---
 deepeval/dataset.py              | 48 +++++++++++++++++
 deepeval/metrics/ragas_metric.py | 89 ++++++++++++++++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 deepeval/metrics/ragas_metric.py

diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index 7057c5b7c..994a6114f 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -76,6 +76,54 @@ def from_csv(
     def from_test_cases(self, test_cases: list):
         self.data = test_cases
 
+    @classmethod
+    def from_hf_dataset(
+        cls,
+        dataset_name: str,
+        split: str,
+        query_column: str,
+        expected_output_column: str,
+        context_column: str = None,
+        output_column: str = None,
+        id_column: str = None,
+    ):
+        """
+        Load test cases from a HuggingFace dataset.
+
+        Args:
+            dataset_name (str): The name of the HuggingFace dataset to load.
+            split (str): The split of the dataset to load (e.g., 'train', 'test').
+            query_column (str): The column in the dataset corresponding to the query.
+            expected_output_column (str): The column in the dataset corresponding to the expected output.
+            context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None.
+            output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None.
+            id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None.
+
+        Returns:
+            EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
+        """
+        try:
+            from datasets import load_dataset
+        except ImportError:
+            raise ImportError(
+                "The 'datasets' library is missing. Please install it using pip: pip install datasets"
+            )
+
+        hf_dataset = load_dataset(dataset_name, split=split)
+        test_cases = []
+
+        for i, row in enumerate(hf_dataset):
+            test_cases.append(
+                LLMTestCase(
+                    query=row[query_column],
+                    expected_output=row[expected_output_column],
+                    context=row[context_column] if context_column else None,
+                    output=row[output_column] if output_column else None,
+                    id=row[id_column] if id_column else None,
+                )
+            )
+        return cls(test_cases)
+
     @classmethod
     def from_json(
         cls,
diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py
new file mode 100644
index 000000000..356b4319d
--- /dev/null
+++ b/deepeval/metrics/ragas_metric.py
@@ -0,0 +1,89 @@
+"""An implementation of the Ragas metric
+"""
+import os
+from deepeval.metrics.metric import Metric
+from deepeval.test_case import LLMTestCase
+from deepeval.run_test import run_test
+from typing import List
+
+
+class RagasMetric(Metric):
+    """This metric checks if the output is more than 3 letters"""
+
+    def __init__(
+        self,
+        openai_api_key: str,
+        metrics: List[str] = None,
+        minimum_score: float = 0.3,
+    ):
+        self.minimum_score = minimum_score
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        if metrics is None:
+            try:
+                # Adding a list of metrics
+                from ragas.metrics import (
+                    context_relevancy,
+                    answer_relevancy,
+                    faithfulness,
+                    context_recall,
+                )
+                from ragas.metrics.critique import harmfulness
+
+                self.metrics = [
+                    context_relevancy,
+                    answer_relevancy,
+                    faithfulness,
+                    context_recall,
+                    harmfulness,
+                ]
+
+            except ModuleNotFoundError as e:
+                print(
+                    "Please install ragas to use this metric. `pip install ragas`."
+                )
+        else:
+            metrics = self.metrics
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        try:
+            from ragas import evaluate
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+        try:
+            # How do i make sure this isn't just huggingface dataset
+            from datasets import Dataset
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install dataset")
+
+        # Create a dataset from the test case
+        # Convert the LLMTestCase to a format compatible with Dataset
+        data = {
+            "expected_output": [test_case.expected_output],
+            "contexts": [test_case.context],
+            "output": [test_case.output],
+            "id": [test_case.id],
+        }
+        dataset = Dataset.from_dict(data)
+
+        # Evaluate the dataset using Ragas
+        scores = evaluate(dataset, metrics=self.metrics)
+
+        # Ragas only does dataset-level comparisons
+        # >>> print(result["ragas_score"])
+        # {'ragas_score': 0.860, 'context_relevancy': 0.817, 'faithfulness': 0.892,
+        # 'answer_relevancy': 0.874}
+        ragas_score = scores["ragas_score"]
+        self.success = ragas_score >= self.minimum_score
+        self.score = ragas_score
+        return ragas_score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Ragas Score"
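A short sketch of how the new EvaluationDataset.from_hf_dataset classmethod might be called; the dataset name and column names below are placeholders rather than a dataset this repo actually uses:

    from deepeval.dataset import EvaluationDataset

    # Hypothetical HuggingFace dataset and columns, chosen only to illustrate the mapping.
    dataset = EvaluationDataset.from_hf_dataset(
        dataset_name="my-org/qa-eval-set",
        split="test",
        query_column="question",
        expected_output_column="reference_answer",
        context_column="context",
    )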
From 03123db5fd7914f9b7d97c596295f5b729939538 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 22 Sep 2023 17:21:48 -0700
Subject: [PATCH 03/14] update ragas test and metric

---
 .github/workflows/test.yml       |  2 +-
 deepeval/metrics/ragas_metric.py | 14 ++++++++++++++
 tests/test_ragas.py              | 28 ++++++++++++++++++++++++++++
 3 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_ragas.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 73f6cdf71..35c280841 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -37,7 +37,7 @@ jobs:
       run: |
         python -c "import sys; print(sys.version)"
         python -m pip install --upgrade pip
-        python -m pip install -r requirements.txt
+        python -m pip install -r requirements.txt ragas
         python -m pip install . pytest-rerunfailures pytest-asyncio
 
     - name: Run Unit Tests (pytest)
diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py
index 356b4319d..714564eba 100644
--- a/deepeval/metrics/ragas_metric.py
+++ b/deepeval/metrics/ragas_metric.py
@@ -87,3 +87,17 @@ def is_successful(self):
     @property
     def __name__(self):
         return "Ragas Score"
+
+
+def assert_ragas(
+    test_case: LLMTestCase,
+    openai_api_key: str,
+    metrics: List[str] = None,
+    minimum_score: float = 0.3,
+):
+    """Asserts if the Ragas score is above the minimum score"""
+    metric = RagasMetric(openai_api_key, metrics, minimum_score)
+    score = metric.measure(test_case)
+    assert (
+        score >= metric.minimum_score
+    ), f"Ragas score {score} is below the minimum score {metric.minimum_score}"
diff --git a/tests/test_ragas.py b/tests/test_ragas.py
new file mode 100644
index 000000000..2ec529f05
--- /dev/null
+++ b/tests/test_ragas.py
@@ -0,0 +1,28 @@
+import pytest
+import os
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics.ragas_metric import RagasMetric
+from deepeval.run_test import assert_test
+
+metric = RagasMetric()
+
+query = "Who won the FIFA World Cup in 2018?"
+output = "Winners of the FIFA world cup were the French national football team"
+expected_output = "French national football team"
+context = "The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship."
+
+
+class TestOverallScore:
+    metric = RagasMetric()
+
+    def test_overall_score(self):
+        test_case = LLMTestCase(
+            query=query,
+            output=output,
+            expected_output=expected_output,
+            context=context,
+        )
+        assert_test(
+            test_cases=[test_case],
+            metrics=[metric],
+        )
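For reference, a hedged sketch of calling the assert_ragas helper as it is defined at this point in the series (later patches change its signature); the API key is read from the environment purely for illustration:

    import os
    from deepeval.test_case import LLMTestCase
    from deepeval.metrics.ragas_metric import assert_ragas

    def test_world_cup_answer():
        test_case = LLMTestCase(
            query="Who won the FIFA World Cup in 2018?",
            output="The French national football team won the 2018 FIFA World Cup.",
            expected_output="French national football team",
            context="France defeated Croatia 4-2 in the 2018 final.",
        )
        # Fails the test if the combined Ragas score drops below minimum_score.
        assert_ragas(
            test_case,
            openai_api_key=os.environ["OPENAI_API_KEY"],
            minimum_score=0.3,
        )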
From 157a77cde441ae253461fee47bd78ab15b71f988 Mon Sep 17 00:00:00 2001
From: Jon Bennion
Date: Fri, 22 Sep 2023 17:46:01 -0700
Subject: [PATCH 04/14] applied black reformat

---
 deepeval/metrics/answer_length.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/deepeval/metrics/answer_length.py b/deepeval/metrics/answer_length.py
index d59ca7bc6..318030924 100644
--- a/deepeval/metrics/answer_length.py
+++ b/deepeval/metrics/answer_length.py
@@ -1,9 +1,11 @@
 from .metric import Metric
 from deepeval.test_case import LLMTestCase
 
+
 class LengthMetric(Metric):
     """This metric checks if the output is more than 1 char"""
-    def __init__(self, minimum_length: int=1):
+
+    def __init__(self, minimum_length: int = 1):
         self.minimum_length = minimum_length
 
     def measure(self, test_case: LLMTestCase):
@@ -18,4 +20,3 @@ def is_successful(self):
     @property
     def __name__(self):
         return "AnswerLength"
-

From 7b5c20004aa32d051b1cbfeb241d0fe355f0881b Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Fri, 22 Sep 2023 18:07:22 -0700
Subject: [PATCH 05/14] add tests

---
 tests/test_ragas.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_ragas.py b/tests/test_ragas.py
index 2ec529f05..6c2feaa58 100644
--- a/tests/test_ragas.py
+++ b/tests/test_ragas.py
@@ -4,7 +4,6 @@ import pytest
 from deepeval.metrics.ragas_metric import RagasMetric
 from deepeval.run_test import assert_test
 
-metric = RagasMetric()
 
 query = "Who won the FIFA World Cup in 2018?"
 output = "Winners of the FIFA world cup were the French national football team"
 expected_output = "French national football team"
@@ -13,8 +12,6 @@
 
 
 class TestOverallScore:
-    metric = RagasMetric()
-
     def test_overall_score(self):
         test_case = LLMTestCase(
             query=query,
@@ -22,6 +19,7 @@ def test_overall_score(self):
             expected_output=expected_output,
             context=context,
         )
+        metric = RagasMetric()
         assert_test(
             test_cases=[test_case],
             metrics=[metric],
         )
From ade9053b20e50e6224808b842c8b26dc0d3c3ddf Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sat, 23 Sep 2023 05:13:05 -0700
Subject: [PATCH 06/14] update the ragas metric

---
 deepeval/metrics/ragas_metric.py | 325 ++++++++++++++++++++++++++++---
 1 file changed, 293 insertions(+), 32 deletions(-)

diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py
index 714564eba..fcfe8ca87 100644
--- a/deepeval/metrics/ragas_metric.py
+++ b/deepeval/metrics/ragas_metric.py
@@ -1,48 +1,32 @@
 """An implementation of the Ragas metric
 """
 import os
+import numpy as np
 from deepeval.metrics.metric import Metric
 from deepeval.test_case import LLMTestCase
-from deepeval.run_test import run_test
 from typing import List
 
 
-class RagasMetric(Metric):
-    """This metric checks if the output is more than 3 letters"""
+class ContextualRelevancyRagasMetric(Metric):
+    """This metric checks the contextual relevancy using Ragas"""
 
     def __init__(
         self,
         openai_api_key: str,
-        metrics: List[str] = None,
         minimum_score: float = 0.3,
     ):
         self.minimum_score = minimum_score
         os.environ["OPENAI_API_KEY"] = openai_api_key
-        if metrics is None:
-            try:
-                # Adding a list of metrics
-                from ragas.metrics import (
-                    context_relevancy,
-                    answer_relevancy,
-                    faithfulness,
-                    context_recall,
-                )
-                from ragas.metrics.critique import harmfulness
-
-                self.metrics = [
-                    context_relevancy,
-                    answer_relevancy,
-                    faithfulness,
-                    context_recall,
-                    harmfulness,
-                ]
-
-            except ModuleNotFoundError as e:
-                print(
-                    "Please install ragas to use this metric. `pip install ragas`."
-                )
-        else:
-            metrics = self.metrics
+        try:
+            # Adding a list of metrics
+            from ragas.metrics import context_relevancy
+
+            self.metrics = [context_relevancy]
+
+        except ModuleNotFoundError as e:
+            print(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
 
     def measure(self, test_case: LLMTestCase):
         # sends to server
@@ -54,13 +38,11 @@ def measure(self, test_case: LLMTestCase):
             )
 
         try:
-            # How do i make sure this isn't just huggingface dataset
             from datasets import Dataset
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install dataset")
 
         # Create a dataset from the test case
-        # Convert the LLMTestCase to a format compatible with Dataset
         data = {
             "expected_output": [test_case.expected_output],
             "contexts": [test_case.context],
             "output": [test_case.output],
             "id": [test_case.id],
         }
         dataset = Dataset.from_dict(data)
 
         # Evaluate the dataset using Ragas
         scores = evaluate(dataset, metrics=self.metrics)
 
+        # Ragas only does dataset-level comparisons
+        context_relevancy_score = scores["context_relevancy"]
+        self.success = context_relevancy_score >= self.minimum_score
+        self.score = context_relevancy_score
+        return context_relevancy_score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Contextual Relevancy Ragas Score"
+
+
+class AnswerRelevancyRagasMetric(Metric):
+    """This metric checks the answer relevancy using Ragas"""
+
+    def __init__(
+        self,
+        openai_api_key: str,
+        minimum_score: float = 0.3,
+    ):
+        self.minimum_score = minimum_score
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        try:
+            from ragas.metrics import answer_relevancy
+
+            self.metrics = [answer_relevancy]
+        except ModuleNotFoundError as e:
+            print(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        try:
+            from ragas import evaluate
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+        try:
+            from datasets import Dataset
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install dataset")
+
+        data = {
+            "expected_output": [test_case.expected_output],
+            "contexts": [test_case.context],
+            "output": [test_case.output],
+            "id": [test_case.id],
+        }
+        dataset = Dataset.from_dict(data)
+        scores = evaluate(dataset, metrics=self.metrics)
+        answer_relevancy_score = scores["answer_relevancy"]
+        self.success = answer_relevancy_score >= self.minimum_score
+        self.score = answer_relevancy_score
+        return answer_relevancy_score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Answer Relevancy Ragas Score"
+
+
+class FaithfulnessRagasMetric(Metric):
+    def __init__(
+        self,
+        openai_api_key: str,
+        minimum_score: float = 0.3,
+    ):
+        self.minimum_score = minimum_score
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        try:
+            from ragas.metrics import faithfulness
+
+            self.metrics = [faithfulness]
+        except ModuleNotFoundError as e:
+            print(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        try:
+            from ragas import evaluate
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+        try:
+            from datasets import Dataset
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install dataset")
+
+        data = {
+            "expected_output": [test_case.expected_output],
+            "contexts": [test_case.context],
+            "output": [test_case.output],
+            "id": [test_case.id],
+        }
+        dataset = Dataset.from_dict(data)
+        scores = evaluate(dataset, metrics=self.metrics)
+        faithfulness_score = scores["faithfulness"]
+        self.success = faithfulness_score >= self.minimum_score
+        self.score = faithfulness_score
+        return faithfulness_score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Faithfulness Ragas Score"
+
+
+class ContextRecallRagasMetric(Metric):
+    """This metric checks the context recall using Ragas"""
+
+    def __init__(
+        self,
+        openai_api_key: str,
+        minimum_score: float = 0.3,
+    ):
+        self.minimum_score = minimum_score
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        try:
+            from ragas.metrics import context_recall
+
+            self.metrics = [context_recall]
+        except ModuleNotFoundError as e:
+            print(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        try:
+            from ragas import evaluate
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+        try:
+            from datasets import Dataset
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install dataset")
+
+        data = {
+            "expected_output": [test_case.expected_output],
+            "contexts": [test_case.context],
+            "output": [test_case.output],
+            "id": [test_case.id],
+        }
+        dataset = Dataset.from_dict(data)
+        scores = evaluate(dataset, metrics=self.metrics)
+        context_recall_score = scores["context_recall"]
+        self.success = context_recall_score >= self.minimum_score
+        self.score = context_recall_score
+        return context_recall_score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Context Recall Ragas Score"
+
+
+class HarmfulnessRagasMetric(Metric):
+    """This metric checks the harmfulness using Ragas"""
+
+    def __init__(
+        self,
+        openai_api_key: str,
+        minimum_score: float = 0.3,
+    ):
+        self.minimum_score = minimum_score
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        try:
+            from ragas.metrics.critique import harmfulness
+
+            self.metrics = [harmfulness]
+        except ModuleNotFoundError as e:
+            print(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        try:
+            from ragas import evaluate
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+        try:
+            from datasets import Dataset
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install dataset")
+
+        data = {
+            "expected_output": [test_case.expected_output],
+            "contexts": [test_case.context],
+            "output": [test_case.output],
+            "id": [test_case.id],
+        }
+        dataset = Dataset.from_dict(data)
+        scores = evaluate(dataset, metrics=self.metrics)
+        harmfulness_score = scores["harmfulness"]
+        self.success = harmfulness_score >= self.minimum_score
+        self.score = harmfulness_score
+        return harmfulness_score
+
+    def is_successful(self):
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Harmfulness Ragas Score"
+
+
+class RagasMetric(Metric):
+    """This metric checks if the output is more than 3 letters"""
+
+    def __init__(
+        self,
+        openai_api_key: str,
+        metrics: List[Metric] = None,
+        minimum_score: float = 0.3,
+    ):
+        self.minimum_score = minimum_score
+        os.environ["OPENAI_API_KEY"] = openai_api_key
+        if metrics is None:
+            self.metrics = [
+                HarmfulnessRagasMetric,
+                ContextRecallRagasMetric,
+                FaithfulnessRagasMetric,
+                AnswerRelevancyRagasMetric,
+                ContextualRelevancyRagasMetric,
+            ]
+        else:
+            self.metrics = metrics
+
+    def measure(self, test_case: LLMTestCase):
+        # sends to server
+        try:
+            from ragas import evaluate
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "Please install ragas to use this metric. `pip install ragas`."
+            )
+
+        try:
+            # How do i make sure this isn't just huggingface dataset
+            from datasets import Dataset
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError("Please install dataset")
+
+        # Create a dataset from the test case
+        # Convert the LLMTestCase to a format compatible with Dataset
+        scores = []
+        for metric in self.metrics:
+            score = metric.measure(test_case)
+            scores.append(score)
+
+        # ragas score is harmonic mean of all the scores
+        if len(scores) > 0:
+            ragas_score = len(scores) / sum(
+                1.0 / score for score in scores if score != 0
+            )
+        else:
+            ragas_score = 0
+
         # Ragas only does dataset-level comparisons
         # >>> print(result["ragas_score"])
         # {'ragas_score': 0.860, 'context_relevancy': 0.817, 'faithfulness': 0.892,
         # 'answer_relevancy': 0.874}
-        ragas_score = scores["ragas_score"]
         self.success = ragas_score >= self.minimum_score
         self.score = ragas_score
         return ragas_score
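Patch 06 combines the per-metric scores with a harmonic mean. A self-contained sketch of that aggregation (plain Python, no Ragas calls, made-up scores) shows how a single weak sub-score drags the combined score down:

    from typing import List

    def ragas_harmonic_mean(scores: List[float]) -> float:
        # Mirrors the aggregation in RagasMetric.measure above: the reciprocal
        # sum skips zero scores, while the numerator counts every score.
        if not scores or all(s == 0 for s in scores):
            return 0.0
        return len(scores) / sum(1.0 / s for s in scores if s != 0)

    print(ragas_harmonic_mean([0.82, 0.87, 0.89]))  # ~0.86
    print(ragas_harmonic_mean([0.82, 0.87, 0.20]))  # ~0.41: the low score dominates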
"Test will not run. Please ensure the `test_` prefix." - ) +def check_if_legit_file(test_file_or_directory: str): + if os.path.isfile(test_file_or_directory): + if test_file_or_directory.endswith(".py"): + if not os.path.basename(test_file_or_directory).startswith("test_"): + raise ValueError( + "Test will not run. Please ensure the file starts with `test_` prefix." + ) + elif os.path.isdir(test_file_or_directory): + for filename in os.listdir(test_file_or_directory): + if filename.endswith(".py"): + if not filename.startswith("test_"): + raise ValueError( + "Test will not run. Please ensure all files in the directory start with `test_` prefix." + ) + else: + raise ValueError( + "Provided path is neither a valid file nor a directory." + ) @app.command() def run( test_file_or_directory: str, - verbose: bool = False, + verbose: bool = True, color: str = "yes", durations: int = 10, pdb: bool = False, @@ -98,7 +110,8 @@ def run( ] = False, ): """Run a test""" - pytest_args = ["-k", test_file_or_directory] + check_if_legit_file(test_file_or_directory) + pytest_args = [test_file_or_directory] if exit_on_first_failure: pytest_args.insert(0, "-x") @@ -111,9 +124,10 @@ def run( "--verbose" if verbose else "--quiet", f"--color={color}", f"--durations={durations}", - "--pdb" if pdb else "", ] ) + if pdb: + pytest_args.append("--pdb") # Add the deepeval plugin file to pytest arguments pytest_args.extend(["-p", "plugins"]) @@ -122,7 +136,6 @@ def run( TextColumn("[progress.description]{task.description}"), transient=True, ) as progress: - # progress.add_task(description="Preparing tests...", total=None) progress.add_task( description="Downloading models (may take up to 2 minutes if running for the first time)...", total=None, diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py index fcfe8ca87..5ade815f5 100644 --- a/deepeval/metrics/ragas_metric.py +++ b/deepeval/metrics/ragas_metric.py @@ -12,11 +12,9 @@ class ContextualRelevancyRagasMetric(Metric): def __init__( self, - openai_api_key: str, minimum_score: float = 0.3, ): self.minimum_score = minimum_score - os.environ["OPENAI_API_KEY"] = openai_api_key try: # Adding a list of metrics from ragas.metrics import context_relevancy @@ -46,7 +44,8 @@ def measure(self, test_case: LLMTestCase): data = { "expected_output": [test_case.expected_output], "contexts": [test_case.context], - "output": [test_case.output], + "question": [test_case.query], + "answer": [test_case.output], "id": [test_case.id], } dataset = Dataset.from_dict(data) @@ -73,11 +72,9 @@ class AnswerRelevancyRagasMetric(Metric): def __init__( self, - openai_api_key: str, minimum_score: float = 0.3, ): self.minimum_score = minimum_score - os.environ["OPENAI_API_KEY"] = openai_api_key try: from ragas.metrics import answer_relevancy @@ -104,7 +101,8 @@ def measure(self, test_case: LLMTestCase): data = { "expected_output": [test_case.expected_output], "contexts": [test_case.context], - "output": [test_case.output], + "question": [test_case.query], + "answer": [test_case.output], "id": [test_case.id], } dataset = Dataset.from_dict(data) @@ -125,11 +123,9 @@ def __name__(self): class FaithfulnessRagasMetric(Metric): def __init__( self, - openai_api_key: str, minimum_score: float = 0.3, ): self.minimum_score = minimum_score - os.environ["OPENAI_API_KEY"] = openai_api_key try: from ragas.metrics import faithfulness @@ -156,7 +152,8 @@ def measure(self, test_case: LLMTestCase): data = { "expected_output": [test_case.expected_output], "contexts": [test_case.context], - 
"output": [test_case.output], + "question": [test_case.query], + "answer": [test_case.output], "id": [test_case.id], } dataset = Dataset.from_dict(data) @@ -210,7 +207,8 @@ def measure(self, test_case: LLMTestCase): data = { "expected_output": [test_case.expected_output], "contexts": [test_case.context], - "output": [test_case.output], + "question": [test_case.query], + "answer": [test_case.output], "id": [test_case.id], } dataset = Dataset.from_dict(data) @@ -264,7 +262,8 @@ def measure(self, test_case: LLMTestCase): data = { "expected_output": [test_case.expected_output], "contexts": [test_case.context], - "output": [test_case.output], + "question": [test_case.query], + "answer": [test_case.output], "id": [test_case.id], } dataset = Dataset.from_dict(data) @@ -287,12 +286,10 @@ class RagasMetric(Metric): def __init__( self, - openai_api_key: str, metrics: List[Metric] = None, minimum_score: float = 0.3, ): self.minimum_score = minimum_score - os.environ["OPENAI_API_KEY"] = openai_api_key if metrics is None: self.metrics = [ HarmfulnessRagasMetric, @@ -323,7 +320,8 @@ def measure(self, test_case: LLMTestCase): # Convert the LLMTestCase to a format compatible with Dataset scores = [] for metric in self.metrics: - score = metric.measure(test_case) + m = metric() + score = m.measure(test_case) scores.append(score) # ragas score is harmonic mean of all the scores diff --git a/tests/test_ragas.py b/tests/test_ragas.py index 6c2feaa58..89331aa4d 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -11,16 +11,15 @@ context = "The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship." -class TestOverallScore: - def test_overall_score(self): - test_case = LLMTestCase( - query=query, - output=output, - expected_output=expected_output, - context=context, - ) - metric = RagasMetric() - assert_test( - test_cases=[test_case], - metrics=[metric], - ) +def test_overall_score(): + test_case = LLMTestCase( + query=query, + output=output, + expected_output=expected_output, + context=context, + ) + metric = RagasMetric() + assert_test( + test_cases=[test_case], + metrics=[metric], + ) From 0d3d4ad8f22b40e2fb3ba1af282a24fced66f3a2 Mon Sep 17 00:00:00 2001 From: Jacky Wong Date: Sat, 23 Sep 2023 06:35:05 -0700 Subject: [PATCH 08/14] add support for RAGAS metric --- deepeval/metrics/ragas_metric.py | 37 ++++++++++++++------------------ tests/test_ragas.py | 1 + 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py index 5ade815f5..4ba3cdd34 100644 --- a/deepeval/metrics/ragas_metric.py +++ b/deepeval/metrics/ragas_metric.py @@ -42,11 +42,11 @@ def measure(self, test_case: LLMTestCase): # Create a dataset from the test case data = { - "expected_output": [test_case.expected_output], - "contexts": [test_case.context], + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], "question": [test_case.query], "answer": [test_case.output], - "id": [test_case.id], + "id": [[test_case.id]], } dataset = Dataset.from_dict(data) @@ -99,11 +99,11 @@ def measure(self, test_case: LLMTestCase): raise ModuleNotFoundError("Please install dataset") data = { - "expected_output": [test_case.expected_output], - "contexts": [test_case.context], + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], "question": [test_case.query], "answer": [test_case.output], - "id": [test_case.id], + "id": 
From 723547c7c590af6514cf11137a043dec598679f0 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sat, 23 Sep 2023 07:20:49 -0700
Subject: [PATCH 09/14] update the test CLI

---
 deepeval/cli/test.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py
index 07d69fca9..5dd772cee 100644
--- a/deepeval/cli/test.py
+++ b/deepeval/cli/test.py
@@ -86,12 +86,7 @@ def check_if_legit_file(test_file_or_directory: str):
                     "Test will not run. Please ensure the file starts with `test_` prefix."
                 )
     elif os.path.isdir(test_file_or_directory):
-        for filename in os.listdir(test_file_or_directory):
-            if filename.endswith(".py"):
-                if not filename.startswith("test_"):
-                    raise ValueError(
-                        "Test will not run. Please ensure all files in the directory start with `test_` prefix."
-                    )
+        return
     else:
         raise ValueError(
             "Provided path is neither a valid file nor a directory."
         )

From 9df8adc1839ecb45237e52c5374690f8226bc4ba Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sat, 23 Sep 2023 13:13:15 -0700
Subject: [PATCH 10/14] fix ragas

---
 tests/test_ragas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_ragas.py b/tests/test_ragas.py
index 8200cd0b9..f00a36306 100644
--- a/tests/test_ragas.py
+++ b/tests/test_ragas.py
@@ -1,5 +1,4 @@
 import pytest
-import os
 from deepeval.test_case import LLMTestCase
 from deepeval.metrics.ragas_metric import RagasMetric
 from deepeval.run_test import assert_test
From 4cb971aa9b4bbb0603e7046ac25a6a4f35467996 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sat, 23 Sep 2023 17:05:17 -0700
Subject: [PATCH 11/14] add multiple metrics

---
 deepeval/_version.py                | 2 +-
 deepeval/metrics/bias_classifier.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/deepeval/_version.py b/deepeval/_version.py
index 44ff9c403..06ce73115 100644
--- a/deepeval/_version.py
+++ b/deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.16.4"
+__version__: str = "0.17.0"
diff --git a/deepeval/metrics/bias_classifier.py b/deepeval/metrics/bias_classifier.py
index fbfab5e41..7acc81776 100644
--- a/deepeval/metrics/bias_classifier.py
+++ b/deepeval/metrics/bias_classifier.py
@@ -25,7 +25,7 @@ def __call__(self, output, expected_output, query: Optional[str] = "-"):
         success = score >= self.minimum_score
         return score
 
-    def measure(self, test_case: LLMTestCase):
+    def measure(self, test_case: LLMTestCase, return_all_scores: bool = False):
         if test_case.output is None:
             raise ValueError("Required attributes for test_case cannot be None")
 
@@ -49,6 +49,8 @@ def measure(self, test_case: LLMTestCase):
             self.success = True
         self.score = v
 
+        if return_all_scores:
+            return results
         return v
 
     def is_successful(self):

From 9a62614662aa4eccca894892b8c7f323571eec70 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sat, 23 Sep 2023 22:18:45 -0700
Subject: [PATCH 12/14] make thing

---
 deepeval/plugins/plugin.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py
index 276a3b4e3..a4f31860b 100644
--- a/deepeval/plugins/plugin.py
+++ b/deepeval/plugins/plugin.py
@@ -19,13 +19,13 @@ def pytest_sessionstart(session):
 @pytest.hookimpl(tryfirst=True, hookwrapper=True)
 def pytest_sessionfinish(session, exitstatus):
     # Code before yield will run before the test teardown
-    api: Api = Api()
 
     # yield control back to pytest for the actual teardown
     yield
 
     # Code after yield will run after the test teardown
-    if os.getenv(PYTEST_RUN_ENV_VAR):
+    if os.getenv(PYTEST_RUN_ENV_VAR) and os.path.exists(".deepeval"):
+        api: Api = Api()
         test_run = TestRun.load(test_filename)
         result = api.post_test_run(test_run)
         run_id = result["id"]

From af444bd2c50e5e0a749c3c8c75fe882deec6f154 Mon Sep 17 00:00:00 2001
From: Jacky Wong
Date: Sat, 23 Sep 2023 22:19:02 -0700
Subject: [PATCH 13/14] v up

---
 deepeval/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepeval/_version.py b/deepeval/_version.py
index 06ce73115..37fc43d7e 100644
--- a/deepeval/_version.py
+++ b/deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.17.0"
+__version__: str = "0.17.1"
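Patch 11's return_all_scores flag changes what measure() returns on the bias metric. A hedged usage sketch follows; the class name UnBiasedMetric and its no-argument constructor are assumptions, since the enclosing class definition is not part of this hunk:

    from deepeval.test_case import LLMTestCase
    from deepeval.metrics.bias_classifier import UnBiasedMetric  # class name assumed

    metric = UnBiasedMetric()  # constructor arguments assumed
    test_case = LLMTestCase(
        query="Describe a typical software engineer.",
        output="Software engineers come from many different backgrounds.",
    )
    aggregate = metric.measure(test_case)                            # default: single score
    all_scores = metric.measure(test_case, return_all_scores=True)  # raw classifier results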
"0.17.0" +__version__: str = "0.17.1" From 148278cd441a772ecd418ba42bc88a64b70bf012 Mon Sep 17 00:00:00 2001 From: Jon Bennion Date: Sun, 24 Sep 2023 11:30:44 -0700 Subject: [PATCH 14/14] removing .idea files --- .idea/.gitignore | 8 -------- .idea/deepeval.iml | 15 --------------- .idea/inspectionProfiles/Project_Default.xml | 16 ---------------- .idea/inspectionProfiles/profiles_settings.xml | 6 ------ .idea/misc.xml | 4 ---- .idea/modules.xml | 8 -------- .idea/vcs.xml | 6 ------ 7 files changed, 63 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/deepeval.iml delete mode 100644 .idea/inspectionProfiles/Project_Default.xml delete mode 100644 .idea/inspectionProfiles/profiles_settings.xml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b81b..000000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/deepeval.iml b/.idea/deepeval.iml deleted file mode 100644 index 5fdd65ba2..000000000 --- a/.idea/deepeval.iml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml deleted file mode 100644 index 370714195..000000000 --- a/.idea/inspectionProfiles/Project_Default.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2..000000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 66d59f811..000000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 56171818d..000000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1ddfb..000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file