diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 73f6cdf71..35c280841 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,7 +37,7 @@ jobs: run: | python -c "import sys; print(sys.version)" python -m pip install --upgrade pip - python -m pip install -r requirements.txt + python -m pip install -r requirements.txt ragas python -m pip install . pytest-rerunfailures pytest-asyncio - name: Run Unit Tests (pytest) diff --git a/README.md b/README.md index 894d9baa9..d7b8d1d03 100644 --- a/README.md +++ b/README.md @@ -207,3 +207,18 @@ Built by the Confident AI Team. For any questions/business enquiries - please co howpublished = {\url{https://github.com/confident-ai/deepeval}}, } ``` + +# Contributors + + + + + + + + + + + + + diff --git a/deepeval/_version.py b/deepeval/_version.py index 6d34ba33a..37fc43d7e 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.16.3" +__version__: str = "0.17.1" diff --git a/deepeval/api.py b/deepeval/api.py index 5b22b18b2..2860bdec9 100644 --- a/deepeval/api.py +++ b/deepeval/api.py @@ -4,6 +4,7 @@ import requests import json import warnings +from collections import defaultdict from typing import Any, Optional from pydantic import BaseModel, Field @@ -70,29 +71,45 @@ class TestRun(BaseModel): def add_llm_test_case( self, test_case: LLMTestCase, metrics: List[Metric], run_duration: float ): - self.metric_scores.extend([MetricScore.from_metric(m) for m in metrics]) + metric_dict = defaultdict(list) + for metric in metrics: + metric_dict[metric.__name__].extend( + [metric.score] + + [ + ms.score + for ms in self.metric_scores + if ms.metric == metric.__name__ + ] + ) + self.metric_scores = [ + MetricScore(metric=metric_name, score=sum(scores) / len(scores)) + for metric_name, scores in metric_dict.items() + ] # Check if test case with the same ID already exists existing_test_case: APITestCase = next( (tc for tc in self.test_cases if tc.name == test_case.__name__), None, ) + metric_dict = defaultdict(list) + for metric in metrics: + metric_dict[metric.__name__].append(metric.score) + metrics_metadata = [ + MetricsMetadata( + metric=metric_name, + score=sum(scores) / len(scores), + minimumScore=min(scores), + ) + for metric_name, scores in metric_dict.items() + ] + success = all([metric.is_successful() for metric in metrics]) + threshold = metrics[0].minimum_score + if existing_test_case: # If it exists, append the metrics to the existing test case - existing_test_case.metricsMetadata.extend( - [ - MetricsMetadata( - metric=metric.__name__, - score=metric.score, - minimumScore=metric.minimum_score, - ) - for metric in metrics - ] - ) + existing_test_case.metricsMetadata.extend(metrics_metadata) # Update the success status and threshold - existing_test_case.success = all( - [metric.is_successful() for metric in metrics] - ) - existing_test_case.threshold = metrics[0].minimum_score + existing_test_case.success = success + existing_test_case.threshold = threshold else: # If it doesn't exist, create a new test case name = "Test " + str(len(self.test_cases) + 1) @@ -102,16 +119,9 @@ def add_llm_test_case( input=test_case.query, actualOutput=test_case.output, expectedOutput=test_case.expected_output, - success=all([metric.is_successful() for metric in metrics]), - metricsMetadata=[ - MetricsMetadata( - metric=metric.__name__, - score=metric.score, - minimumScore=metric.minimum_score, - ) - for metric in metrics - ], - threshold=metrics[0].minimum_score, + success=success, + 
metricsMetadata=metrics_metadata, + threshold=threshold, runDuration=run_duration, ) ) @@ -124,8 +134,6 @@ def save(self, file_path: Optional[str] = None): return elif not file_path.endswith(".json"): file_path = f"{file_path}.json" - print({"save_filepath", file_path}) - with open(file_path, "w") as f: json.dump(self.dict(by_alias=True, exclude_none=True), f) @@ -140,7 +148,6 @@ def load(cls, file_path: Optional[str] = None): return elif not file_path.endswith(".json"): file_path = f"{file_path}.json" - print({"load_filepath", file_path}) with open(file_path, "r") as f: return cls(**json.load(f)) @@ -461,7 +468,13 @@ def list_implementations(self): def post_test_run(self, test_run: TestRun): """Post a test run""" + try: + body = test_run.model_dump(by_alias=True) + except AttributeError: + # Pydantic version below 2.0 + body = test_run.dict(by_alias=True) + return self.post_request( endpoint="/v1/test-run", - body=test_run.model_dump(by_alias=True), + body=body, ) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 5435764bc..5dd772cee 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -78,18 +78,25 @@ def sample(): pass -def check_if_legit_file(test_file: str): - if test_file.endswith(".py"): - if not test_file.startswith("test_"): - raise ValueError( - "Test will not run. Please ensure the `test_` prefix." - ) +def check_if_legit_file(test_file_or_directory: str): + if os.path.isfile(test_file_or_directory): + if test_file_or_directory.endswith(".py"): + if not os.path.basename(test_file_or_directory).startswith("test_"): + raise ValueError( + "Test will not run. Please ensure the file starts with `test_` prefix." + ) + elif os.path.isdir(test_file_or_directory): + return + else: + raise ValueError( + "Provided path is neither a valid file nor a directory." + ) @app.command() def run( test_file_or_directory: str, - verbose: bool = False, + verbose: bool = True, color: str = "yes", durations: int = 10, pdb: bool = False, @@ -98,7 +105,8 @@ def run( ] = False, ): """Run a test""" - pytest_args = ["-k", test_file_or_directory] + check_if_legit_file(test_file_or_directory) + pytest_args = [test_file_or_directory] if exit_on_first_failure: pytest_args.insert(0, "-x") @@ -111,9 +119,10 @@ def run( "--verbose" if verbose else "--quiet", f"--color={color}", f"--durations={durations}", - "--pdb" if pdb else "", ] ) + if pdb: + pytest_args.append("--pdb") # Add the deepeval plugin file to pytest arguments pytest_args.extend(["-p", "plugins"]) @@ -122,7 +131,6 @@ def run( TextColumn("[progress.description]{task.description}"), transient=True, ) as progress: - # progress.add_task(description="Preparing tests...", total=None) progress.add_task( description="Downloading models (may take up to 2 minutes if running for the first time)...", total=None, diff --git a/deepeval/dataset.py b/deepeval/dataset.py index 7057c5b7c..994a6114f 100644 --- a/deepeval/dataset.py +++ b/deepeval/dataset.py @@ -76,6 +76,54 @@ def from_csv( def from_test_cases(self, test_cases: list): self.data = test_cases + @classmethod + def from_hf_dataset( + cls, + dataset_name: str, + split: str, + query_column: str, + expected_output_column: str, + context_column: str = None, + output_column: str = None, + id_column: str = None, + ): + """ + Load test cases from a HuggingFace dataset. + + Args: + dataset_name (str): The name of the HuggingFace dataset to load. + split (str): The split of the dataset to load (e.g., 'train', 'test'). + query_column (str): The column in the dataset corresponding to the query. 
+ expected_output_column (str): The column in the dataset corresponding to the expected output. + context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None. + output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None. + id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None. + + Returns: + EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases. + """ + try: + from datasets import load_dataset + except ImportError: + raise ImportError( + "The 'datasets' library is missing. Please install it using pip: pip install datasets" + ) + + hf_dataset = load_dataset(dataset_name, split=split) + test_cases = [] + + for i, row in enumerate(hf_dataset): + test_cases.append( + LLMTestCase( + query=row[query_column], + expected_output=row[expected_output_column], + context=row[context_column] if context_column else None, + output=row[output_column] if output_column else None, + id=row[id_column] if id_column else None, + ) + ) + return cls(test_cases) + @classmethod def from_json( cls, diff --git a/deepeval/metrics/bias_classifier.py b/deepeval/metrics/bias_classifier.py index fbfab5e41..7acc81776 100644 --- a/deepeval/metrics/bias_classifier.py +++ b/deepeval/metrics/bias_classifier.py @@ -25,7 +25,7 @@ def __call__(self, output, expected_output, query: Optional[str] = "-"): success = score >= self.minimum_score return score - def measure(self, test_case: LLMTestCase): + def measure(self, test_case: LLMTestCase, return_all_scores: bool = False): if test_case.output is None: raise ValueError("Required attributes for test_case cannot be None") @@ -49,6 +49,8 @@ def measure(self, test_case: LLMTestCase): self.success = True self.score = v + if return_all_scores: + return results return v def is_successful(self): diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py new file mode 100644 index 000000000..4ba3cdd34 --- /dev/null +++ b/deepeval/metrics/ragas_metric.py @@ -0,0 +1,357 @@ +"""An implementation of the Ragas metric +""" +import os +import numpy as np +from deepeval.metrics.metric import Metric +from deepeval.test_case import LLMTestCase +from typing import List + + +class ContextualRelevancyRagasMetric(Metric): + """This metric checks the contextual relevancy using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + # Adding a list of metrics + from ragas.metrics import context_relevancy + + self.metrics = [context_relevancy] + + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." 
+ ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + + # Evaluate the dataset using Ragas + scores = evaluate(dataset, metrics=self.metrics) + + # Ragas only does dataset-level comparisons + context_relevancy_score = scores["context_relevancy"] + self.success = context_relevancy_score >= self.minimum_score + self.score = context_relevancy_score + return context_relevancy_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Contextual Relevancy Ragas Score" + + +class AnswerRelevancyRagasMetric(Metric): + """This metric checks the answer relevancy using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics import answer_relevancy + + self.metrics = [answer_relevancy] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." + ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + answer_relevancy_score = scores["answer_relevancy"] + self.success = answer_relevancy_score >= self.minimum_score + self.score = answer_relevancy_score + return answer_relevancy_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Answer Relevancy Ragas Score" + + +class FaithfulnessRagasMetric(Metric): + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics import faithfulness + + self.metrics = [faithfulness] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." 
+ ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + faithfulness_score = scores["faithfulness"] + self.success = faithfulness_score >= self.minimum_score + self.score = faithfulness_score + return faithfulness_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Faithfulness Ragas Score" + + +class ContextRecallRagasMetric(Metric): + """This metric checks the context recall using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics import context_recall + + self.metrics = [context_recall] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." + ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + context_recall_score = scores["context_recall"] + self.success = context_recall_score >= self.minimum_score + self.score = context_recall_score + return context_recall_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Context Recall Ragas Score" + + +class HarmfulnessRagasMetric(Metric): + """This metric checks the harmfulness using Ragas""" + + def __init__( + self, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + try: + from ragas.metrics.critique import harmfulness + + self.metrics = [harmfulness] + except ModuleNotFoundError as e: + print( + "Please install ragas to use this metric. `pip install ragas`." + ) + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." 
+ ) + + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + data = { + "ground_truths": [[test_case.expected_output]], + "contexts": [[test_case.context]], + "question": [test_case.query], + "answer": [test_case.output], + "id": [[test_case.id]], + } + dataset = Dataset.from_dict(data) + scores = evaluate(dataset, metrics=self.metrics) + harmfulness_score = scores["harmfulness"] + self.success = harmfulness_score >= self.minimum_score + self.score = harmfulness_score + return harmfulness_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Harmfulness Ragas Score" + + +class RagasMetric(Metric): + """This metric checks if the output is more than 3 letters""" + + def __init__( + self, + metrics: List[Metric] = None, + minimum_score: float = 0.3, + ): + self.minimum_score = minimum_score + if metrics is None: + self.metrics = [ + HarmfulnessRagasMetric, + ContextRecallRagasMetric, + FaithfulnessRagasMetric, + AnswerRelevancyRagasMetric, + ContextualRelevancyRagasMetric, + ] + else: + self.metrics = metrics + + def measure(self, test_case: LLMTestCase): + # sends to server + try: + from ragas import evaluate + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install ragas to use this metric. `pip install ragas`." + ) + + try: + # How do i make sure this isn't just huggingface dataset + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + # Convert the LLMTestCase to a format compatible with Dataset + scores = [] + for metric in self.metrics: + m = metric() + score = m.measure(test_case) + scores.append(score) + + # ragas score is harmonic mean of all the scores + if len(scores) > 0: + ragas_score = len(scores) / sum( + 1.0 / score for score in scores if score != 0 + ) + else: + ragas_score = 0 + + # Ragas only does dataset-level comparisons + # >>> print(result["ragas_score"]) + # {'ragas_score': 0.860, 'context_relevancy': 0.817, 'faithfulness': 0.892, + # 'answer_relevancy': 0.874} + self.success = ragas_score >= self.minimum_score + self.score = ragas_score + return ragas_score + + def is_successful(self): + return self.success + + @property + def __name__(self): + return "Ragas Score" + + +def assert_ragas( + test_case: LLMTestCase, + metrics: List[str] = None, + minimum_score: float = 0.3, +): + """Asserts if the Ragas score is above the minimum score""" + metric = RagasMetric(metrics, minimum_score) + score = metric.measure(test_case) + assert ( + score >= metric.minimum_score + ), f"Ragas score {score} is below the minimum score {metric.minimum_score}" diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 56edac02a..a4f31860b 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -19,18 +19,18 @@ def pytest_sessionstart(session): @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_sessionfinish(session, exitstatus): # Code before yield will run before the test teardown - api: Api = Api() # yield control back to pytest for the actual teardown yield # Code after yield will run after the test teardown - if os.getenv(PYTEST_RUN_ENV_VAR): + if os.getenv(PYTEST_RUN_ENV_VAR) and os.path.exists(".deepeval"): + api: Api = Api() test_run = TestRun.load(test_filename) result = api.post_test_run(test_run) run_id = result["id"] print( "✅ Tests finished! 
View results on " - + f"https://app.confident-ai.com/unit-tests/{run_id}/test-cases" + + f"https://app.confident-ai.com/unit-tests/{run_id}" ) os.remove(test_filename) diff --git a/tests/test_ragas.py b/tests/test_ragas.py new file mode 100644 index 000000000..f00a36306 --- /dev/null +++ b/tests/test_ragas.py @@ -0,0 +1,25 @@ +import pytest +from deepeval.test_case import LLMTestCase +from deepeval.metrics.ragas_metric import RagasMetric +from deepeval.run_test import assert_test + + +query = "Who won the FIFA World Cup in 2018?" +output = "Winners of the FIFA world cup were the French national football team" +expected_output = "French national football team" +context = "The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship." + + +@pytest.mark.skip(reason="openai is expensive") +def test_overall_score(): + test_case = LLMTestCase( + query=query, + output=output, + expected_output=expected_output, + context=context, + ) + metric = RagasMetric() + assert_test( + test_cases=[test_case], + metrics=[metric], + )