Feature/rag descriptors (#1478)
* draft new descriptors
* format
emeli-dral authored Feb 12, 2025
1 parent d9360db commit c991385
Showing 6 changed files with 376 additions and 481 deletions.
614 changes: 133 additions & 481 deletions examples/cookbook/descriptors.ipynb

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/evidently/descriptors/__init__.py
@@ -12,8 +12,11 @@
from .json_match_descriptor import JSONMatch
from .json_schema_match_descriptor import JSONSchemaMatch
from .llm_judges import BiasLLMEval
from .llm_judges import CompletenessLLMEval
from .llm_judges import ContextQualityLLMEval
from .llm_judges import CorrectnessLLMEval
from .llm_judges import DeclineLLMEval
from .llm_judges import FaithfulnessLLMEval
from .llm_judges import LLMEval
from .llm_judges import NegativityLLMEval
from .llm_judges import PIILLMEval
@@ -52,6 +55,9 @@
"ContextQualityLLMEval",
"BiasLLMEval",
"ToxicityLLMEval",
"CorrectnessLLMEval",
"FaithfulnessLLMEval",
"CompletenessLLMEval",
"OpenAIPrompting",
"NonLetterCharacterPercentage",
"OOV",
13 changes: 13 additions & 0 deletions src/evidently/descriptors/_registry.py
@@ -49,6 +49,19 @@
register_type_alias(
FeatureDescriptor, "evidently.descriptors.llm_judges.ToxicityLLMEval", "evidently:descriptor:ToxicityLLMEval"
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.llm_judges.CorrectnessLLMEval", "evidently:descriptor:CorrectnessLLMEval"
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.llm_judges.FaithfulnessLLMEval",
"evidently:descriptor:FaithfulnessLLMEval",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.llm_judges.CompletenessLLMEval",
"evidently:descriptor:CompletenessLLMEval",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.non_letter_character_percentage_descriptor.NonLetterCharacterPercentage",
128 changes: 128 additions & 0 deletions src/evidently/descriptors/llm_judges.py
@@ -227,3 +227,131 @@ class Config:
)
provider = "openai"
model = "gpt-4o-mini"


class CorrectnessLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:CorrectnessLLMEval"

name: ClassVar = "Correctness"
target_output: str
provider = "openai"
model = "gpt-4o-mini"
template: ClassVar = BinaryClassificationPromptTemplate(
criteria="""An OUTPUT is correct if:
- It conveys the same facts and details as the REFERENCE, even if worded differently.
- It preserves the original meaning without introducing inaccuracies or omissions.
An OUTPUT is incorrect if:
- It contradicts the REFERENCE.
- It introduces additional claims that are not present in the REFERENCE.
- It omits or alters key details in a way that changes the original meaning.
Here is the REFERENCE:
-----reference_starts-----
{target_output}
-----reference_finishes-----""",
target_category="INCORRECT",
non_target_category="CORRECT",
uncertainty="unknown",
include_reasoning=True,
pre_messages=[
(
"system",
"""You are an impartial expert evaluator.
You will be given an OUTPUT and REFERENCE.
Your job is to evaluate correctness of the OUTPUT.""",
)
],
)

def get_input_columns(self, column_name: str) -> Dict[str, str]:
input_columns = super().get_input_columns(column_name)
input_columns.update({self.target_output: "target_output"})
return input_columns


class FaithfulnessLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:FaithfulnessLLMEval"

name: ClassVar = "Faithfulness"
context: str
provider = "openai"
model = "gpt-4o-mini"
template: ClassVar = BinaryClassificationPromptTemplate(
criteria="""An unfaithful RESPONSE is any RESPONSE that:
- Contradicts the information provided in the SOURCE.
- Adds new information that is not present in the SOURCE.
- Provides a RESPONSE that is not grounded in the SOURCE, unless it is a refusal to answer or a clarifying question.
A faithful RESPONSE is a RESPONSE that:
- Accurately uses information from the SOURCE, even if only partially.
- Declines to answer when the SOURCE does not provide enough information.
- Asks a clarifying question when needed instead of making unsupported assumptions.
Here is a SOURCE:
-----source_starts-----
{context}
-----source_finishes-----""",
target_category="UNFAITHFUL",
non_target_category="FAITHFUL",
uncertainty="unknown",
include_reasoning=True,
pre_messages=[
(
"system",
"""You are an impartial expert evaluator.
You will be given a RESPONSE and a SOURCE.
Your job is to evaluate the faithfulness of the RESPONSE by comparing it to the trusted SOURCE.""",
)
],
)

def get_input_columns(self, column_name: str) -> Dict[str, str]:
input_columns = super().get_input_columns(column_name)
input_columns.update({self.context: "context"})
return input_columns


class CompletenessLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:CompletenessLLMEval"

name: ClassVar = "Completeness"
context: str
provider = "openai"
model = "gpt-4o-mini"
template: ClassVar = BinaryClassificationPromptTemplate(
criteria="""An OUTPUT is complete if:
- It includes all relevant facts and details from the SOURCE.
- It does not omit key information necessary for a full understanding of the response.
- It preserves the structure and intent of the SOURCE while ensuring all critical elements are covered.
An OUTPUT is incomplete if:
- It is missing key facts or details present in the SOURCE.
- It omits context that is necessary for a full and accurate response.
- It shortens or summarizes the SOURCE in a way that leads to loss of essential information.
Here is the SOURCE:
-----source_starts-----
{context}
-----source_finishes-----""",
target_category="IMCOMPLETE",
non_target_category="COMPLETE",
uncertainty="unknown",
include_reasoning=True,
pre_messages=[
(
"system",
"""You are an impartial expert evaluator.
You will be given an OUTPUT and a SOURCE.
Your job is to evaluate the completeness of the OUTPUT against the SOURCE.""",
)
],
)

def get_input_columns(self, column_name: str) -> Dict[str, str]:
input_columns = super().get_input_columns(column_name)
input_columns.update({self.context: "context"})
return input_columns
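
The three new judges follow the existing BinaryClassificationLLMEval pattern: each declares one extra input column (target_output for Correctness, context for Faithfulness and Completeness), defaults to the openai provider with gpt-4o-mini, and returns a category plus optional reasoning per row. A minimal usage sketch against the legacy Report / TextEvals workflow; the DataFrame and column names are illustrative, and an OPENAI_API_KEY is assumed to be set in the environment:

import pandas as pd

from evidently.report import Report
from evidently.metric_preset import TextEvals
from evidently.descriptors import CompletenessLLMEval
from evidently.descriptors import CorrectnessLLMEval
from evidently.descriptors import FaithfulnessLLMEval

# Placeholder RAG evaluation data: a question, its retrieved context,
# the generated response, and a ground-truth reference answer.
eval_data = pd.DataFrame({
    "question": ["What is the refund window?"],
    "context": ["Refunds are accepted within 30 days of purchase."],
    "response": ["You can request a refund within 30 days of buying the item."],
    "reference_answer": ["Refunds are available for 30 days after purchase."],
})

report = Report(metrics=[
    TextEvals(column_name="response", descriptors=[
        # Compares the response to the ground-truth column (REFERENCE in the prompt).
        CorrectnessLLMEval(target_output="reference_answer"),
        # Checks that the response is grounded in the retrieved context (SOURCE).
        FaithfulnessLLMEval(context="context"),
        # Checks that the response covers the key facts from the context.
        CompletenessLLMEval(context="context"),
    ]),
])
report.run(reference_data=None, current_data=eval_data)
report.save_html("rag_descriptors_report.html")
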
6 changes: 6 additions & 0 deletions src/evidently/future/descriptors/__init__.py
@@ -6,14 +6,17 @@
from .generated_descriptors import BERTScore
from .generated_descriptors import BiasLLMEval
from .generated_descriptors import BinaryClassificationLLMEval
from .generated_descriptors import CompletenessLLMEval
from .generated_descriptors import Contains
from .generated_descriptors import ContainsLink
from .generated_descriptors import ContextQualityLLMEval
from .generated_descriptors import CorrectnessLLMEval
from .generated_descriptors import DeclineLLMEval
from .generated_descriptors import DoesNotContain
from .generated_descriptors import EndsWith
from .generated_descriptors import ExactMatch
from .generated_descriptors import ExcludesWords
from .generated_descriptors import FaithfulnessLLMEval
from .generated_descriptors import HuggingFace
from .generated_descriptors import HuggingFaceToxicity
from .generated_descriptors import IncludesWords
@@ -85,5 +88,8 @@
"NegativityLLMEval",
"PIILLMEval",
"ToxicityLLMEval",
"CompletenessLLMEval",
"FaithfulnessLLMEval",
"CorrectnessLLMEval",
"ContextRelevance",
]
90 changes: 90 additions & 0 deletions src/evidently/future/descriptors/generated_descriptors.py
@@ -609,3 +609,93 @@ def __init__(
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)


class CorrectnessLLMEval(FeatureDescriptor):
def __init__(
self,
column_name: str,
target_output: str,
provider: str = "openai",
model: str = "gpt-4o-mini",
additional_columns: Optional[Dict[str, str]] = None,
include_category: Optional[bool] = None,
include_score: Optional[bool] = None,
include_reasoning: Optional[bool] = None,
uncertainty: Optional[Uncertainty] = None,
alias: Optional[str] = None,
):
from evidently.descriptors.llm_judges import CorrectnessLLMEval as CorrectnessLLMEvalV1

feature = CorrectnessLLMEvalV1(
target_output=target_output,
provider=provider,
model=model,
additional_columns=additional_columns,
include_category=include_category,
include_score=include_score,
include_reasoning=include_reasoning,
uncertainty=uncertainty,
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)


class FaithfulnessLLMEval(FeatureDescriptor):
def __init__(
self,
column_name: str,
context: str,
provider: str = "openai",
model: str = "gpt-4o-mini",
additional_columns: Optional[Dict[str, str]] = None,
include_category: Optional[bool] = None,
include_score: Optional[bool] = None,
include_reasoning: Optional[bool] = None,
uncertainty: Optional[Uncertainty] = None,
alias: Optional[str] = None,
):
from evidently.descriptors.llm_judges import FaithfulnessLLMEval as FaithfulnessLLMEvalV1

feature = FaithfulnessLLMEvalV1(
context=context,
provider=provider,
model=model,
additional_columns=additional_columns,
include_category=include_category,
include_score=include_score,
include_reasoning=include_reasoning,
uncertainty=uncertainty,
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)


class CompletenessLLMEval(FeatureDescriptor):
def __init__(
self,
column_name: str,
context: str,
provider: str = "openai",
model: str = "gpt-4o-mini",
additional_columns: Optional[Dict[str, str]] = None,
include_category: Optional[bool] = None,
include_score: Optional[bool] = None,
include_reasoning: Optional[bool] = None,
uncertainty: Optional[Uncertainty] = None,
alias: Optional[str] = None,
):
from evidently.descriptors.llm_judges import CompletenessLLMEval as CompletenessLLMEvalV1

feature = CompletenessLLMEvalV1(
context=context,
provider=provider,
model=model,
additional_columns=additional_columns,
include_category=include_category,
include_score=include_score,
include_reasoning=include_reasoning,
uncertainty=uncertainty,
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)
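
The same judges are re-exported for the evidently.future API as thin wrappers that build the v1 descriptor and call .feature(column_name). A hedged sketch of how they would attach to a Dataset, assuming the future Dataset / DataDefinition interface and the same placeholder columns as above:

import pandas as pd

from evidently.future.datasets import DataDefinition
from evidently.future.datasets import Dataset
from evidently.future.descriptors import CompletenessLLMEval
from evidently.future.descriptors import CorrectnessLLMEval
from evidently.future.descriptors import FaithfulnessLLMEval

df = pd.DataFrame({
    "context": ["Refunds are accepted within 30 days of purchase."],
    "response": ["You can request a refund within 30 days of buying the item."],
    "reference_answer": ["Refunds are available for 30 days after purchase."],
})

# column_name is the evaluated text column; target_output / context point at the
# extra columns, mirroring the v1 descriptors wrapped above. alias renames the output.
dataset = Dataset.from_pandas(
    df,
    data_definition=DataDefinition(text_columns=["context", "response", "reference_answer"]),
    descriptors=[
        CorrectnessLLMEval("response", target_output="reference_answer", alias="correctness"),
        FaithfulnessLLMEval("response", context="context", alias="faithfulness"),
        CompletenessLLMEval("response", context="context", alias="completeness"),
    ],
)
dataset.as_dataframe()  # descriptor columns appear alongside the original data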
