Feature/rag descriptors (#1478)
* draft new descriptors
* format
emeli-dral authored Feb 12, 2025
1 parent d9360db commit c991385
Showing 6 changed files with 376 additions and 481 deletions.
614 changes: 133 additions & 481 deletions examples/cookbook/descriptors.ipynb

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/evidently/descriptors/__init__.py
@@ -12,8 +12,11 @@
from .json_match_descriptor import JSONMatch
from .json_schema_match_descriptor import JSONSchemaMatch
from .llm_judges import BiasLLMEval
from .llm_judges import CompletenessLLMEval
from .llm_judges import ContextQualityLLMEval
from .llm_judges import CorrectnessLLMEval
from .llm_judges import DeclineLLMEval
from .llm_judges import FaithfulnessLLMEval
from .llm_judges import LLMEval
from .llm_judges import NegativityLLMEval
from .llm_judges import PIILLMEval
@@ -52,6 +55,9 @@
"ContextQualityLLMEval",
"BiasLLMEval",
"ToxicityLLMEval",
"CorrectnessLLMEval",
"FaithfulnessLLMEval",
"CompletenessLLMEval",
"OpenAIPrompting",
"NonLetterCharacterPercentage",
"OOV",
13 changes: 13 additions & 0 deletions src/evidently/descriptors/_registry.py
@@ -49,6 +49,19 @@
register_type_alias(
FeatureDescriptor, "evidently.descriptors.llm_judges.ToxicityLLMEval", "evidently:descriptor:ToxicityLLMEval"
)
register_type_alias(
FeatureDescriptor, "evidently.descriptors.llm_judges.CorrectnessLLMEval", "evidently:descriptor:CorrectnessLLMEval"
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.llm_judges.FaithfulnessLLMEval",
"evidently:descriptor:FaithfulnessLLMEval",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.llm_judges.CompletenessLLMEval",
"evidently:descriptor:CompletenessLLMEval",
)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.non_letter_character_percentage_descriptor.NonLetterCharacterPercentage",
128 changes: 128 additions & 0 deletions src/evidently/descriptors/llm_judges.py
@@ -227,3 +227,131 @@ class Config:
)
provider = "openai"
model = "gpt-4o-mini"


class CorrectnessLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:CorrectnessLLMEval"

name: ClassVar = "Correctness"
target_output: str
provider = "openai"
model = "gpt-4o-mini"
template: ClassVar = BinaryClassificationPromptTemplate(
criteria="""An OUTPUT is correct if:
- It conveys the same facts and details as the REFERENCE, even if worded differently.
- It preserves the original meaning without introducing inaccuracies or omissions.
An OUTPUT is incorrect if:
- It contradicts the REFERENCE.
- It introduces additional claims that are not present in the REFERENCE.
- It omits or alters key details in a way that changes the original meaning.
Here is the REFERENCE:
-----reference_starts-----
{target_output}
-----reference_finishes-----""",
target_category="INCORRECT",
non_target_category="CORRECT",
uncertainty="unknown",
include_reasoning=True,
pre_messages=[
(
"system",
"""You are an impartial expert evaluator.
You will be given an OUTPUT and REFERENCE.
Your job is to evaluate correctness of the OUTPUT.""",
)
],
)

def get_input_columns(self, column_name: str) -> Dict[str, str]:
input_columns = super().get_input_columns(column_name)
input_columns.update({self.target_output: "target_output"})
return input_columns


class FaithfulnessLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:FaithfulnessLLMEval"

name: ClassVar = "Faithfulness"
context: str
provider = "openai"
model = "gpt-4o-mini"
template: ClassVar = BinaryClassificationPromptTemplate(
criteria="""An unfaithful RESPONSE is any RESPONSE that:
- Contradicts the information provided in the SOURCE.
- Adds new information that is not present in the SOURCE.
- Provides a RESPONSE that is not grounded in the SOURCE, unless it is a refusal to answer or a clarifying question.
A faithful RESPONSE is a RESPONSE that:
- Accurately uses information from the SOURCE, even if only partially.
- Declines to answer when the SOURCE does not provide enough information.
- Asks a clarifying question when needed instead of making unsupported assumptions.
Here is a SOURCE:
-----source_starts-----
{context}
-----source_finishes-----""",
target_category="UNFAITHFUL",
non_target_category="FAITHFUL",
uncertainty="unknown",
include_reasoning=True,
pre_messages=[
(
"system",
"""You are an impartial expert evaluator.
You will be given a RESPONSE and a SOURCE.
Your job is to evaluate the faithfulness of the RESPONSE by comparing it to the trusted SOURCE.""",
)
],
)

def get_input_columns(self, column_name: str) -> Dict[str, str]:
input_columns = super().get_input_columns(column_name)
input_columns.update({self.context: "context"})
return input_columns


class CompletenessLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:CompletenessLLMEval"

name: ClassVar = "Completeness"
context: str
provider = "openai"
model = "gpt-4o-mini"
template: ClassVar = BinaryClassificationPromptTemplate(
criteria="""An OUTPUT is complete if:
- It includes all relevant facts and details from the SOURCE.
- It does not omit key information necessary for a full understanding of the response.
- It preserves the structure and intent of the SOURCE while ensuring all critical elements are covered.
An OUTPUT is incomplete if:
- It is missing key facts or details present in the SOURCE.
- It omits context that is necessary for a full and accurate response.
- It shortens or summarizes the SOURCE in a way that leads to loss of essential information.
Here is the SOURCE:
-----source_starts-----
{context}
-----source_finishes-----""",
target_category="IMCOMPLETE",
non_target_category="COMPLETE",
uncertainty="unknown",
include_reasoning=True,
pre_messages=[
(
"system",
"""You are an impartial expert evaluator.
You will be given an OUTPUT and a SOURCE.
Your job is to evaluate the completeness of the OUTPUT against the SOURCE.""",
)
],
)

def get_input_columns(self, column_name: str) -> Dict[str, str]:
input_columns = super().get_input_columns(column_name)
input_columns.update({self.context: "context"})
return input_columns
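
The three new judges follow the existing BinaryClassificationLLMEval pattern: each declares one extra input column (target_output for Correctness, context for Faithfulness and Completeness), defaults to the openai provider with gpt-4o-mini, and returns a category plus optional reasoning per row. A minimal usage sketch against the legacy Report / TextEvals workflow; the DataFrame and column names are illustrative, and an OPENAI_API_KEY is assumed to be set in the environment:

import pandas as pd

from evidently.report import Report
from evidently.metric_preset import TextEvals
from evidently.descriptors import CompletenessLLMEval
from evidently.descriptors import CorrectnessLLMEval
from evidently.descriptors import FaithfulnessLLMEval

# Placeholder RAG evaluation data: a question, its retrieved context,
# the generated response, and a ground-truth reference answer.
eval_data = pd.DataFrame({
    "question": ["What is the refund window?"],
    "context": ["Refunds are accepted within 30 days of purchase."],
    "response": ["You can request a refund within 30 days of buying the item."],
    "reference_answer": ["Refunds are available for 30 days after purchase."],
})

report = Report(metrics=[
    TextEvals(column_name="response", descriptors=[
        # Compares the response to the ground-truth column (REFERENCE in the prompt).
        CorrectnessLLMEval(target_output="reference_answer"),
        # Checks that the response is grounded in the retrieved context (SOURCE).
        FaithfulnessLLMEval(context="context"),
        # Checks that the response covers the key facts from the context.
        CompletenessLLMEval(context="context"),
    ]),
])
report.run(reference_data=None, current_data=eval_data)
report.save_html("rag_descriptors_report.html")
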
6 changes: 6 additions & 0 deletions src/evidently/future/descriptors/__init__.py
@@ -6,14 +6,17 @@
from .generated_descriptors import BERTScore
from .generated_descriptors import BiasLLMEval
from .generated_descriptors import BinaryClassificationLLMEval
from .generated_descriptors import CompletenessLLMEval
from .generated_descriptors import Contains
from .generated_descriptors import ContainsLink
from .generated_descriptors import ContextQualityLLMEval
from .generated_descriptors import CorrectnessLLMEval
from .generated_descriptors import DeclineLLMEval
from .generated_descriptors import DoesNotContain
from .generated_descriptors import EndsWith
from .generated_descriptors import ExactMatch
from .generated_descriptors import ExcludesWords
from .generated_descriptors import FaithfulnessLLMEval
from .generated_descriptors import HuggingFace
from .generated_descriptors import HuggingFaceToxicity
from .generated_descriptors import IncludesWords
@@ -85,5 +88,8 @@
"NegativityLLMEval",
"PIILLMEval",
"ToxicityLLMEval",
"CompletenessLLMEval",
"FaithfulnessLLMEval",
"CorrectnessLLMEval",
"ContextRelevance",
]
90 changes: 90 additions & 0 deletions src/evidently/future/descriptors/generated_descriptors.py
@@ -609,3 +609,93 @@ def __init__(
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)


class CorrectnessLLMEval(FeatureDescriptor):
def __init__(
self,
column_name: str,
target_output: str,
provider: str = "openai",
model: str = "gpt-4o-mini",
additional_columns: Optional[Dict[str, str]] = None,
include_category: Optional[bool] = None,
include_score: Optional[bool] = None,
include_reasoning: Optional[bool] = None,
uncertainty: Optional[Uncertainty] = None,
alias: Optional[str] = None,
):
from evidently.descriptors.llm_judges import CorrectnessLLMEval as CorrectnessLLMEvalV1

feature = CorrectnessLLMEvalV1(
target_output=target_output,
provider=provider,
model=model,
additional_columns=additional_columns,
include_category=include_category,
include_score=include_score,
include_reasoning=include_reasoning,
uncertainty=uncertainty,
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)


class FaithfulnessLLMEval(FeatureDescriptor):
def __init__(
self,
column_name: str,
context: str,
provider: str = "openai",
model: str = "gpt-4o-mini",
additional_columns: Optional[Dict[str, str]] = None,
include_category: Optional[bool] = None,
include_score: Optional[bool] = None,
include_reasoning: Optional[bool] = None,
uncertainty: Optional[Uncertainty] = None,
alias: Optional[str] = None,
):
from evidently.descriptors.llm_judges import FaithfulnessLLMEval as FaithfulnessLLMEvalV1

feature = FaithfulnessLLMEvalV1(
context=context,
provider=provider,
model=model,
additional_columns=additional_columns,
include_category=include_category,
include_score=include_score,
include_reasoning=include_reasoning,
uncertainty=uncertainty,
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)


class CompletenessLLMEval(FeatureDescriptor):
def __init__(
self,
column_name: str,
context: str,
provider: str = "openai",
model: str = "gpt-4o-mini",
additional_columns: Optional[Dict[str, str]] = None,
include_category: Optional[bool] = None,
include_score: Optional[bool] = None,
include_reasoning: Optional[bool] = None,
uncertainty: Optional[Uncertainty] = None,
alias: Optional[str] = None,
):
from evidently.descriptors.llm_judges import CompletenessLLMEval as CompletenessLLMEvalV1

feature = CompletenessLLMEvalV1(
context=context,
provider=provider,
model=model,
additional_columns=additional_columns,
include_category=include_category,
include_score=include_score,
include_reasoning=include_reasoning,
uncertainty=uncertainty,
display_name=alias,
).feature(column_name)
super().__init__(feature, alias=alias)
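
The same judges are re-exported for the evidently.future API as thin wrappers that build the v1 descriptor and call .feature(column_name). A hedged sketch of how they would attach to a Dataset, assuming the future Dataset / DataDefinition interface and the same placeholder columns as above:

import pandas as pd

from evidently.future.datasets import DataDefinition
from evidently.future.datasets import Dataset
from evidently.future.descriptors import CompletenessLLMEval
from evidently.future.descriptors import CorrectnessLLMEval
from evidently.future.descriptors import FaithfulnessLLMEval

df = pd.DataFrame({
    "context": ["Refunds are accepted within 30 days of purchase."],
    "response": ["You can request a refund within 30 days of buying the item."],
    "reference_answer": ["Refunds are available for 30 days after purchase."],
})

# column_name is the evaluated text column; target_output / context point at the
# extra columns, mirroring the v1 descriptors wrapped above. alias renames the output.
dataset = Dataset.from_pandas(
    df,
    data_definition=DataDefinition(text_columns=["context", "response", "reference_answer"]),
    descriptors=[
        CorrectnessLLMEval("response", target_output="reference_answer", alias="correctness"),
        FaithfulnessLLMEval("response", context="context", alias="faithfulness"),
        CompletenessLLMEval("response", context="context", alias="completeness"),
    ],
)
dataset.as_dataframe()  # descriptor columns appear alongside the original data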
