From 0b592c37964ff56e1414362b733b11653fdab024 Mon Sep 17 00:00:00 2001
From: lievan
Date: Fri, 20 Sep 2024 16:14:04 -0400
Subject: [PATCH] add real faithfulness

---
 .../llmobs/_evaluators/ragas/faithfulness.py | 217 +++++++++++++++++-
 ddtrace/llmobs/_evaluators/ragas/utils.py    |  96 ++++++++
 ddtrace/llmobs/_evaluators/runner.py         |   4 +-
 3 files changed, 312 insertions(+), 5 deletions(-)
 create mode 100644 ddtrace/llmobs/_evaluators/ragas/utils.py

diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py
index 3be0700c1c..58c1009651 100644
--- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py
+++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py
@@ -1,21 +1,232 @@
+import json
 import math
 import time
+import typing
+from typing import Optional
+
+from langchain_core.pydantic_v1 import ValidationError
+import numpy as np
+from ragas.llms import llm_factory
+from ragas.llms.output_parser import RagasoutputParser
+from ragas.llms.output_parser import get_json_format_instructions
+from ragas.metrics import faithfulness
+from ragas.metrics.base import ensembler
+from ragas.metrics.base import get_segmenter
 
 from ddtrace import config
+from ddtrace.internal.logger import get_logger
+
+from .utils import FaithfulnessInputs
+from .utils import StatementFaithfulnessAnswers
+from .utils import StatementsAnswers
+from .utils import context_parser
+from .utils import extract_inputs_from_messages_prompt
+
+
+logger = get_logger(__name__)
+
+# populate default values for faithfulness class
+faithfulness.llm = llm_factory()
+
+statement_prompt = faithfulness.statement_prompt
+
+statements_output_instructions = get_json_format_instructions(StatementsAnswers)
+statements_output_parser = RagasoutputParser(pydantic_object=StatementsAnswers)
+
+faithfulness_output_instructions = get_json_format_instructions(StatementFaithfulnessAnswers)
+faithfulness_output_parser = RagasoutputParser(pydantic_object=StatementFaithfulnessAnswers)
+
+sentence_segmenter = get_segmenter(language=faithfulness.nli_statements_message.language, clean=False)
+
+
+def create_statements_prompt(answer, question, llmobs_service):
+    with llmobs_service.task("ragas.create_statements_prompt") as task:
+        task.service = "ragas"
+        sentences = sentence_segmenter.segment(answer)
+        sentences = [sentence for sentence in sentences if sentence.strip().endswith(".")]
+        sentences = "\n".join([f"{i}:{x}" for i, x in enumerate(sentences)])
+        return statement_prompt.format(question=question, answer=answer, sentences=sentences)
+
+
+def create_nli_prompt(statements, context_str, llmobs_service):
+    with llmobs_service.task("ragas.create_nli_prompt") as task:
+        task.service = "ragas"
+        statements_str: str = json.dumps(statements)
+        prompt_value = faithfulness.nli_statements_message.format(context=context_str, statements=statements_str)
+        return prompt_value
+
+
+def compute_score(answers, llmobs_service):
+    with llmobs_service.task("ragas.compute_score") as task:
+        task.service = "ragas"
+        faithful_statements = sum(1 if answer.verdict else 0 for answer in answers.__root__)
+        num_statements = len(answers.__root__)
+        if num_statements:
+            score = faithful_statements / num_statements
+        else:
+            score = np.nan
+        llmobs_service.annotate(
+            metadata={
+                "faithful_statements": faithful_statements,
+                "num_statements": num_statements,
+            },
+            output_data=score,
+        )
+        return score
+
+
+def extract_question_and_context_using_llm(messages, llmobs_service):
+    with llmobs_service.workflow("ragas.extract_question_and_context_using_llm"):
llmobs_service.workflow("ragas.extract_question_and_context_using_llm"): + llmobs_service.annotate(input_data=messages) + extracted_inputs = faithfulness.llm.generate_text( + prompt=extract_inputs_from_messages_prompt.format(messages=messages) + ) + statements = context_parser.parse(extracted_inputs.generations[0][0].text) + llmobs_service.annotate( + input_data=messages, output_data={"question": statements.question, "context": statements.context} + ) + llmobs_service.annotate(output_data={"question": statements.question, "context": statements.context}) + return statements.question, statements.context + + +def extract_faithfulness_inputs(span: dict, llmobs_service) -> typing.Optional[FaithfulnessInputs]: + with llmobs_service.workflow("ragas.extract_faithfulness_inputs"): + llmobs_service.annotate(input_data=span) + question, answer, context_str = None, None, None + + meta_io = span.get("meta") + if meta_io is None: + return None + + meta_input = meta_io.get("input") + meta_output = meta_io.get("output") + + if meta_input or meta_output is None: + return None + + messages = meta_output.get("messages") + if messages is not None and len(messages) > 0: + answer = messages[-1].get("content") + + prompt = meta_input.get("prompt") + question = None + context = None + if prompt is not None and prompt.get("variables") is not None: + variables = prompt.get("variables") + question = variables.get("question") + context = variables.get("context") + + if question is None or context is None: + question, context_str = extract_question_and_context_using_llm(span, llmobs_service) + try: + llmobs_service.annotate(output_data={"question": question, "context": context, "answer": answer}) + return FaithfulnessInputs(question=question, context=context_str, answer=answer) + except ValidationError as e: + logger.debug("Failed to validate faithfulness inputs", e) + return None + + +def score_faithfulness(span, llmobs_service): + llmobs_metadata = {} + token_usage = {"input_tokens": 0, "output_tokens": 0} + score = np.nan + with llmobs_service.workflow("ragas.faithfulness") as workflow: + try: + workflow.service = "ragas" + + faithfulness_inputs = extract_faithfulness_inputs(span, llmobs_service) + if faithfulness_inputs is None: + return np.nan, None, llmobs_service.export_span() + + question, answer, context_str = ( + faithfulness_inputs.question, + faithfulness_inputs.answer, + faithfulness_inputs.context, + ) + + statements_prompt = create_statements_prompt(question, answer, llmobs_service=llmobs_service) + + statements = faithfulness.llm.generate_text(statements_prompt) + + usage = statements.llm_output.get("token_usage") + if usage: + token_usage["input_tokens"] += usage.get("prompt_tokens") if usage.get("prompt_tokens") else 0 + token_usage["output_tokens"] += usage.get("completion_tokens") if usage.get("completion_tokens") else 0 + + statements = statements_output_parser.parse(statements.generations[0][0].text) + + if statements is None: + return np.nan + statements = [item["simpler_statements"] for item in statements.dicts()] + statements = [item for sublist in statements for item in sublist] + + llmobs_metadata["statements"] = statements + + assert isinstance(statements, typing.List), "statements must be a list" + + p_value = create_nli_prompt(statements, context_str, llmobs_service=llmobs_service) + + nli_result = faithfulness.llm.generate_text(p_value) + + usage = nli_result.llm_output.get("token_usage") + if usage: + token_usage["input_tokens"] += usage.get("prompt_tokens") if 
usage.get("completion_tokens") else 0 + token_usage["output_tokens"] += usage.get("prompt_tokens") if usage.get("completion_tokens") else 0 + + nli_result_text = [nli_result.generations[0][i].text for i in range(faithfulness._reproducibility)] + faithfulness_list = [faithfulness_output_parser.parse(text) for text in nli_result_text] + + faithfulness_list = [faith.dicts() for faith in faithfulness_list if faith is not None] + + llmobs_metadata["faithfulness_list"] = faithfulness_list + + if faithfulness_list: + faithfulness_list = ensembler.from_discrete( + faithfulness_list, + "verdict", + ) + + faithfulness_list = StatementFaithfulnessAnswers.parse_obj(faithfulness_list) + else: + return np.nan, None, llmobs_service.export_span() + score = compute_score(faithfulness_list, llmobs_service=llmobs_service) + return score, faithfulness_list.json(), llmobs_service.export_span() + finally: + llmobs_metadata.update(token_usage) + llmobs_service.annotate( + input_data={ + "answer": answer, + "question": question, + "context_str": context_str, + }, + output_data=score, + metadata=llmobs_metadata, + ) class RagasFaithfulnessEvaluator: label = "ragas_faithfulness" metric_type = "score" + llmobs_service = None + + def __init__(self, llmobs_service): + RagasFaithfulnessEvaluator.llmobs_service = llmobs_service @classmethod - def evaluate(cls, span): + def evaluate(cls, span) -> Optional[dict]: + if cls.llmobs_service is None: + return None + + score, faithfulness_list, exported_span = score_faithfulness(span, cls.llmobs_service) + if math.isnan(score): + return None return { - "span_id": span.get("span_id"), - "trace_id": span.get("trace_id"), + "span_id": exported_span.get("span_id"), + "trace_id": exported_span.get("trace_id"), "score_value": 1, "ml_app": config._llmobs_ml_app, "timestamp_ms": math.floor(time.time() * 1000), "metric_type": cls.metric_type, "label": cls.label, + "metadata": {"ragas.faithfulness_list": faithfulness_list}, } diff --git a/ddtrace/llmobs/_evaluators/ragas/utils.py b/ddtrace/llmobs/_evaluators/ragas/utils.py new file mode 100644 index 0000000000..11fad1d5b8 --- /dev/null +++ b/ddtrace/llmobs/_evaluators/ragas/utils.py @@ -0,0 +1,96 @@ +import typing as t + +from langchain_core.pydantic_v1 import BaseModel +from langchain_core.pydantic_v1 import Field +from ragas.llms.output_parser import RagasoutputParser +from ragas.llms.prompt import Prompt + + +class FaithfulnessInputs(BaseModel): + question: str = Field(..., description="the question to be answered") + context: str = Field(..., description="the context to be used to answer the question") + answer: str = Field(..., description="the answer to the question") + + +class StatementFaithfulnessAnswer(BaseModel): + statement: str = Field(..., description="the original statement, word-by-word") + reason: str = Field(..., description="the reason of the verdict") + verdict: int = Field(..., description="the verdict(0/1) of the faithfulness.") + + +class StatementFaithfulnessAnswers(BaseModel): + __root__: t.List[StatementFaithfulnessAnswer] + + def dicts(self) -> t.List[t.Dict]: + return self.dict()["__root__"] + + +class Statements(BaseModel): + sentence_index: int = Field(..., description="Index of the sentence from the statement list") + simpler_statements: t.List[str] = Field(..., description="the simpler statements") + + +class StatementsAnswers(BaseModel): + __root__: t.List[Statements] + + def dicts(self) -> t.List[t.Dict]: + return self.dict()["__root__"] + + +class ExtractedContext(BaseModel): + context: str = Field(..., 
description="the extracted context") + question: str = Field(..., description="the extracted question") + + +context_parser = RagasoutputParser(pydantic_object=ExtractedContext) + + +extract_inputs_from_messages_prompt = Prompt( + name="extract_context", + instruction="""You will be given a prompt to a large language model. + The prompt will contain a question and the reference information + that should be used to reference that question. + Your task is to extract out the reference information. + Do not include any text that is not in the original input.""", + examples=[ + { + "messages": [ + { + "role": "user", + "content": """ +Given the following question and reference context, answer the user's question +question: What are the effects of carbonated water on teeth? +context_str: Carbonated water has negative, destructive effects on teeth, and result in +decreasing microhardness and removal of the adhesive material on etched or sealed enamel. +Erosion occurred when the etched enamel of teeth was exposed to carbonated water, +particularly in groups exposed to high-level carbonated water. +Alleviation of this destructive effect is observed in groups exposed to carbonated water with calcium ion. +Partial removal of the adhesive material on sealed enamel could be observed after exposure to carbonated water. + """, + }, + ], + "output": { + "context": """ +Carbonated water has negative, destructive effects on teeth, and result in +decreasing microhardness and removal of the adhesive material on etched or sealed enamel. +Erosion occurred when the etched enamel of teeth was exposed to carbonated water, +particularly in groups exposed to high-level carbonated water. +Alleviation of this destructive effect is observed in groups exposed to carbonated water with calcium ion. +Partial removal of the adhesive material on sealed enamel could be observed after exposure to carbonated water. + """, + "question": "What are the effects of carbonated water on teeth?", + }, + }, + ], + input_keys=["messages"], + output_key="output", + output_format_instruction=""" + The output should be a json with the question and the context extracted from the messages. + For example: + { + "context": "Extracted context", + "question": "Extracted question" + } + """, + output_type="json", +) diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index 91d07568c9..a47bc3bd03 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -20,7 +20,7 @@ class EvaluatorRunner(PeriodicService): """Base class for evaluating LLM Observability span events""" - def __init__(self, interval: float, _evaluation_metric_writer=None): + def __init__(self, interval: float, _evaluation_metric_writer=None, _llmobs_service=None): super(EvaluatorRunner, self).__init__(interval=interval) self._lock = forksafe.RLock() self._buffer = [] # type: list[Dict] @@ -36,7 +36,7 @@ def __init__(self, interval: float, _evaluation_metric_writer=None): evaluators = evaluator_str.split(",") for evaluator in evaluators: if evaluator in SUPPORTED_EVALUATORS: - self.evaluators.append(SUPPORTED_EVALUATORS[evaluator]) + self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=_llmobs_service)) def start(self, *args, **kwargs): super(EvaluatorRunner, self).start()