From a3ca1cff99c3b6d349c6f485f1f061650e59306d Mon Sep 17 00:00:00 2001
From: Olmo Maldonado
Date: Thu, 12 Dec 2024 16:39:37 -0800
Subject: [PATCH] feature: configurable python (OpenAI) client (#102)

---
 Makefile                   |   2 +-
 README.md                  |  31 +++++
 py/autoevals/__init__.py   |   1 +
 py/autoevals/llm.py        |  42 +++++--
 py/autoevals/moderation.py |  24 +++-
 py/autoevals/oai.py        | 244 +++++++++++++++++++++++++++----------
 py/autoevals/ragas.py      | 164 ++++++++++++++++---------
 py/autoevals/string.py     |  23 +++-
 py/autoevals/test_llm.py   | 194 +++++++++++++++++++++++++++++
 py/autoevals/version.py    |   2 +-
 setup.py                   |   1 +
 11 files changed, 584 insertions(+), 144 deletions(-)

diff --git a/Makefile b/Makefile
index 57cd501..d1c6f72 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ VENV_PYTHON_PACKAGES := venv/.python_packages
 
 ${VENV_PYTHON_PACKAGES}: ${VENV_INITIALIZED}
 	bash -c 'source venv/bin/activate && python -m pip install --upgrade pip setuptools build twine openai'
-	bash -c 'source venv/bin/activate && python -m pip install -e .[dev]'
+	bash -c 'source venv/bin/activate && python -m pip install -e ".[dev]"'
 	@touch $@
 
 ${VENV_PRE_COMMIT}: ${VENV_PYTHON_PACKAGES}
diff --git a/README.md b/README.md
index 2467c3b..4356a36 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,37 @@ print(f"Factuality score: {result.score}")
 print(f"Factuality metadata: {result.metadata['rationale']}")
 ```
 
+#### Custom Client
+
+If you need to use a custom OpenAI client, initialize the library with your own `LLMClient` implementation.
+
+```python
+import openai
+from autoevals import init
+from autoevals.oai import LLMClient
+
+openai_client = openai.OpenAI(base_url="https://api.openai.com/v1/")
+
+class CustomClient(LLMClient):
+    openai = openai_client  # you can also pass in the openai module and we will instantiate it for you
+    embed = openai_client.embeddings.create
+    moderation = openai_client.moderations.create
+    RateLimitError = openai.RateLimitError
+
+    def complete(self, **kwargs):
+        # make adjustments as needed
+        return self.openai.chat.completions.create(**kwargs)
+
+# Autoevals will now use your custom client
+init(client=CustomClient)
+```
+
+If you only need a custom client for a specific evaluator, you can pass the client to that evaluator directly.
+ +```python +evaluator = Factuality(client=CustomClient) +``` + ### Node.js ```javascript diff --git a/py/autoevals/__init__.py b/py/autoevals/__init__.py index 4d00a2b..af69f5c 100644 --- a/py/autoevals/__init__.py +++ b/py/autoevals/__init__.py @@ -36,6 +36,7 @@ from .llm import * from .moderation import * from .number import * +from .oai import init from .ragas import * from .string import * from .value import ExactMatch diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py index a3f3a27..8f6346d 100644 --- a/py/autoevals/llm.py +++ b/py/autoevals/llm.py @@ -1,7 +1,7 @@ -import abc import json import os import re +from collections import defaultdict from dataclasses import dataclass from typing import Dict, List, Optional @@ -11,7 +11,7 @@ from autoevals.partial import ScorerWithPartial -from .oai import arun_cached_request, run_cached_request +from .oai import LLMClient, arun_cached_request, run_cached_request SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -79,6 +79,7 @@ def __init__( self, api_key=None, base_url=None, + client: Optional[LLMClient] = None, ): self.extra_args = {} if api_key: @@ -86,6 +87,8 @@ def __init__( if base_url: self.extra_args["base_url"] = base_url + self.client = client + class OpenAILLMScorer(OpenAIScorer): def __init__( @@ -93,10 +96,12 @@ def __init__( temperature=None, api_key=None, base_url=None, + client: Optional[LLMClient] = None, ): super().__init__( api_key=api_key, base_url=base_url, + client=client, ) self.extra_args["temperature"] = temperature or 0 @@ -115,8 +120,10 @@ def __init__( engine=None, api_key=None, base_url=None, + client: Optional[LLMClient] = None, ): super().__init__( + client=client, api_key=api_key, base_url=base_url, ) @@ -162,6 +169,7 @@ def _render_messages(self, **kwargs): def _request_args(self, output, expected, **kwargs): ret = { + "client": self.client, **self.extra_args, **self._build_args(output, expected, **kwargs), } @@ -219,7 +227,7 @@ class LLMClassifier(OpenAILLMClassifier): An LLM-based classifier that wraps `OpenAILLMClassifier` and provides a standard way to apply chain of thought, parse the output, and score the result.""" - _SPEC_FILE_CONTENTS: Optional[str] = None + _SPEC_FILE_CONTENTS: Dict[str, str] = defaultdict(str) def __init__( self, @@ -233,6 +241,7 @@ def __init__( engine=None, api_key=None, base_url=None, + client: Optional[LLMClient] = None, **extra_render_args, ): choice_strings = list(choice_scores.keys()) @@ -257,24 +266,33 @@ def __init__( api_key=api_key, base_url=base_url, render_args={"__choices": choice_strings, **extra_render_args}, + client=client, ) @classmethod - def from_spec(cls, name: str, spec: ModelGradedSpec, **kwargs): - return cls(name, spec.prompt, spec.choice_scores, **kwargs) + def from_spec(cls, name: str, spec: ModelGradedSpec, client: Optional[LLMClient] = None, **kwargs): + return cls(name, spec.prompt, spec.choice_scores, client=client, **kwargs) @classmethod - def from_spec_file(cls, name: str, path: str, **kwargs): - if cls._SPEC_FILE_CONTENTS is None: + def from_spec_file(cls, name: str, path: str, client: Optional[LLMClient] = None, **kwargs): + if cls._SPEC_FILE_CONTENTS[name] == "": with open(path) as f: - cls._SPEC_FILE_CONTENTS = f.read() - spec = yaml.safe_load(cls._SPEC_FILE_CONTENTS) - return cls.from_spec(name, ModelGradedSpec(**spec), **kwargs) + cls._SPEC_FILE_CONTENTS[name] = f.read() + spec = yaml.safe_load(cls._SPEC_FILE_CONTENTS[name]) + return cls.from_spec(name, ModelGradedSpec(**spec), client=client, **kwargs) class 
SpecFileClassifier(LLMClassifier): def __new__( - cls, model=None, engine=None, use_cot=None, max_tokens=None, temperature=None, api_key=None, base_url=None + cls, + model=None, + engine=None, + use_cot=None, + max_tokens=None, + temperature=None, + api_key=None, + base_url=None, + client: Optional[LLMClient] = None, ): kwargs = {} if model is not None: @@ -302,7 +320,7 @@ def __new__( extra_render_args = cls._partial_args() if hasattr(cls, "_partial_args") else {} - return LLMClassifier.from_spec_file(cls_name, template_path, **kwargs, **extra_render_args) + return LLMClassifier.from_spec_file(cls_name, template_path, client=client, **kwargs, **extra_render_args) class Battle(SpecFileClassifier): diff --git a/py/autoevals/moderation.py b/py/autoevals/moderation.py index 4164e13..842e6f0 100644 --- a/py/autoevals/moderation.py +++ b/py/autoevals/moderation.py @@ -1,8 +1,10 @@ +from typing import Optional + from braintrust_core.score import Score from autoevals.llm import OpenAIScorer -from .oai import arun_cached_request, run_cached_request +from .oai import LLMClient, arun_cached_request, run_cached_request REQUEST_TYPE = "moderation" @@ -15,7 +17,13 @@ class Moderation(OpenAIScorer): threshold = None extra_args = {} - def __init__(self, threshold=None, api_key=None, base_url=None): + def __init__( + self, + threshold=None, + api_key=None, + base_url=None, + client: Optional[LLMClient] = None, + ): """ Create a new Moderation scorer. @@ -24,11 +32,13 @@ def __init__(self, threshold=None, api_key=None, base_url=None): :param api_key: OpenAI key :param base_url: Base URL to be used to reach OpenAI moderation endpoint. """ - super().__init__(api_key=api_key, base_url=base_url) + super().__init__(api_key=api_key, base_url=base_url, client=client) self.threshold = threshold def _run_eval_sync(self, output, __expected=None): - moderation_response = run_cached_request(REQUEST_TYPE, input=output, **self.extra_args)["results"][0] + moderation_response = run_cached_request( + client=self.client, request_type=REQUEST_TYPE, input=output, **self.extra_args + )["results"][0] return self.__postprocess_response(moderation_response) def __postprocess_response(self, moderation_response) -> Score: @@ -42,7 +52,9 @@ def __postprocess_response(self, moderation_response) -> Score: ) async def _run_eval_async(self, output, expected=None, **kwargs) -> Score: - moderation_response = (await arun_cached_request(REQUEST_TYPE, input=output, **self.extra_args))["results"][0] + moderation_response = ( + await arun_cached_request(client=self.client, request_type=REQUEST_TYPE, input=output, **self.extra_args) + )["results"][0] return self.__postprocess_response(moderation_response) @staticmethod @@ -59,3 +71,5 @@ def compute_score(moderation_result, threshold): __all__ = ["Moderation"] +__all__ = ["Moderation"] +__all__ = ["Moderation"] diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py index 9141945..dbd0bc6 100644 --- a/py/autoevals/oai.py +++ b/py/autoevals/oai.py @@ -3,95 +3,211 @@ import sys import textwrap import time +from contextvars import ContextVar from dataclasses import dataclass -from pathlib import Path -from typing import Any +from typing import Any, Optional PROXY_URL = "https://api.braintrust.dev/v1/proxy" @dataclass -class OpenAIWrapper: +class LLMClient: + """A client wrapper for LLM operations that supports both OpenAI SDK v0 and v1. + + This class provides a consistent interface for common LLM operations regardless of the + underlying OpenAI SDK version. 
It's designed to be extensible for custom implementations. + + Attributes: + openai: The OpenAI module or client instance (either v0 or v1 SDK). + complete: Completion function that creates chat completions. + - For v0: openai.ChatCompletion.create or acreate + - For v1: openai.chat.completions.create + embed: Embedding function that creates embeddings. + - For v0: openai.Embedding.create or acreate + - For v1: openai.embeddings.create + moderation: Moderation function that creates content moderations. + - For v0: openai.Moderations.create or acreate + - For v1: openai.moderations.create + RateLimitError: The rate limit exception class for the SDK version. + - For v0: openai.error.RateLimitError + - For v1: openai.RateLimitError + + Note: + If using async OpenAI methods you must use the async methods in Autoevals. + + Example: + ```python + # Using with OpenAI v1 + import openai + client = LLMClient( + openai=openai, + complete=openai.chat.completions.create, + embed=openai.embeddings.create, + moderation=openai.moderations.create, + RateLimitError=openai.RateLimitError + ) + + # Extending for custom implementation + @dataclass + class CustomLLMClient(LLMClient): + def complete(self, **kwargs): + # make adjustments as needed + return openai.chat.completions.create(**kwargs) + ``` + + Note: + This class is typically instantiated via the `prepare_openai()` function, which handles + the SDK version detection and proper function assignment automatically. + """ + + openai: Any complete: Any embed: Any moderation: Any RateLimitError: Exception -def prepare_openai(is_async=False, api_key=None, base_url=None): - if api_key is None: - api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BRAINTRUST_API_KEY") - if base_url is None: - base_url = os.environ.get("OPENAI_BASE_URL", PROXY_URL) +_client_var = ContextVar[Optional[LLMClient]]("client") - try: - import openai - except Exception as e: - print( - textwrap.dedent( - f"""\ - Unable to import openai: {e} - - Please install it, e.g. with - - pip install 'openai' - """ - ), - file=sys.stderr, - ) - raise + +def init(*, client: Optional[LLMClient] = None): + """Initialize Autoevals with an optional custom LLM client. + + This function sets up the global client context for Autoevals to use. If no client is provided, + the default OpenAI client will be used. + + Args: + client (Optional[LLMClient]): A custom LLM client instance that implements the LLMClient interface. + If None, the default OpenAI client will be used.\ + """ + _client_var.set(client) + + +def prepare_openai(client: Optional[LLMClient] = None, is_async=False, api_key=None, base_url=None): + """Prepares and configures an OpenAI client for use with AutoEval, if client is not provided. + + This function handles both v0 and v1 of the OpenAI SDK, configuring the client + with the appropriate authentication and base URL settings. + + We will also attempt to enable Braintrust tracing export, if you've configured tracing. + + Args: + client (Optional[LLMClient], optional): Existing LLMClient instance. + If provided, this client will be used instead of creating a new one. + + is_async (bool, optional): Whether to create a client with async operations. Defaults to False. + Deprecated: Use the `client` argument and set the `openai` with the async/sync that you'd like to use. + + api_key (str, optional): OpenAI API key. If not provided, will look for + OPENAI_API_KEY or BRAINTRUST_API_KEY in environment variables. + + Deprecated: Use the `client` argument and set the `openai`. 
+ + base_url (str, optional): Base URL for API requests. If not provided, will + use OPENAI_BASE_URL from environment or fall back to PROXY_URL. + + Deprecated: Use the `client` argument and set the `openai`. + + Returns: + Tuple[LLMClient, bool]: A tuple containing: + - The configured LLMClient instance, or the client you've provided + - A boolean indicating whether the client was wrapped with Braintrust tracing + + Raises: + ImportError: If the OpenAI package is not installed + """ + client = client or _client_var.get(None) + + openai = getattr(client, "openai", None) + if not openai: + try: + import openai + except Exception as e: + print( + textwrap.dedent( + f"""\ + Unable to import openai: {e} + + Please install it, e.g. with + + pip install 'openai' + """ + ), + file=sys.stderr, + ) + raise openai_obj = openai + is_v1 = False if hasattr(openai, "OpenAI"): # This is the new v1 API is_v1 = True - if is_async: - openai_obj = openai.AsyncOpenAI(api_key=api_key, base_url=base_url) + + if client is None: + # prepare the default openai sdk, if not provided + if api_key is None: + api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("BRAINTRUST_API_KEY") + if base_url is None: + base_url = os.environ.get("OPENAI_BASE_URL", PROXY_URL) + + if is_v1: + if is_async: + openai_obj = openai.AsyncOpenAI(api_key=api_key, base_url=base_url) + else: + openai_obj = openai.OpenAI(api_key=api_key, base_url=base_url) else: - openai_obj = openai.OpenAI(api_key=api_key, base_url=base_url) - else: - if api_key: - openai.api_key = api_key - openai.api_base = base_url + if api_key: + openai.api_key = api_key + openai.api_base = base_url + # optimistically wrap openai instance for tracing wrapped = False try: - from braintrust.oai import wrap_openai + from braintrust.oai import NamedWrapper, wrap_openai + + if not isinstance(openai_obj, NamedWrapper): + openai_obj = wrap_openai(openai_obj) - openai_obj = wrap_openai(openai_obj) wrapped = True except ImportError: pass - complete_fn = None - rate_limit_error = None - if is_v1: - wrapper = OpenAIWrapper( - complete=openai_obj.chat.completions.create, - embed=openai_obj.embeddings.create, - moderation=openai_obj.moderations.create, - RateLimitError=openai.RateLimitError, - ) - else: - rate_limit_error = openai.error.RateLimitError - if is_async: - complete_fn = openai_obj.ChatCompletion.acreate - embedding_fn = openai_obj.Embedding.acreate - moderation_fn = openai_obj.Moderations.acreate + if client is None: + # prepare the default client if not provided + complete_fn = None + rate_limit_error = None + + Client = LLMClient + + if is_v1: + client = Client( + openai=openai, + complete=openai_obj.chat.completions.create, + embed=openai_obj.embeddings.create, + moderation=openai_obj.moderations.create, + RateLimitError=openai.RateLimitError, + ) else: - complete_fn = openai_obj.ChatCompletion.create - embedding_fn = openai_obj.Embedding.create - moderation_fn = openai_obj.Moderations.create - wrapper = OpenAIWrapper( - complete=complete_fn, - embed=embedding_fn, - moderation=moderation_fn, - RateLimitError=rate_limit_error, - ) + rate_limit_error = openai.error.RateLimitError + if is_async: + complete_fn = openai_obj.ChatCompletion.acreate + embedding_fn = openai_obj.Embedding.acreate + moderation_fn = openai_obj.Moderations.acreate + else: + complete_fn = openai_obj.ChatCompletion.create + embedding_fn = openai_obj.Embedding.create + moderation_fn = openai_obj.Moderations.create + client = Client( + openai=openai, + complete=complete_fn, + embed=embedding_fn, 
+ moderation=moderation_fn, + RateLimitError=rate_limit_error, + ) - return wrapper, wrapped + return client, wrapped def post_process_response(resp): @@ -108,8 +224,10 @@ def set_span_purpose(kwargs): kwargs.setdefault("span_info", {}).setdefault("span_attributes", {})["purpose"] = "scorer" -def run_cached_request(request_type="complete", api_key=None, base_url=None, **kwargs): - wrapper, wrapped = prepare_openai(is_async=False, api_key=api_key, base_url=base_url) +def run_cached_request( + *, client: Optional[LLMClient] = None, request_type="complete", api_key=None, base_url=None, **kwargs +): + wrapper, wrapped = prepare_openai(client=client, is_async=False, api_key=api_key, base_url=base_url) if wrapped: set_span_purpose(kwargs) @@ -127,8 +245,10 @@ def run_cached_request(request_type="complete", api_key=None, base_url=None, **k return resp -async def arun_cached_request(request_type="complete", api_key=None, base_url=None, **kwargs): - wrapper, wrapped = prepare_openai(is_async=True, api_key=api_key, base_url=base_url) +async def arun_cached_request( + *, client: Optional[LLMClient] = None, request_type="complete", api_key=None, base_url=None, **kwargs +): + wrapper, wrapped = prepare_openai(client=client, is_async=True, api_key=api_key, base_url=base_url) if wrapped: set_span_purpose(kwargs) diff --git a/py/autoevals/ragas.py b/py/autoevals/ragas.py index 6d5a820..82d094a 100644 --- a/py/autoevals/ragas.py +++ b/py/autoevals/ragas.py @@ -2,13 +2,14 @@ import asyncio import json +from typing import Optional import chevron from . import Score from .list import ListContains from .llm import OpenAILLMScorer -from .oai import arun_cached_request, run_cached_request +from .oai import LLMClient, arun_cached_request, run_cached_request from .string import EmbeddingSimilarity @@ -76,13 +77,13 @@ def extract_entities_request(text, **extra_args): ) -async def aextract_entities(text, **extra_args): - response = await arun_cached_request(**extract_entities_request(text=text, **extra_args)) +async def aextract_entities(*, text, client: Optional[LLMClient] = None, **extra_args): + response = await arun_cached_request(client=client, **extract_entities_request(text=text, **extra_args)) return json.loads(response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"]) -def extract_entities(text, **extra_args): - response = run_cached_request(**extract_entities_request(text=text, **extra_args)) +def extract_entities(*, text, client: Optional[LLMClient] = None, **extra_args): + response = run_cached_request(client=client, **extract_entities_request(text=text, **extra_args)) return json.loads(response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"]) @@ -92,12 +93,12 @@ class ContextEntityRecall(OpenAILLMScorer): retrieved context. 
""" - def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, **kwargs): - super().__init__(**kwargs) + def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, client: Optional[LLMClient] = None, **kwargs): + super().__init__(client=client, **kwargs) self.extraction_model = model self.contains_scorer = ListContains( - pairwise_scorer=pairwise_scorer or EmbeddingSimilarity(), allow_extra_entities=True + pairwise_scorer=pairwise_scorer or EmbeddingSimilarity(client=client), allow_extra_entities=True ) async def _run_eval_async(self, output, expected=None, context=None, **kwargs): @@ -106,8 +107,8 @@ async def _run_eval_async(self, output, expected=None, context=None, **kwargs): context = "\n".join(context) if isinstance(context, list) else context expected_entities_future, context_entities_future = ( - aextract_entities(text=expected, model=self.extraction_model, **self.extra_args), - aextract_entities(text=context, model=self.extraction_model, **self.extra_args), + aextract_entities(client=self.client, text=expected, model=self.extraction_model, **self.extra_args), + aextract_entities(client=self.client, text=context, model=self.extraction_model, **self.extra_args), ) expected_entities = [e for e in (await expected_entities_future)["entities"]] @@ -127,10 +128,16 @@ def _run_eval_sync(self, output, expected=None, context=None, **kwargs): context = "\n".join(context) if isinstance(context, list) else context expected_entities = [ - e for e in (extract_entities(text=expected, model=self.extraction_model, **self.extra_args))["entities"] + e + for e in ( + extract_entities(client=self.client, text=expected, model=self.extraction_model, **self.extra_args) + )["entities"] ] context_entities = [ - e for e in (extract_entities(text=context, model=self.extraction_model, **self.extra_args))["entities"] + e + for e in ( + extract_entities(client=self.client, text=context, model=self.extraction_model, **self.extra_args) + )["entities"] ] score = self.contains_scorer.eval(output=context_entities, expected=expected_entities) @@ -208,8 +215,8 @@ class ContextRelevancy(OpenAILLMScorer): self-consistency checks. The number of relevant sentences and is used as the score. """ - def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, **kwargs): - super().__init__(**kwargs) + def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, client: Optional[LLMClient] = None, **kwargs): + super().__init__(client=client, **kwargs) self.model = model @@ -234,7 +241,8 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None, return self._postprocess( context, await arun_cached_request( - **extract_sentences_request(question=input, context=context, model=self.model, **self.extra_args) + client=self.client, + **extract_sentences_request(question=input, context=context, model=self.model, **self.extra_args), ), ) @@ -247,7 +255,8 @@ def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwar return self._postprocess( context, run_cached_request( - **extract_sentences_request(question=input, context=context, model=self.model, **self.extra_args) + client=self.client, + **extract_sentences_request(question=input, context=context, model=self.model, **self.extra_args), ), ) @@ -342,8 +351,8 @@ class ContextRecall(OpenAILLMScorer): retrieved context. 
""" - def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, **kwargs): - super().__init__(**kwargs) + def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, client: Optional[LLMClient] = None, **kwargs): + super().__init__(client=client, **kwargs) self.model = model @@ -369,9 +378,10 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None, return self._postprocess( await arun_cached_request( + client=self.client, **extract_context_recall_request( question=input, answer=expected, context=context, model=self.model, **self.extra_args - ) + ), ) ) @@ -383,9 +393,10 @@ def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwar return self._postprocess( run_cached_request( + client=self.client, **extract_context_recall_request( question=input, answer=expected, context=context, model=self.model, **self.extra_args - ) + ), ) ) @@ -475,8 +486,8 @@ class ContextPrecision(OpenAILLMScorer): relevant items selected by the model are ranked higher or not. """ - def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, **kwargs): - super().__init__(**kwargs) + def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, client: Optional[LLMClient] = None, **kwargs): + super().__init__(client=client, **kwargs) self.model = model @@ -499,9 +510,10 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None, return self._postprocess( await arun_cached_request( + client=self.client, **extract_context_precision_request( question=input, answer=expected, context=context, model=self.model, **self.extra_args - ) + ), ) ) @@ -513,9 +525,10 @@ def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwar return self._postprocess( run_cached_request( + client=self.client, **extract_context_precision_request( question=input, answer=expected, context=context, model=self.model, **self.extra_args - ) + ), ) ) @@ -679,25 +692,31 @@ def extract_faithfulness_request(context, statements, **extra_args): ) -async def aextract_statements(question, answer, **extra_args): - response = await arun_cached_request(**extract_statements_request(question=question, answer=answer, **extra_args)) +async def aextract_statements(question, answer, client: Optional[LLMClient] = None, **extra_args): + response = await arun_cached_request( + client=client, **extract_statements_request(question=question, answer=answer, **extra_args) + ) return load_function_call(response) -def extract_statements(question, answer, **extra_args): - response = run_cached_request(**extract_statements_request(question=question, answer=answer, **extra_args)) +def extract_statements(question, answer, client: Optional[LLMClient] = None, **extra_args): + response = run_cached_request( + client=client, **extract_statements_request(question=question, answer=answer, **extra_args) + ) return load_function_call(response) -async def aextract_faithfulness(context, statements, **extra_args): +async def aextract_faithfulness(context, statements, client: Optional[LLMClient] = None, **extra_args): response = await arun_cached_request( - **extract_faithfulness_request(context=context, statements=statements, **extra_args) + client=client, **extract_faithfulness_request(context=context, statements=statements, **extra_args) ) return load_function_call(response) -def extract_faithfulness(context, statements, **extra_args): - response = run_cached_request(**extract_faithfulness_request(context=context, statements=statements, **extra_args)) +def 
extract_faithfulness(context, statements, client: Optional[LLMClient] = None, **extra_args): + response = run_cached_request( + client=client, **extract_faithfulness_request(context=context, statements=statements, **extra_args) + ) return load_function_call(response) @@ -706,20 +725,24 @@ class Faithfulness(OpenAILLMScorer): Measures factual consistency of a generated answer against the given context. """ - def __init__(self, model=DEFAULT_RAGAS_MODEL, **kwargs): - super().__init__(**kwargs) + def __init__(self, model=DEFAULT_RAGAS_MODEL, client: Optional[LLMClient] = None, **kwargs): + super().__init__(client=client, **kwargs) self.model = model async def _run_eval_async(self, output, expected=None, input=None, context=None, **kwargs): check_required("Faithfulness", input=input, output=output, context=context) - statements = (await aextract_statements(question=input, answer=expected, model=self.model, **self.extra_args))[ - "statements" - ] + statements = ( + await aextract_statements( + client=self.client, question=input, answer=expected, model=self.model, **self.extra_args + ) + )["statements"] faithfulness = ( - await aextract_faithfulness(context=context, statements=statements, model=self.model, **self.extra_args) + await aextract_faithfulness( + client=self.client, context=context, statements=statements, model=self.model, **self.extra_args + ) )["faithfulness"] return Score( @@ -734,12 +757,16 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None, def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwargs): check_required("Faithfulness", input=input, context=context) - statements = (extract_statements(question=input, answer=expected, model=self.model, **self.extra_args))[ - "statements" - ] + statements = ( + extract_statements( + client=self.client, question=input, answer=expected, model=self.model, **self.extra_args + ) + )["statements"] faithfulness = ( - extract_faithfulness(context=context, statements=statements, model=self.model, **self.extra_args) + extract_faithfulness( + client=self.client, context=context, statements=statements, model=self.model, **self.extra_args + ) )["faithfulness"] return Score( @@ -837,9 +864,10 @@ def __init__( strictness=3, temperature=0.5, embedding_model=DEFAULT_RAGAS_EMBEDDING_MODEL, + client: Optional[LLMClient] = None, **kwargs, ): - super().__init__(temperature=temperature, **kwargs) + super().__init__(temperature=temperature, client=client, **kwargs) self.model = model self.strictness = strictness @@ -868,14 +896,19 @@ async def _run_eval_async(self, output, expected=None, input=None, context=None, questions = await asyncio.gather( *[ aload_function_call_request( - **extract_question_gen_request(answer=output, context=context, model=self.model, **self.extra_args) + client=self.client, + **extract_question_gen_request( + answer=output, context=context, model=self.model, **self.extra_args + ), ) for _ in range(self.strictness) ] ) similarity = await asyncio.gather( *[ - EmbeddingSimilarity().eval_async(output=q["question"], expected=input, model=self.embedding_model) + EmbeddingSimilarity(client=self.client).eval_async( + output=q["question"], expected=input, model=self.embedding_model + ) for q in questions ] ) @@ -887,12 +920,14 @@ def _run_eval_sync(self, output, expected=None, input=None, context=None, **kwar questions = [ load_function_call_request( - **extract_question_gen_request(answer=output, context=context, model=self.model, **self.extra_args) + client=self.client, + 
**extract_question_gen_request(answer=output, context=context, model=self.model, **self.extra_args), ) for _ in range(self.strictness) ] similarity = [ - EmbeddingSimilarity().eval(output=q["question"], expected=input, model=self.model) for q in questions + EmbeddingSimilarity(client=self.client).eval(output=q["question"], expected=input, model=self.model) + for q in questions ] return self._postprocess(questions, similarity) @@ -903,22 +938,30 @@ class AnswerSimilarity(OpenAILLMScorer): Measures the similarity between the generated answer and the expected answer. """ - def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_EMBEDDING_MODEL, **kwargs): - super().__init__(**kwargs) + def __init__( + self, + pairwise_scorer=None, + model=DEFAULT_RAGAS_EMBEDDING_MODEL, + client: Optional[LLMClient] = None, + **kwargs, + ): + super().__init__(client=client, **kwargs) self.model = model async def _run_eval_async(self, output, expected=None, input=None, **kwargs): check_required("AnswerSimilarity", expected=expected, output=output) - return await EmbeddingSimilarity().eval_async( + return await EmbeddingSimilarity(client=self.client).eval_async( output=output, expected=expected, model=self.model, **self.extra_args ) def _run_eval_sync(self, output, expected=None, input=None, **kwargs): check_required("AnswerSimilarity", expected=expected, output=output) - return EmbeddingSimilarity().eval(output=output, expected=expected, model=self.model, **self.extra_args) + return EmbeddingSimilarity(client=self.client).eval( + output=output, expected=expected, model=self.model, **self.extra_args + ) CORRECTNESS_PROMPT = """Given a ground truth and an answer, analyze each statement in the answer and classify them in one of the following categories: @@ -1015,12 +1058,13 @@ def __init__( factuality_weight=0.75, answer_similarity_weight=0.25, answer_similarity=None, + client: Optional[LLMClient] = None, **kwargs, ): - super().__init__(**kwargs) + super().__init__(client=client, **kwargs) self.model = model - self.answer_similarity = answer_similarity or AnswerSimilarity() + self.answer_similarity = answer_similarity or AnswerSimilarity(client=client) if factuality_weight == 0 and answer_similarity_weight == 0: raise ValueError("At least one weight must be nonzero") @@ -1065,9 +1109,10 @@ async def _run_eval_async(self, output, expected=None, input=None, **kwargs): factuality_future, similarity_future = ( aload_function_call_request( + client=self.client, **extract_correctness_request( question=input, answer=output, ground_truth=expected, model=self.model, **self.extra_args - ) + ), ), self._run_answer_similarity_async(output, expected), ) @@ -1079,9 +1124,10 @@ def _run_eval_sync(self, output, expected=None, input=None, **kwargs): factuality, similarity = ( load_function_call_request( + client=self.client, **extract_correctness_request( question=input, answer=output, ground_truth=expected, model=self.model, **self.extra_args - ) + ), ), self._run_answer_similarity_sync(output, expected), ) @@ -1093,9 +1139,9 @@ def load_function_call(response): return json.loads(response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"]) -async def aload_function_call_request(**kwargs): - return load_function_call(await arun_cached_request(**kwargs)) +async def aload_function_call_request(client: Optional[LLMClient] = None, **kwargs): + return load_function_call(await arun_cached_request(client=client, **kwargs)) -def load_function_call_request(**kwargs): - return 
load_function_call(run_cached_request(**kwargs)) +def load_function_call_request(client: Optional[LLMClient] = None, **kwargs): + return load_function_call(run_cached_request(client=client, **kwargs)) diff --git a/py/autoevals/string.py b/py/autoevals/string.py index e078969..370132f 100644 --- a/py/autoevals/string.py +++ b/py/autoevals/string.py @@ -1,4 +1,5 @@ import threading +from typing import Optional from braintrust_core.score import Score from Levenshtein import distance @@ -6,7 +7,7 @@ from autoevals.partial import ScorerWithPartial from autoevals.value import normalize_value -from .oai import arun_cached_request, run_cached_request +from .oai import LLMClient, arun_cached_request, run_cached_request class Levenshtein(ScorerWithPartial): @@ -41,7 +42,15 @@ class EmbeddingSimilarity(ScorerWithPartial): _CACHE = {} _CACHE_LOCK = threading.Lock() - def __init__(self, prefix="", model=MODEL, expected_min=0.7, api_key=None, base_url=None): + def __init__( + self, + prefix="", + model=MODEL, + expected_min=0.7, + api_key=None, + base_url=None, + client: Optional[LLMClient] = None, + ): """ Create a new EmbeddingSimilarity scorer. @@ -59,13 +68,17 @@ def __init__(self, prefix="", model=MODEL, expected_min=0.7, api_key=None, base_ if base_url: self.extra_args["base_url"] = base_url + self.client = client + async def _a_embed(self, value): value = normalize_value(value, maybe_object=False) with self._CACHE_LOCK: if value in self._CACHE: return self._CACHE[value] - result = await arun_cached_request("embed", input=f"{self.prefix}{value}", **self.extra_args) + result = await arun_cached_request( + client=self.client, request_type="embed", input=f"{self.prefix}{value}", **self.extra_args + ) with self._CACHE_LOCK: self._CACHE[value] = result @@ -78,7 +91,9 @@ def _embed(self, value): if value in self._CACHE: return self._CACHE[value] - result = run_cached_request("embed", input=f"{self.prefix}{value}", **self.extra_args) + result = run_cached_request( + client=self.client, request_type="embed", input=f"{self.prefix}{value}", **self.extra_args + ) with self._CACHE_LOCK: self._CACHE[value] = result diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py index b845101..12ec823 100644 --- a/py/autoevals/test_llm.py +++ b/py/autoevals/test_llm.py @@ -1,7 +1,12 @@ import asyncio +from typing import cast +from unittest.mock import Mock import chevron +import pytest +import respx +from autoevals import init from autoevals.llm import * from autoevals.llm import build_classification_tools @@ -107,6 +112,195 @@ async def nested_async(): asyncio.run(nested_async()) +@respx.mock +def test_factuality(): + # something is wrong with respx that it couldn't match the url from openai + respx.route().respond( + json={ + "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": None, + "message": { + "content": None, + "refusal": None, + "role": "assistant", + "tool_calls": [ + { + "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", + "function": { + "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. 
Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', + "name": "select_choice", + }, + "type": "function", + } + ], + }, + } + ], + "created": 1734029028, + "model": "gpt-4o-2024-08-06", + "object": "chat.completion", + "system_fingerprint": "fp_cc5cf1c6e3", + "usage": { + "completion_tokens": 149, + "prompt_tokens": 404, + "total_tokens": 553, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 0, + "rejected_prediction_tokens": 0, + }, + "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, + }, + } + ) + + llm = Factuality(base_url="https://api.openai.com/v1/") + result = llm.eval( + output="6", + expected="6", + input="Add the following numbers: 1, 2, 3", + ) + + assert result.score == 1 + + +def test_factuality_client(): + client = Mock() + client.RateLimitError = Exception + + completion = Mock() + completion.to_dict.return_value = { + "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": None, + "message": { + "content": None, + "refusal": None, + "role": "assistant", + "tool_calls": [ + { + "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", + "function": { + "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', + "name": "select_choice", + }, + "type": "function", + } + ], + }, + } + ], + "created": 1734029028, + "model": "gpt-4o-2024-08-06", + "object": "chat.completion", + "system_fingerprint": "fp_cc5cf1c6e3", + "usage": { + "completion_tokens": 149, + "prompt_tokens": 404, + "total_tokens": 553, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 0, + "rejected_prediction_tokens": 0, + }, + "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, + }, + } + + client.complete.return_value = completion + + llm = Factuality(client=cast(LLMClient, client)) + result = llm.eval( + output="6", + expected="6", + input="Add the following numbers: 1, 2, 3", + ) + + assert client.complete.call_count == 1 + + assert result.score == 1 + + +@pytest.fixture(autouse=True) +def reset_client(): + yield + init(client=None) + + +# make sure we deny any leaked calls to OpenAI +@respx.mock(base_url="https://api.openai.com/v1/") +def test_init_client(): + client = Mock() + client.RateLimitError = Exception + + completion = Mock() + completion.to_dict.return_value = { + "id": "chatcmpl-AdiS4bHWjqSclA5rx7OkuZ6EA9QIp", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "logprobs": None, + "message": { + "content": None, + "refusal": None, + "role": "assistant", + "tool_calls": [ + { + "id": "call_JKoeGAX2zGPJAmF2muDgjpHp", + "function": { + "arguments": '{"reasons":"1. The question asks to add the numbers 1, 2, and 3.\\n2. The expert answer provides the sum of these numbers as 6.\\n3. 
The submitted answer also provides the sum as 6.\\n4. Both the expert and submitted answers provide the same numerical result, which is 6.\\n5. Since both answers provide the same factual content, the submitted answer contains all the same details as the expert answer.\\n6. There is no additional information or discrepancy between the two answers.\\n7. Therefore, the submitted answer is neither a subset nor a superset; it is exactly the same as the expert answer in terms of factual content.","choice":"C"}', + "name": "select_choice", + }, + "type": "function", + } + ], + }, + } + ], + "created": 1734029028, + "model": "gpt-4o-2024-08-06", + "object": "chat.completion", + "system_fingerprint": "fp_cc5cf1c6e3", + "usage": { + "completion_tokens": 149, + "prompt_tokens": 404, + "total_tokens": 553, + "completion_tokens_details": { + "accepted_prediction_tokens": 0, + "audio_tokens": 0, + "reasoning_tokens": 0, + "rejected_prediction_tokens": 0, + }, + "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0}, + }, + } + + client.complete.return_value = completion + + init(client=client) + + llm = Factuality(base_url="https://api.openai.com/v1/") + result = llm.eval( + output="6", + expected="6", + input="Add the following numbers: 1, 2, 3", + ) + + assert client.complete.call_count == 1 + + assert result.score == 1 + + def test_battle(): for use_cot in [True, False]: print("use_cot", use_cot) diff --git a/py/autoevals/version.py b/py/autoevals/version.py index a35b79e..9134c22 100644 --- a/py/autoevals/version.py +++ b/py/autoevals/version.py @@ -1 +1 @@ -VERSION = "0.0.109" +VERSION = "0.0.110" diff --git a/setup.py b/setup.py index 70e7c4c..2d92e31 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ "isort==5.12.0", "pre-commit", "pytest", + "respx", "twine", ], "doc": ["pydoc-markdown"],
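
Below is a minimal end-to-end sketch of the API this patch introduces, mirroring the `Example` block in the new `LLMClient` docstring. It is illustrative only (not part of the patch) and assumes the OpenAI v1 SDK with `OPENAI_API_KEY` set in the environment.

```python
import openai

from autoevals import Factuality, init
from autoevals.oai import LLMClient

# Build an LLMClient around an explicit OpenAI v1 client instance.
openai_client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment

client = LLMClient(
    openai=openai_client,
    complete=openai_client.chat.completions.create,
    embed=openai_client.embeddings.create,
    moderation=openai_client.moderations.create,
    RateLimitError=openai.RateLimitError,
)

# Register the client globally; evaluators use it unless given their own client.
init(client=client)

result = Factuality().eval(
    input="Add the following numbers: 1, 2, 3",
    output="6",
    expected="6",
)
print(result.score)
```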