From 564ca89d86b419439f3a40756dfb14fe365cba79 Mon Sep 17 00:00:00 2001
From: Josh Reini <60949774+joshreini1@users.noreply.github.com>
Date: Mon, 18 Dec 2023 17:26:53 -0500
Subject: [PATCH] releases/rc-trulens-eval-0.19.2 (#694)

* bump versions in quickstarts

* bump version

* remove openai references in function definitions page
---
 docs/trulens_eval/function_definitions.md          | 15 +++++++--------
 .../examples/quickstart/groundtruth_evals.ipynb    |  2 +-
 .../examples/quickstart/human_feedback.ipynb       |  2 +-
 .../quickstart/langchain_quickstart.ipynb          |  2 +-
 .../quickstart/llama_index_quickstart.ipynb        |  2 +-
 .../examples/quickstart/prototype_evals.ipynb      |  2 +-
 trulens_eval/examples/quickstart/quickstart.ipynb  |  4 ++--
 .../quickstart/text2text_quickstart.ipynb          |  2 +-
 trulens_eval/trulens_eval/__init__.py              |  2 +-
 9 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/docs/trulens_eval/function_definitions.md b/docs/trulens_eval/function_definitions.md
index e955091dd..5d85a1d45 100644
--- a/docs/trulens_eval/function_definitions.md
+++ b/docs/trulens_eval/function_definitions.md
@@ -8,7 +8,7 @@ See also:
 
 This evaluates the *relevance* of the LLM response to the given text by LLM prompting.
 
-Relevance is currently only available with OpenAI ChatCompletion API.
+Relevance is currently only available with LLM-based feedback functions.
 
 TruLens offers two particular flavors of relevance:
 
@@ -33,24 +33,23 @@ TruLens offers two particular flavors of relevance:
 
 ## Groundedness
 
-Groundedness uses OpenAI LLMs or Huggingface NLI to attempt to check if an answer is grounded in its supplied contexts on a scale from 1 to 10. The information overlap or entailment between source and response is then measured, choosing the highest score between sources and then averaged and scaled from 0 to 1.
+Groundedness uses an LLM or Huggingface NLI to attempt to check if an answer is grounded in its supplied contexts on a scale from 1 to 10. The information overlap or entailment between source and response is then measured, choosing the highest score between sources and then averaged and scaled from 0 to 1.
 
 You can read about the performance of groundedness evaluations by viewing its [smoke test results](../groundedness_smoke_tests/).
 
-
 ## Sentiment
 
 This evaluates the *positive sentiment* of either the prompt or response.
 
-Sentiment is currently available to use with OpenAI, HuggingFace or Cohere as the model provider.
+Sentiment is currently available to use with LLM-based feedback functions or Huggingface (as a classification model) as the model provider.
 
-* The OpenAI sentiment feedback function prompts a Chat Completion model to rate the sentiment from 1 to 10, and then scales the response down to 0-1.
+* The LLM-based sentiment feedback function prompts a Chat Completion model to rate the sentiment from 0 to 10, and then scales the response down to 0-1.
 * The HuggingFace sentiment feedback function returns a raw score from 0 to 1.
 * The Cohere sentiment feedback function uses the classification endpoint and a small set of examples stored in `feedback_prompts.py` to return either a 0 or a 1.
 
 ## Model Agreement
 
-Model agreement uses OpenAI to attempt an honest answer at your prompt with system prompts for correctness, and then evaluates the agreement of your LLM response to this model on a scale from 1 to 10. The agreement with each honest bot is then averaged and scaled from 0 to 1.
+Model agreement uses an LLM to attempt an honest answer at your prompt with system prompts for correctness, and then evaluates the agreement of your LLM response to this model on a scale from 1 to 10. The agreement with each honest bot is then averaged and scaled from 0 to 1.
 
 ## Language Match
 
@@ -70,11 +69,11 @@ The OpenAI Moderation API is made available for use as feedback functions. This
 
 ## Stereotypes
 
-This evaluates stereotypes using OpenAI LLMs to check if gender or race were assumed with no prior indication. This is rated on a scale from 1 to 10 where 10 being no new gender or race assumptions. A two indicates gender or race assumption with no indication, and a one indicates gender or race changes with prior indication that is different.
+This evaluates stereotypes using an LLM to check if gender or race were assumed with no prior indication. This is rated on a scale from 1 to 10 where 10 being no new gender or race assumptions. A two indicates gender or race assumption with no indication, and a one indicates gender or race changes with prior indication that is different.
 
 ## Summarization
 
-This evaluates summarization tasks using OpenAI LLMs to check how well a summarization hits upon main points. This is rated on a scale from 1 to 10 where 10 being all points are addressed.
+This evaluates summarization tasks using an LLM to check how well a summarization hits upon main points. This is rated on a scale from 1 to 10 where 10 being all points are addressed.
 
 ## Embeddings Distance
 
diff --git a/trulens_eval/examples/quickstart/groundtruth_evals.ipynb b/trulens_eval/examples/quickstart/groundtruth_evals.ipynb
index af088399d..763125c67 100644
--- a/trulens_eval/examples/quickstart/groundtruth_evals.ipynb
+++ b/trulens_eval/examples/quickstart/groundtruth_evals.ipynb
@@ -29,7 +29,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ! pip install trulens_eval==0.19.1 openai==1.3.7"
+    "# ! pip install trulens_eval==0.19.2 openai==1.3.7"
    ]
   },
   {
diff --git a/trulens_eval/examples/quickstart/human_feedback.ipynb b/trulens_eval/examples/quickstart/human_feedback.ipynb
index c57bc658b..85cbce1ea 100644
--- a/trulens_eval/examples/quickstart/human_feedback.ipynb
+++ b/trulens_eval/examples/quickstart/human_feedback.ipynb
@@ -17,7 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ! pip install trulens_eval==0.19.1 openai==1.3.7"
+    "# ! pip install trulens_eval==0.19.2 openai==1.3.7"
    ]
   },
   {
diff --git a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb
index 93f57884e..63415ac97 100644
--- a/trulens_eval/examples/quickstart/langchain_quickstart.ipynb
+++ b/trulens_eval/examples/quickstart/langchain_quickstart.ipynb
@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ! pip install trulens_eval==0.19.1 openai==1.3.7 langchain chromadb langchainhub bs4"
+    "# ! pip install trulens_eval==0.19.2 openai==1.3.7 langchain chromadb langchainhub bs4"
    ]
   },
   {
diff --git a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb
index 990b008e7..cd7cd3e0e 100644
--- a/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb
+++ b/trulens_eval/examples/quickstart/llama_index_quickstart.ipynb
@@ -31,7 +31,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# pip install trulens_eval==0.19.1 llama_index>=0.9.15post2 html2text>=2020.1.16 "
+    "# pip install trulens_eval==0.19.2 llama_index>=0.9.15post2 html2text>=2020.1.16 "
    ]
   },
   {
diff --git a/trulens_eval/examples/quickstart/prototype_evals.ipynb b/trulens_eval/examples/quickstart/prototype_evals.ipynb
index a9877cef0..5cb496e0a 100644
--- a/trulens_eval/examples/quickstart/prototype_evals.ipynb
+++ b/trulens_eval/examples/quickstart/prototype_evals.ipynb
@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ! pip install trulens_eval==0.19.1"
+    "# ! pip install trulens_eval==0.19.2"
    ]
   },
   {
diff --git a/trulens_eval/examples/quickstart/quickstart.ipynb b/trulens_eval/examples/quickstart/quickstart.ipynb
index 621661f6e..cd908eeff 100644
--- a/trulens_eval/examples/quickstart/quickstart.ipynb
+++ b/trulens_eval/examples/quickstart/quickstart.ipynb
@@ -19,7 +19,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ! pip install trulens_eval==0.19.1 chromadb==0.4.18 openai==1.3.7"
+    "# ! pip install trulens_eval==0.19.2 chromadb==0.4.18 openai==1.3.7"
    ]
   },
   {
@@ -29,7 +29,7 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\""
+    "os.environ[\"OPENAI_API_KEY\"] = \"...\""
    ]
   },
   {
diff --git a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb
index 08e89c712..0adc42d50 100644
--- a/trulens_eval/examples/quickstart/text2text_quickstart.ipynb
+++ b/trulens_eval/examples/quickstart/text2text_quickstart.ipynb
@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ! pip install trulens_eval==0.19.1 openai==1.3.1"
+    "# ! pip install trulens_eval==0.19.2 openai==1.3.1"
    ]
   },
   {
diff --git a/trulens_eval/trulens_eval/__init__.py b/trulens_eval/trulens_eval/__init__.py
index a0dd79072..8880b96b8 100644
--- a/trulens_eval/trulens_eval/__init__.py
+++ b/trulens_eval/trulens_eval/__init__.py
@@ -78,7 +78,7 @@
 
 """
 
-__version__ = "0.19.1"
+__version__ = "0.19.2"
 
 from trulens_eval.feedback import Bedrock
 from trulens_eval.feedback import Feedback
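For reviewers, a minimal sketch (not part of the patch) of how the LLM-based feedback functions described in the updated function_definitions.md might be wired up after the bump to trulens_eval==0.19.2. The OpenAI provider is only one option, and the import paths follow the 0.19.x quickstart pattern; they are assumptions rather than changes introduced here.

    # ! pip install trulens_eval==0.19.2 openai==1.3.7

    import os

    os.environ["OPENAI_API_KEY"] = "..."  # key for whichever provider is used

    from trulens_eval import Feedback, OpenAI

    provider = OpenAI()  # any supported LLM provider should work similarly

    # Relevance of the response to the prompt, returned on a 0-1 scale.
    f_relevance = Feedback(provider.relevance).on_input_output()

    # Positive sentiment of the response, also scaled to 0-1.
    f_sentiment = Feedback(provider.sentiment).on_output()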