[Enhancement] Add LLM API key checks to LLM-based evaluators #1989

Merged
Changes from 2 of 17 commits
28320b8
refactor (backend): add check for OpenAI API key with clear exception…
aybruhm Aug 14, 2024
d8a1bbd
feat (tests): add test case for auto_ai_critique and evaluators requi…
aybruhm Aug 14, 2024
e02fefa
refactor (backend): rewrite db function to check if evaluators exist …
aybruhm Aug 19, 2024
4cee49f
chore (backend): remove deprecated function 'check_ai_critique_inputs'
aybruhm Aug 19, 2024
c6ee3c8
feat (backend): implemented helper functions to:
aybruhm Aug 19, 2024
a8c1273
refactor (backend): update evaluator_router to:
aybruhm Aug 19, 2024
f3367ef
feat (tests): added test to create evaluation with no llm keys
aybruhm Aug 20, 2024
c499a19
refactor (backend): added
aybruhm Aug 20, 2024
cc90567
Merge branch 'main' into feature/age-532-poc-1e-add-llm-api-key-check…
aybruhm Aug 20, 2024
7197942
Merge branch 'feature/age-491-poc-1e-expose-running-evaluators-via-ap…
aybruhm Aug 20, 2024
a4ecc3c
Merge branch 'feature/age-491-poc-1e-expose-running-evaluators-via-ap…
aybruhm Aug 21, 2024
33e6e17
refactor (backend): clean up LLM key checks in evaluators
aybruhm Aug 22, 2024
7c28f6d
chore (tests): add '@pytest.mark.asyncio' to test cases in test_user_…
aybruhm Aug 22, 2024
91d23d8
fix ai critique
jp-agenta Aug 23, 2024
ca81cea
minor refactor (backend): resolve ValueError when casting string to f…
aybruhm Aug 23, 2024
0ce0022
Merge branch 'feature/age-491-poc-1e-expose-running-evaluators-via-ap…
aybruhm Aug 26, 2024
cc33a66
Update evaluators_service.py
jp-agenta Aug 26, 2024
21 changes: 15 additions & 6 deletions agenta-backend/agenta_backend/services/evaluators_service.py
@@ -359,7 +359,12 @@ async def auto_ai_critique(


async def ai_critique(input: EvaluatorInputInterface) -> EvaluatorOutputInterface:
openai_api_key = input.credentials["OPENAI_API_KEY"]
openai_api_key = input.credentials.get("OPENAI_API_KEY", None)

if not openai_api_key:
raise Exception(
"No OpenAI key was found. AI Critique evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again."
)

chain_run_args = {
"llm_app_prompt_template": input.inputs.get("prompt_user", ""),
@@ -786,7 +791,7 @@ async def measure_rag_consistency(
openai_api_key = input.credentials.get("OPENAI_API_KEY", None)
if not openai_api_key:
raise Exception(
"No LLM keys OpenAI key found. Please configure your OpenAI keys and try again."
"No OpenAI key was found. RAG evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again."
)

# Initialize RAG evaluator to calculate faithfulness score
@@ -885,10 +890,9 @@ async def measure_context_coherence(
input: EvaluatorInputInterface,
) -> EvaluatorOutputInterface:
openai_api_key = input.credentials.get("OPENAI_API_KEY", None)

if not openai_api_key:
raise Exception(
"No LLM keys OpenAI key found. Please configure your OpenAI keys and try again."
"No OpenAI key was found. RAG evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again."
)

# Initialize RAG evaluator to calculate context relevancy score
@@ -1124,8 +1128,13 @@ async def semantic_similarity(
float: the semantic similarity score
"""

api_key = input.credentials["OPENAI_API_KEY"]
openai = AsyncOpenAI(api_key=api_key)
openai_api_key = input.credentials.get("OPENAI_API_KEY", None)
if not openai_api_key:
raise Exception(
"No OpenAI key was found. Semantic evaluator requires a valid OpenAI API key to function. Please configure your OpenAI API and try again."
)

openai = AsyncOpenAI(api_key=openai_api_key)

async def encode(text: str):
response = await openai.embeddings.create(
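In ai_critique and semantic_similarity, the hard input.credentials["OPENAI_API_KEY"] lookup, which previously surfaced a bare KeyError, is replaced by dict.get(..., None) plus an explicit raise, matching the guard the two RAG evaluators already had; the RAG error messages are also reworded to name the evaluator and the fix. In semantic_similarity the AsyncOpenAI client is now constructed only after the guard passes, so a missing or empty key is reported before any OpenAI call is attempted.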
107 changes: 99 additions & 8 deletions agenta-backend/agenta_backend/tests/unit/test_evaluators.py
@@ -5,6 +5,7 @@

from agenta_backend.services.evaluators_service import (
auto_levenshtein_distance,
auto_ai_critique,
auto_starts_with,
auto_ends_with,
auto_contains,
@@ -18,6 +19,53 @@
)


@pytest.mark.parametrize(
"ground_truth, output, settings_values, openai_api_key, expected_min, expected_max",
[
(
{"correct_answer": "The capital of Kiribati is Tarawa."},
"The capital of Kiribati is South Tarawa.",
{
"prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"correct_answer_key": "correct_answer",
},
os.environ.get("OPENAI_API_KEY"),
0,
10,
),
(
{"correct_answer": "The capital of Kiribati is Tarawa."},
"The capital of Kiribati is South Tarawa.",
{
"prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"correct_answer_key": "correct_answer",
},
None,
None,
None,
),
],
)
@pytest.mark.asyncio
async def test_auto_ai_critique_evaluator(
ground_truth, output, settings_values, openai_api_key, expected_min, expected_max
):
result = await auto_ai_critique(
{},
output,
ground_truth,
{},
settings_values,
{"OPENAI_API_KEY": openai_api_key},
)
try:
assert expected_min <= round(result.value, 1) <= expected_max
except TypeError:
# a TypeError here means the evaluator returned a non-numeric error result
# (typically because no OpenAI API key was configured)
assert not isinstance(result.value, (float, int))


@pytest.mark.parametrize(
"output, settings_values, expected",
[
@@ -287,6 +335,15 @@ async def test_auto_json_diff(
0.0,
1.0,
),
(
{"correct_answer": "The capital of Namibia is Windhoek."},
"Windhoek is the capital of Namibia.",
{
"correct_answer_key": "correct_answer",
},
None,
None,
),
],
)
@pytest.mark.asyncio
@@ -301,7 +358,12 @@ async def test_auto_semantic_similarity_match(
settings_values,
{"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
)
assert expected_min <= round(result.value, 3) <= expected_max
try:
assert expected_min <= round(result.value, 1) <= expected_max
except TypeError:
# a TypeError here means the evaluator returned a non-numeric error result
# (typically because no OpenAI API key was configured)
assert not isinstance(result.value, (float, int))


@pytest.mark.parametrize(
@@ -359,60 +421,89 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex


@pytest.mark.parametrize(
"settings_values, expected_min, expected_max",
"settings_values, expected_min, openai_api_key, expected_max",
[
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
os.environ.get("OPENAI_API_KEY"),
0.0,
1.0,
),
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
None,
None,
None,
),
# add more use cases
],
)
@pytest.mark.asyncio
async def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max):
async def test_rag_faithfulness_evaluator(
settings_values, expected_min, openai_api_key, expected_max
):
result = await rag_faithfulness(
{},
simple_rag_trace,
{},
{},
settings_values,
{"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
{"OPENAI_API_KEY": openai_api_key},
)

assert expected_min <= round(result.value, 1) <= expected_max
try:
assert expected_min <= round(result.value, 1) <= expected_max
except TypeError:
# a TypeError here means the evaluator returned a non-numeric error result
# (typically because no OpenAI API key was configured)
assert not isinstance(result.value, (float, int))


@pytest.mark.parametrize(
"settings_values, expected_min, expected_max",
"settings_values, expected_min, openai_api_key, expected_max",
[
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
os.environ.get("OPENAI_API_KEY"),
0.0,
1.0,
),
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
None,
None,
None,
),
# add more use cases
],
)
@pytest.mark.asyncio
async def test_rag_context_relevancy_evaluator(
settings_values, expected_min, expected_max
settings_values, expected_min, openai_api_key, expected_max
):
result = await rag_context_relevancy(
{},
simple_rag_trace,
{},
{},
settings_values,
{"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
{"OPENAI_API_KEY": openai_api_key},
)

try:
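
The try/except blocks in the updated tests guard the round(...) call: when no OPENAI_API_KEY is configured, the evaluator returns an error result whose value is not numeric, so rounding (or comparing against None bounds) raises TypeError and the test falls back to asserting that no numeric score was produced. A hypothetical helper that states the same intent in one place is sketched below; it is not part of this PR's test suite, and the name assert_score_or_error is invented for illustration.

# Hypothetical helper, not part of this PR: states the intent of the
# try/except assertions above in one place.
def assert_score_or_error(value, expected_min, expected_max):
    """Expect a bounded numeric score, or an error result when no bounds are given."""
    if expected_min is None and expected_max is None:
        # This parametrized case expects no numeric score (e.g. it ran without
        # an OpenAI API key), so the evaluator should have returned an error result.
        assert not isinstance(value, (int, float))
    else:
        assert expected_min <= round(value, 1) <= expected_max

With such a helper, each test body would reduce to the evaluator call followed by assert_score_or_error(result.value, expected_min, expected_max).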