feat (tests): add test case for auto_ai_critique and evaluators requiring OpenAI API key
aybruhm committed Aug 14, 2024
1 parent 28320b8 commit d8a1bbd
Showing 1 changed file with 99 additions and 8 deletions.
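The added tests share one pattern: each evaluator is parametrized once with an OPENAI_API_KEY read from the environment and once with None, and the assertion accepts either a numeric score inside the expected range or a non-numeric error result when no key is available. Below is a minimal, self-contained sketch of that pattern; fake_evaluator and the test name are hypothetical stand-ins rather than part of this commit, and pytest-asyncio is assumed to be installed (the real tests already rely on @pytest.mark.asyncio).

import os

import pytest


async def fake_evaluator(output: str, settings: dict, secrets: dict):
    # Hypothetical stand-in for an Agenta evaluator such as auto_ai_critique:
    # returns a float score when an OpenAI API key is available and an error
    # string otherwise (illustrative behaviour, not the real service code).
    if not secrets.get("OPENAI_API_KEY"):
        return "error: missing OpenAI API key"
    return 7.5


@pytest.mark.parametrize(
    "openai_api_key, expected_min, expected_max",
    [
        (os.environ.get("OPENAI_API_KEY"), 0, 10),  # key available: expect a score in range
        (None, None, None),  # no key: expect an error value instead of a score
    ],
)
@pytest.mark.asyncio
async def test_evaluator_with_and_without_key(openai_api_key, expected_min, expected_max):
    result = await fake_evaluator("some output", {}, {"OPENAI_API_KEY": openai_api_key})
    try:
        assert expected_min <= round(result, 1) <= expected_max
    except TypeError:
        # round() raises TypeError when the evaluator returned an error value
        # rather than a score, which is the expected outcome without a key.
        assert not isinstance(result, (int, float))

Rather than skipping these tests when no key is configured, the commit asserts that each evaluator degrades gracefully by returning an error result instead of a numeric score.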
107 changes: 99 additions & 8 deletions agenta-backend/agenta_backend/tests/unit/test_evaluators.py
@@ -5,6 +5,7 @@

from agenta_backend.services.evaluators_service import (
auto_levenshtein_distance,
auto_ai_critique,
auto_starts_with,
auto_ends_with,
auto_contains,
@@ -18,6 +19,53 @@
)


@pytest.mark.parametrize(
"ground_truth, output, settings_values, openai_api_key, expected_min, expected_max",
[
(
{"correct_answer": "The capital of Kiribati is Tarawa."},
"The capital of Kiribati is South Tarawa.",
{
"prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"correct_answer_key": "correct_answer",
},
os.environ.get("OPENAI_API_KEY"),
0,
10,
),
(
{"correct_answer": "The capital of Kiribati is Tarawa."},
"The capital of Kiribati is South Tarawa.",
{
"prompt_template": "We have an LLM App that we want to evaluate its outputs. Based on the prompt and the parameters provided below evaluate the output based on the evaluation strategy below:\nEvaluation strategy: 0 to 10 0 is very bad and 10 is very good.\nPrompt: {llm_app_prompt_template}\nInputs: country: {country}\nExpected Answer Column:{correct_answer}\nEvaluate this: {variant_output}\n\nAnswer ONLY with one of the given grading or evaluation options.",
"correct_answer_key": "correct_answer",
},
None,
None,
None,
),
],
)
@pytest.mark.asyncio
async def test_auto_ai_critique_evaluator(
ground_truth, output, settings_values, openai_api_key, expected_min, expected_max
):
result = await auto_ai_critique(
{},
output,
ground_truth,
{},
settings_values,
{"OPENAI_API_KEY": openai_api_key},
)
    try:
        assert expected_min <= round(result.value, 1) <= expected_max
    except TypeError:
        # Without an OpenAI API key the evaluator returns an error result, so
        # result.value is not numeric and round() raises TypeError.
        assert not isinstance(result.value, (float, int))


@pytest.mark.parametrize(
"output, settings_values, expected",
[
@@ -287,6 +335,15 @@ async def test_auto_json_diff(
0.0,
1.0,
),
(
{"correct_answer": "The capital of Namibia is Windhoek."},
"Windhoek is the capital of Namibia.",
{
"correct_answer_key": "correct_answer",
},
None,
None,
),
],
)
@pytest.mark.asyncio
@@ -301,7 +358,12 @@ async def test_auto_semantic_similarity_match(
settings_values,
{"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
)
assert expected_min <= round(result.value, 3) <= expected_max
    try:
        assert expected_min <= round(result.value, 1) <= expected_max
    except TypeError:
        # The evaluator returned an error result instead of a numeric score,
        # so round() raises TypeError.
        assert not isinstance(result.value, (float, int))


@pytest.mark.parametrize(
@@ -359,60 +421,89 @@ async def test_auto_levenshtein_distance(output, data_point, settings_values, ex


@pytest.mark.parametrize(
"settings_values, expected_min, expected_max",
"settings_values, expected_min, openai_api_key, expected_max",
[
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
os.environ.get("OPENAI_API_KEY"),
0.0,
1.0,
),
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
None,
None,
None,
),
        # TODO: add more test cases
],
)
@pytest.mark.asyncio
async def test_rag_faithfulness_evaluator(settings_values, expected_min, expected_max):
async def test_rag_faithfulness_evaluator(
    settings_values, openai_api_key, expected_min, expected_max
):
result = await rag_faithfulness(
{},
simple_rag_trace,
{},
{},
settings_values,
{"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
{"OPENAI_API_KEY": openai_api_key},
)

assert expected_min <= round(result.value, 1) <= expected_max
    try:
        assert expected_min <= round(result.value, 1) <= expected_max
    except TypeError:
        # Without an OpenAI API key the evaluator returns an error result, so
        # result.value is not numeric and round() raises TypeError.
        assert not isinstance(result.value, (float, int))


@pytest.mark.parametrize(
"settings_values, expected_min, expected_max",
"settings_values, expected_min, openai_api_key, expected_max",
[
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
os.environ.get("OPENAI_API_KEY"),
0.0,
1.0,
),
(
{
"question_key": "rag.retriever.internals.prompt",
"answer_key": "rag.reporter.outputs.report",
"contexts_key": "rag.retriever.outputs.movies",
},
None,
None,
None,
),
        # TODO: add more test cases
],
)
@pytest.mark.asyncio
async def test_rag_context_relevancy_evaluator(
settings_values, expected_min, expected_max
    settings_values, openai_api_key, expected_min, expected_max
):
result = await rag_context_relevancy(
{},
simple_rag_trace,
{},
{},
settings_values,
{"OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY")},
{"OPENAI_API_KEY": openai_api_key},
)

try:
