Moved to extract_answer from #148 and back to gpt-4o-mini (#161)
jamesbraza authored Dec 18, 2024
1 parent c705773 commit afa6961
Showing 18 changed files with 803 additions and 320 deletions.
2 changes: 2 additions & 0 deletions src/aviary/core.py
@@ -40,6 +40,7 @@
EvalAnswerMode,
encode_image_to_base64,
eval_answer,
extract_answer,
is_coroutine_callable,
partial_format,
)
@@ -82,6 +83,7 @@
"encode_image_to_base64",
"eval_answer",
"eval_answer",
"extract_answer",
"fenv",
"is_coroutine_callable",
"join",
105 changes: 57 additions & 48 deletions src/aviary/utils.py
@@ -3,10 +3,9 @@
import inspect
import io
import random
import re
import string
from ast import literal_eval
from collections.abc import Awaitable, Callable, Sequence
from collections.abc import Sequence
from enum import StrEnum
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Self, cast

@@ -21,8 +20,8 @@
import numpy as np


DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
LLM_BOOL_EVAL_CONFIG = {
DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini"
LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
"prompt": (
"Here is a question, the correct answer to the question, and a proposed answer"
" to the question. Please tell me if the proposed answer is correct, given the"
@@ -35,6 +34,18 @@
"temperature": 0,
}

LLM_EXTRACT_CONFIG = LLM_BOOL_EVAL_CONFIG | {
"prompt": (
"You are evaluating answers for a test which has fixed options. "
"Repeat back which option the proposed answer matches. "
"GIVE ONLY THE VERBATIM TEXT OF A FIXED OPTION. "
"If the proposed answer is empty, invalid, or ambiguous, "
"return an empty string."
"\n\nOptions:\n{options}"
"\n\nProposed answer: {proposed_answer}"
)
}
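# Illustrative sketch (not part of this commit): LLM_EXTRACT_CONFIG inherits
# "model" and "temperature" from LLM_BOOL_EVAL_CONFIG via dict union and only
# overrides "prompt" (the "model" key is assumed to sit in the elided context
# above). The option and answer strings below are hypothetical.
_example_prompt = LLM_EXTRACT_CONFIG["prompt"].format(
    options="\n".join(["-84", "cheesecake", "42"]),
    proposed_answer="forty-two",
)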

LLM_SCORE_EVAL_CONFIG = LLM_BOOL_EVAL_CONFIG | {
"prompt": (
"Here is a question, the correct answer to the question, and a rubric for"
@@ -175,21 +186,36 @@ async def eval_answer(
raise RuntimeError(f"Invalid evaluation mode: {eval_mode}")


async def extract_answer(
proposed_answer: str, options: Sequence[str], llm_eval_config: dict | None = None
) -> str | None:
"""Extract the answer matching a proposal from a list of options using an LLM."""
for option in options:
if proposed_answer.strip().casefold() == option.strip().casefold():
return option

default_config = LLM_EXTRACT_CONFIG
config = llm_eval_config or default_config
response_msg = await run_prompt(
prompt=config.get("prompt", default_config["prompt"]).format(
options="\n".join(options),
proposed_answer=proposed_answer,
),
model=config.get("model", default_config["model"]),
temperature=config.get("temperature", default_config["temperature"]),
)
answer = response_msg.strip().casefold() # noqa: FURB184
for option in options:
if answer == option.strip().casefold():
return option
return None
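# Illustrative usage sketch (not part of this commit); the option and answer
# strings are hypothetical, and the second call needs LLM access.
async def _extract_answer_demo() -> None:
    options = ["-84", "cheesecake", "42"]
    # An exact case-insensitive match short-circuits without an LLM call.
    assert await extract_answer("42", options) == "42"
    # Free-form answers fall through to the configured model, which returns
    # the verbatim option text, or None if nothing matches.
    print(await extract_answer("forty-two", options))
# To run: asyncio.run(_extract_answer_demo())  # requires API credentials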


_CAPITAL_A_INDEX = ord("A")


class MultipleChoiceQuestion(BaseModel):
QUESTION_PROMPT_TEMPLATE: ClassVar[str] = "Q: {question}\n\nOptions:\n{options}"
# TODO: combine with above eval_answer and its prompts
EVALUATION_PROMPT_TEMPLATE: ClassVar[str] = (
"Given the following question and a proposed answer to the question, return the"
" single-letter choice in the question that matches the proposed answer."
" If the proposed answer is blank or an empty string,"
" or multiple options are matched, respond with '0'."
"\n\nQuestion: {qa_prompt}"
"\n\nProposed Answer: {qa_answer}"
"\n\nSingle Letter Answer:"
)
DEFAULT_UNSURE_OPTION: ClassVar[str] = (
"Insufficient information to answer this question"
)
@@ -280,18 +306,14 @@ def split_options(options: str) -> list[str]:
return split_options

async def grade(
self, answer: str, prompt_runner: Callable[[str], Awaitable[str]] | None = None
) -> "tuple[MultipleChoiceEvaluation, str, str]":
if prompt_runner is None:
prompt_runner = run_prompt
eval_prompt = self.EVALUATION_PROMPT_TEMPLATE.format(
qa_prompt=self.question_prompt, qa_answer=answer
)
raw_evaluation = await prompt_runner(eval_prompt)
evaluation, parsed_answer = MultipleChoiceEvaluation.from_answer(
raw_evaluation, self
self, proposed_answer: str
) -> "tuple[MultipleChoiceEvaluation, str | None]":
extracted_answer = await extract_answer(
proposed_answer=proposed_answer, options=self.options
)
return evaluation, raw_evaluation, parsed_answer
return MultipleChoiceEvaluation.from_answer(
extracted_answer, self
), extracted_answer


class MultipleChoiceEvaluation(StrEnum):
@@ -323,32 +345,19 @@ def calculate_accuracy_precision(

@classmethod
def from_answer(
cls, answer: str, question: MultipleChoiceQuestion
) -> "tuple[MultipleChoiceEvaluation, str]":
cls, extracted_answer: str | None, question: MultipleChoiceQuestion
) -> "MultipleChoiceEvaluation":
"""Make an evaluation from the input answer and multiple choice question.
Returns:
Two-tuple of answer enum and the raw answer extracted from the input answer.
Evaluation corresponding to the parsed answer.
"""
# SEE: https://regex101.com/r/vcE9Hb/1
letter_search = re.search(r"([A-Z])\)?", answer, re.DOTALL)
# Get the letter answer, or fail over to the first non-whitespace char
answer_char = (
letter_search.group(1)
if letter_search is not None
else answer.split()[0][0].upper()
)
answer_letter_index = ord(answer_char[0]) - _CAPITAL_A_INDEX
if answer_letter_index < 0 or answer_letter_index > len(question.options):
# The result extracted was not in the options (e.g. '0')
return cls.INCORRECT, answer_char
if extracted_answer is None:
return MultipleChoiceEvaluation.INCORRECT
# From here, if we don't match either the ideal or the unsure multiple choice
# options then we declare the answer as incorrect.
if (
question.unsure_answer_index is not None
and answer_letter_index == question.unsure_answer_index
):
return cls.UNSURE, cast(str, question.unsure_answer)
if answer_letter_index == question.ideal_answer_index:
return cls.CORRECT, question.ideal_answer
return cls.INCORRECT, question.options[answer_letter_index]
if extracted_answer == question.ideal_answer:
return MultipleChoiceEvaluation.CORRECT
if question.unsure_answer and extracted_answer == question.unsure_answer:
return MultipleChoiceEvaluation.UNSURE
return MultipleChoiceEvaluation.INCORRECT
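# Illustrative sketch (not part of this commit) of the reworked grading flow:
# `grade` now delegates option matching to `extract_answer` and returns a
# two-tuple of the evaluation and the extracted option (or None), replacing
# the previous single-letter regex parsing. The field names passed to
# MultipleChoiceQuestion below are assumptions based on how the class is used
# here, and running this requires LLM access.
async def _grade_demo() -> None:
    mcq = MultipleChoiceQuestion(
        question="What is the meaning of life?",
        options=["-84", "cheesecake", "11", "42"],
        ideal_answer="42",
    )
    evaluation, extracted = await mcq.grade("it is 42")
    # CORRECT when the extracted option equals ideal_answer, UNSURE when it
    # equals the unsure option (if configured), otherwise INCORRECT.
    print(evaluation, extracted)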
@@ -1,13 +1,12 @@
interactions:
- request:
body:
'{"messages": [{"content": "Given the following question and a proposed
answer to the question, return the single-letter choice in the question that
matches the proposed answer. If the proposed answer is blank or an empty string,
or multiple options are matched, respond with ''0''.\n\nQuestion: Q: What is
the meaning of life?\n\nOptions:\nA) -84\nB) Insufficient information to answer
this question\nC) cheesecake\nD) 11\nE) 42\n\nProposed Answer: 14\n\nSingle
Letter Answer:", "role": "user"}], "model": "gpt-4o"}'
'{"messages": [{"content": "You are evaluating answers for a test which
has fixed options. Repeat back which option the proposed answer matches. GIVE
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role":
"user"}], "model": "gpt-4o-mini", "temperature": 0}'
headers:
accept:
- application/json
@@ -16,7 +15,7 @@ connection:
connection:
- keep-alive
content-length:
- "513"
- "442"
content-type:
- application/json
host:
@@ -36,7 +35,7 @@
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
- "0"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
@@ -46,28 +45,34 @@
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jJJLa8MwEITv/hVC57goqamd3HpooPQBORVSilGktaNW1gpJoY+Q/15k
u7FDU+jFh/l2xrNr7xNCqJJ0QajY8iAaq9Prav24NO7rNmMKH55ulvd8VaxXHu/ejaOT6MDNK4jw
47oQ2FgNQaHpsHDAA8TUaX6ZZTnLi3kLGpSgo622Ic0wnbFZlrIiZVe9cYtKgKcL8pwQQsi+fcaK
RsIHXRA2+VEa8J7XQBfHIUKoQx0Vyr1XPnAT6GSAAk0A07ZmY91BtfM81jI7rXv9cHyRxto63Pie
H/VKGeW3pQPu0cRQH9DSlh4SQl7ahXYnHal12NhQBnwDEwOnrOjy6HDCEe1ZwMD12DSfnIkrJQSu
tB9dhAoutiAH63A+vpMKRyAZLf27zLnsbnFl6v/ED0AIsAFkaR1IJU4XHsYcxB/sr7HjkdvC1H/6
AE1ZKVODs05137iyJc/nspBcTCuaHJJvAAAA//8DAGY5XevsAgAA
H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da
tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX
wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr
+ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2
EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI
XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH
wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f39fde1cf88cf1b-SJC
- 8f425bb2ac70f953-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Tue, 17 Dec 2024 21:26:29 GMT
- Wed, 18 Dec 2024 21:48:38 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ;
path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
@@ -79,25 +84,25 @@
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "363"
- "144"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
- "30000"
x-ratelimit-limit-tokens:
- "30000000"
- "150000000"
x-ratelimit-remaining-requests:
- "9999"
- "29999"
x-ratelimit-remaining-tokens:
- "29999874"
- "149999896"
x-ratelimit-reset-requests:
- 6ms
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_aff8daa48aa43d3df077f97da6136e5a
- req_503cd8163bd0d3b634eb723d6874b1da
status:
code: 200
message: OK