Skip to content

Commit

Permalink
Moved back to gpt-4o-mini
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbraza committed Dec 18, 2024
1 parent d018adf commit 90dbcf4
Show file tree
Hide file tree
Showing 16 changed files with 320 additions and 295 deletions.
2 changes: 1 addition & 1 deletion src/aviary/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import numpy as np


DEFAULT_EVAL_MODEL_NAME = "gpt-4o"
DEFAULT_EVAL_MODEL_NAME = "gpt-4o-mini"
LLM_BOOL_EVAL_CONFIG: dict[str, Any] = {
"prompt": (
"Here is a question, the correct answer to the question, and a proposed answer"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ interactions:
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
to answer this question\ncheesecake\n11\n42\n\nProposed answer: 14", "role":
"user"}], "model": "gpt-4o", "temperature": 0}'
"user"}], "model": "gpt-4o-mini", "temperature": 0}'
headers:
accept:
- application/json
Expand All @@ -15,7 +15,7 @@ interactions:
connection:
- keep-alive
content-length:
- "437"
- "442"
content-type:
- application/json
host:
Expand All @@ -35,7 +35,7 @@ interactions:
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
- "0"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
Expand All @@ -45,28 +45,34 @@ interactions:
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xSy07DMBC85yusPTcobVJIe6u4wKESIHFCKHKdTWpwvJa9FaCq/46cvtUicfFh
Zmc8s/Y6EQJ0DVMBailZdc6ks+Zrko2cxvljMXvunmavPHkJTA9zk9/DICpo8YGK96obRZ0zyJrs
llYeJWN0Hd7lxXhc5sOyJzqq0URZ6zgtKB1loyLNyjS73QmXpBUGmIq3RAgh1v0ZI9oav2EqssEe
6TAE2SJMD0NCgCcTEZAh6MDSMgyOpCLLaPvUp7DHZhVkTGVXxuzwzeEeQ63ztAg7/oA32uqwrDzK
QDZ6BiYHPbtJhHjv+6zOIoLz1DmumD7RRsNyvLWD4wKP5K4qMLE0VzRnZlWNLLUJJ+sAJdUS6wtD
IUCuak0nRHJS+TLLNe9tbW3b/9gfCaXQMdaV81hrdbVvbx5/119jhxX3gSH8BMauarRt0Tuvtw/c
uGrSyIWcNFleQrJJfgEAAP//AwA5BypS6QIAAA==
H4sIAAAAAAAAAwAAAP//jFLLTsMwELznK6w9Nyh9P24V0AMHBBK9gFDk2pvU4NiWveVV9d+R00da
tUhcfJjZGc+svU4YAyVhwkAsOYnK6XRafF7fzuWPeLwZP4v7p5maTwd3GD6yB5pBKyrs4g0F7VVX
wlZOIylrtrTwyAmja3vY7fX7o357VBOVlaijrHSU9mxaKaPSTtbppdkwbY926qVVAgNM2EvCGGPr
+ow5jcQvmLCstUcqDIGXCJPDEGPgrY4I8BBUIG4IWg0prCE0dfRj2GOxCjxGMyutd/jmcI+2pfN2
EXb8AS+UUWGZe+TBmugZyDqo2U3C2GvdZ3USEZy3laOc7DuaaDjqb+2g2WJD7qoCWeL6gubELJdI
XOlwtA4QXCxRnhkyBnwllT0ikqPK51kueW9rK1P+x74hhEBHKHPnUSpxsW9tHr/YX2OHFdeBIXwH
wiovlCnRO6+2D1y4vDvmvUyMBzyDZJP8AgAA//8DADaBBszuAgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f4256d28f9615ff-SJC
- 8f425bb2ac70f953-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:45:18 GMT
- Wed, 18 Dec 2024 21:48:38 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=Z3Wkkk2LQA2GKAPZVirKPYLTJfmm9Luttv26RxPBKro-1734558518-1.0.1.1-4BZR47qupd.QCWRMrfyj_F2lS0fqBEuzxwPZTqYPUxSKwdzL4S_8YWk9ofOPXhFEnkMN6nwgWjBLjAR4nioxiQ;
path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=B7CeJKL1WXveU2pmeUGy_AFjPsbf25SvdiSN_4fxTXE-1734558518441-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
Expand All @@ -78,25 +84,25 @@ interactions:
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "208"
- "144"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
- "30000"
x-ratelimit-limit-tokens:
- "30000000"
- "150000000"
x-ratelimit-remaining-requests:
- "9999"
- "29999"
x-ratelimit-remaining-tokens:
- "29999896"
- "149999896"
x-ratelimit-reset-requests:
- 6ms
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_7db1d1f6dded4679e43cc12a2183fa21
- req_503cd8163bd0d3b634eb723d6874b1da
status:
code: 200
message: OK
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ interactions:
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: the answer
is 14004", "role": "user"}], "model": "gpt-4o", "temperature": 0}'
is 14004", "role": "user"}], "model": "gpt-4o-mini", "temperature": 0}'
headers:
accept:
- application/json
Expand All @@ -15,7 +15,7 @@ interactions:
connection:
- keep-alive
content-length:
- "459"
- "464"
content-type:
- application/json
host:
Expand All @@ -35,7 +35,7 @@ interactions:
x-stainless-raw-response:
- "true"
x-stainless-retry-count:
- "1"
- "0"
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
Expand All @@ -45,28 +45,34 @@ interactions:
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jJLNTsMwEITveQprzw1K/6DNDakIwYkLqgRFkeNsEhfHNrarUlV9d2S3
TVJRJC4+7Lcznl17HxECvICUAKupY40W8X25ndktPqzrRb37envdVk/LxfKRvkx4/gwDr1D5Gpk7
q26YarRAx5U8YmaQOvSuw7vxZDqdjYdJAI0qUHhZpV08UfEoGU3iZBYntydhrThDCyl5jwghZB9O
H1EW+A0pCTah0qC1tEJI2yZCwCjhK0Ct5dZR6WDQQaakQxlSr2AFfWSw3Fjqk8mNEKf6ob1LqEob
ldsTb+sll9zWmUFqlfS+1ikNgR4iQj7CTJuLmKCNarTLnPpE6Q3nw6MddEvs4Jk55ajoaUaDK2ZZ
gY5yYXsrAUZZjUWn7PZHNwVXPRD1Rv6d5Zr3cWwuq//Yd4Ax1A6LTBssOLuct2sz6H/YX23tikNg
sDvrsMlKLis02vDjI5c6m5c0p/MyGc8gOkQ/AAAA//8DAEsANTftAgAA
H4sIAAAAAAAAAwAAAP//jFLLbsIwELznK6w9kyqB8MoNoR7aAxdKVamqImNvElPHtmyjPhD/XjlQ
AoJKvfgwszOeWXsXEQKCQ06A1dSzxsh4Vn7M71fZMntcjAbfz6uXjTSbh9mCPk2Xc+gFhV5vkPlf
1R3TjZHohVYHmlmkHoNrOh5kw+FkmE5aotEcZZBVxseZjhuhRNxP+lmcjON0clTXWjB0kJPXiBBC
du0ZciqOn5CTpPeLNOgcrRDy0xAhYLUMCFDnhPNUeeh1JNPKo2qjn8MWy62jIZraSnnE96d7pK6M
1Wt35E94KZRwdWGROq2Cp/PaQMvuI0Le2j7bi4hgrG6ML7x+RxUMp+nBDrotduSxKnjtqbyhuTAr
OHoqpDtbBzDKauRXhoQA3XKhz4jorPJ1llveh9pCVf+x7wjG0HjkhbHIBbvZtzUPX+yvsdOK28Dg
vpzHpiiFqtAaKw4PXJpiVLI0wTTBNUT76AcAAP//AwBkI2np7gIAAA==
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f42569f7d2a2379-SJC
- 8f425bb11b702519-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:45:10 GMT
- Wed, 18 Dec 2024 21:48:38 GMT
Server:
- cloudflare
Set-Cookie:
- __cf_bm=6j4w6Jnsg0wGsZf61WcNCvHdr1Vcb6uVLFFhTQQgcv4-1734558518-1.0.1.1-D0vsT8nCM66xiA.Xa6ijXpgeGPM65Iux2KhQqUiD8wToq.VmwT03dnkmELw1qn0GvHJvh8g7H6WkqYzXVgs2Xg;
path=/; expires=Wed, 18-Dec-24 22:18:38 GMT; domain=.api.openai.com; HttpOnly;
Secure; SameSite=None
- _cfuvid=LFVOxysXKxTPNQ2KK05aqbBnIRDPc45hskCPkFcOjXA-1734558518178-0.0.1.1-604800000;
path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None
Transfer-Encoding:
- chunked
X-Content-Type-Options:
Expand All @@ -78,25 +84,25 @@ interactions:
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "240"
- "131"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
- "30000"
x-ratelimit-limit-tokens:
- "30000000"
- "150000000"
x-ratelimit-remaining-requests:
- "9999"
- "29999"
x-ratelimit-remaining-tokens:
- "29999890"
- "149999890"
x-ratelimit-reset-requests:
- 6ms
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_363f6da2908247ad8c711b11d1593ae7
- req_12c5e1cdb8b2ba32b075f04f20194421
status:
code: 200
message: OK
Expand Down
40 changes: 20 additions & 20 deletions tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer1].yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ interactions:
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\n-8\nInsufficient information
to answer this question\ncheesecake\n94106\n94107\n\nProposed answer: ", "role":
"user"}], "model": "gpt-4o", "temperature": 0}'
"user"}], "model": "gpt-4o-mini", "temperature": 0}'
headers:
accept:
- application/json
Expand All @@ -15,7 +15,7 @@ interactions:
connection:
- keep-alive
content-length:
- "440"
- "445"
content-type:
- application/json
host:
Expand Down Expand Up @@ -45,26 +45,24 @@ interactions:
response:
body:
string: !!binary |
H4sIAAAAAAAAA4xSy27CMBC85yusPZMqkAApt6rHSr0U9VJVkbE3wa3jtWyj8hD/XjlAAEGlXnyY
2RnPrL1LGAMlYcZALHkQrdXpU/1TbhfrIn+j+Vxs281Cv7/k9JrN1/gMg6igxReKcFI9CGqtxqDI
HGjhkAeMrsNpXozHZT6cdkRLEnWUNTakBaWjbFSkWZlmk6NwSUqghxn7SBhjbNedMaKRuIYZywYn
pEXveYMw64cYA0c6IsC9Vz5wE2BwJgWZgKZLfQk7rFeex1RmpfUR3/f3aGqso4U/8j1eK6P8snLI
PZno6QNZ6Nh9wthn12d1FRGso9aGKtA3mmhYTg52cF7gmTxWhUCB6zuaK7NKYuBK+4t1gOBiifLG
kDHgK6nogkguKt9mued9qK1M8x/7MyEE2oCysg6lEnf7dubxd/011q+4Cwx+4wO2Va1Mg846dXjg
2lZ8+ihLycWwhmSf/AIAAP//AwBsWlME6QIAAA==
H4sIAAAAAAAAAwAAAP//jFJdS8MwFH3vrwj3eZV2X46+6RAR0T2JikjJkts2miYhSVEZ+++Srms3
NsGXPJxzz8k5N9lEhIDgkBFgFfWsNjK+Kr6WN6vb8XL+9LqSxj3Qu2R1ff/8yCfiBUZBodcfyPxe
dcF0bSR6odWOZhapx+CaXk6ms9lili5aotYcZZCVxsdTHddCiXicjKdxchmni05dacHQQUbeIkII
2bRnyKk4fkNGktEeqdE5WiJk/RAhYLUMCFDnhPNUeRgNJNPKo2qjH8IWi8bREE01Unb4tr9H6tJY
vXYd3+OFUMJVuUXqtAqezmsDLbuNCHlv+zRHEcFYXRufe/2JKhgu5js7GLY4kF1V8NpTeUZzZJZz
9FRId7AOYJRVyE8MCQHacKEPiOig8mmWc9672kKV/7EfCMbQeOS5scgFO9u3NQ9f7K+xfsVtYHA/
zmOdF0KVaI0VuwcuTD4vWJpgmuAaom30CwAA//8DAL0A1qzuAgAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f4256cce931eb29-SJC
- 8f425bb5de5996de-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:45:17 GMT
- Wed, 18 Dec 2024 21:48:39 GMT
Server:
- cloudflare
Transfer-Encoding:
Expand All @@ -75,28 +73,30 @@ interactions:
- X-Request-ID
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "217"
- "233"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
- "30000"
x-ratelimit-limit-tokens:
- "30000000"
- "150000000"
x-ratelimit-remaining-requests:
- "9999"
- "29999"
x-ratelimit-remaining-tokens:
- "29999895"
- "149999896"
x-ratelimit-reset-requests:
- 6ms
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_520872e529ccbb680d27a3729fbe637e
- req_0c845e0049332bd1fa73fdbe76005ea1
status:
code: 200
message: OK
Expand Down
36 changes: 18 additions & 18 deletions tests/cassettes/TestLitQAEvaluation.test_grade[empty-answer2].yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ interactions:
ONLY THE VERBATIM TEXT OF A FIXED OPTION. If the proposed answer is empty, invalid,
or ambiguous, return an empty string.\n\nOptions:\n-84\nInsufficient information
to answer this question\ncheesecake\n11\n42\n\nProposed answer: ", "role": "user"}],
"model": "gpt-4o", "temperature": 0}'
"model": "gpt-4o-mini", "temperature": 0}'
headers:
accept:
- application/json
Expand All @@ -15,7 +15,7 @@ interactions:
connection:
- keep-alive
content-length:
- "435"
- "440"
content-type:
- application/json
host:
Expand Down Expand Up @@ -45,26 +45,26 @@ interactions:
response:
body:
string: !!binary |
H4sIAAAAAAAAAwAAAP//jFKxbsIwFNzzFdabSRUg0CRbh7ZDB9Shqqqqioz9Etw6tmU7AoT498oB
QhBU6uLh7t357tm7iBAQHAoCbEU9a4yMH6p1Pp5/uKf59rF5b95e08X4hanFZv3sMhgFhV5+I/Mn
1R3TjZHohVYHmlmkHoPr+H6azmbZdJx3RKM5yiCrjY9THU+SSRonWZzMj8KVFgwdFOQzIoSQXXeG
iIrjBgqSjE5Ig87RGqHohwgBq2VAgDonnKfKw+hMMq08qi71ELZYtY6GVKqV8ojv+3ukro3VS3fk
e7wSSrhVaZE6rYKn89pAx+4jQr66Pu1FRDBWN8aXXv+gCoZZerCD8wLP5LEqeO2pvKG5MCs5eiqk
G6wDGGUr5FeGhABtudADIhpUvs5yy/tQW6j6P/ZngjE0HnlpLHLBbvbtzMPv+musX3EXGNzWeWzK
SqgarbHi8MCVKfOKLmleJdMMon30CwAA//8DAFd1apnpAgAA
H4sIAAAAAAAAAwAAAP//jFJda8IwFH3vrwj32Q6tLTrfhjB8FhyyMUpMbttomoQk3Qfifx+ptXXo
YC95OOeek3NucowIAcFhQYBV1LPayPip+Fw+rz5e6221zta4fKENiu1qszkk2z2MgkLv9sj8RfXA
dG0keqHVmWYWqcfgOplN0yybZ5PHlqg1RxlkpfFxquNaKBEn4ySNx7N4Mu/UlRYMHSzIW0QIIcf2
DDkVxy9YkPHogtToHC0RFv0QIWC1DAhQ54TzVHkYDSTTyqNqo1/DFovG0RBNNVJ2+Km/R+rSWL1z
Hd/jhVDCVblF6rQKns5rAy17igh5b/s0vyKCsbo2Pvf6gCoYztOzHQxbHMiuKnjtqbyj+WWWc/RU
SHe1DmCUVchvDAkB2nChr4joqvJtlnve59pClf+xHwjG0HjkubHIBbvbtzUPX+yvsX7FbWBw385j
nRdClWiNFecHLkzOxwnPppNdOoPoFP0AAAD//wMAMCnsc+4CAAA=
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 8f4256d7fd75cf0d-SJC
- 8f425bb72f9b67dc-SJC
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Wed, 18 Dec 2024 21:45:19 GMT
- Wed, 18 Dec 2024 21:48:39 GMT
Server:
- cloudflare
Transfer-Encoding:
Expand All @@ -78,25 +78,25 @@ interactions:
openai-organization:
- future-house-xr4tdh
openai-processing-ms:
- "520"
- "532"
openai-version:
- "2020-10-01"
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-ratelimit-limit-requests:
- "10000"
- "30000"
x-ratelimit-limit-tokens:
- "30000000"
- "150000000"
x-ratelimit-remaining-requests:
- "9999"
- "29999"
x-ratelimit-remaining-tokens:
- "29999896"
- "149999896"
x-ratelimit-reset-requests:
- 6ms
- 2ms
x-ratelimit-reset-tokens:
- 0s
x-request-id:
- req_f943511f12de0306ff59cccd017e98f1
- req_ed9d0e7998f792094d5aefe723693f28
status:
code: 200
message: OK
Expand Down
Loading

0 comments on commit 90dbcf4

Please sign in to comment.