Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: evaluate openai models on the remaining MTEB(Medical) tasks #71

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions results/openai__text-embedding-3-large/2/CMedQAv2-reranking.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"dataset_revision": "23d186750531a14a0357ca22cd92d712fd512ea0",
"task_name": "CMedQAv2-reranking",
"mteb_version": "1.24.0",
"scores": {
"test": [
{
"map": 0.751224,
"mrr": 0.788045,
"nAUC_map_max": 0.532528,
"nAUC_map_std": 0.170824,
"nAUC_map_diff1": 0.506192,
"nAUC_mrr_max": 0.619113,
"nAUC_mrr_std": 0.232415,
"nAUC_mrr_diff1": 0.582075,
"main_score": 0.751224,
"hf_subset": "default",
"languages": [
"cmn-Hans"
]
}
]
},
"evaluation_time": 1884.4257838726044,
"kg_co2_emissions": 0.00044117606664305007
}
158 changes: 158 additions & 0 deletions results/openai__text-embedding-3-large/2/CmedqaRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
{
"dataset_revision": "cd540c506dae1cf9e9a59c3e06f42030d54e7301",
"task_name": "CmedqaRetrieval",
"mteb_version": "1.24.0",
"scores": {
"dev": [
{
"ndcg_at_1": 0.23806,
"ndcg_at_3": 0.24019,
"ndcg_at_5": 0.25216,
"ndcg_at_10": 0.27435,
"ndcg_at_20": 0.29721,
"ndcg_at_100": 0.34303,
"ndcg_at_1000": 0.38144,
"map_at_1": 0.14967,
"map_at_3": 0.20044,
"map_at_5": 0.21298,
"map_at_10": 0.22494,
"map_at_20": 0.2325,
"map_at_100": 0.2402,
"map_at_1000": 0.24198,
"recall_at_1": 0.14967,
"recall_at_3": 0.24011,
"recall_at_5": 0.28091,
"recall_at_10": 0.34567,
"recall_at_20": 0.42386,
"recall_at_100": 0.63685,
"recall_at_1000": 0.90421,
"precision_at_1": 0.23806,
"precision_at_3": 0.1382,
"precision_at_5": 0.09947,
"precision_at_10": 0.06294,
"precision_at_20": 0.039,
"precision_at_100": 0.01199,
"precision_at_1000": 0.0017,
"mrr_at_1": 0.23806,
"mrr_at_3": 0.281279,
"mrr_at_5": 0.291931,
"mrr_at_10": 0.301517,
"mrr_at_20": 0.306952,
"mrr_at_100": 0.311724,
"mrr_at_1000": 0.312605,
"nauc_ndcg_at_1_max": 0.385752,
"nauc_ndcg_at_1_std": -0.040464,
"nauc_ndcg_at_1_diff1": 0.52806,
"nauc_ndcg_at_3_max": 0.34744,
"nauc_ndcg_at_3_std": -0.040927,
"nauc_ndcg_at_3_diff1": 0.446717,
"nauc_ndcg_at_5_max": 0.338609,
"nauc_ndcg_at_5_std": -0.045501,
"nauc_ndcg_at_5_diff1": 0.438516,
"nauc_ndcg_at_10_max": 0.339566,
"nauc_ndcg_at_10_std": -0.043432,
"nauc_ndcg_at_10_diff1": 0.429259,
"nauc_ndcg_at_20_max": 0.343991,
"nauc_ndcg_at_20_std": -0.026292,
"nauc_ndcg_at_20_diff1": 0.421478,
"nauc_ndcg_at_100_max": 0.347669,
"nauc_ndcg_at_100_std": 0.000656,
"nauc_ndcg_at_100_diff1": 0.412529,
"nauc_ndcg_at_1000_max": 0.356152,
"nauc_ndcg_at_1000_std": 0.004637,
"nauc_ndcg_at_1000_diff1": 0.422442,
"nauc_map_at_1_max": 0.301477,
"nauc_map_at_1_std": -0.072747,
"nauc_map_at_1_diff1": 0.493146,
"nauc_map_at_3_max": 0.326267,
"nauc_map_at_3_std": -0.060284,
"nauc_map_at_3_diff1": 0.454711,
"nauc_map_at_5_max": 0.330391,
"nauc_map_at_5_std": -0.058726,
"nauc_map_at_5_diff1": 0.449565,
"nauc_map_at_10_max": 0.336287,
"nauc_map_at_10_std": -0.055241,
"nauc_map_at_10_diff1": 0.443792,
"nauc_map_at_20_max": 0.339646,
"nauc_map_at_20_std": -0.049295,
"nauc_map_at_20_diff1": 0.441548,
"nauc_map_at_100_max": 0.341537,
"nauc_map_at_100_std": -0.043372,
"nauc_map_at_100_diff1": 0.440142,
"nauc_map_at_1000_max": 0.342049,
"nauc_map_at_1000_std": -0.042582,
"nauc_map_at_1000_diff1": 0.440503,
"nauc_recall_at_1_max": 0.301477,
"nauc_recall_at_1_std": -0.072747,
"nauc_recall_at_1_diff1": 0.493146,
"nauc_recall_at_3_max": 0.291041,
"nauc_recall_at_3_std": -0.053511,
"nauc_recall_at_3_diff1": 0.393961,
"nauc_recall_at_5_max": 0.283306,
"nauc_recall_at_5_std": -0.048442,
"nauc_recall_at_5_diff1": 0.364103,
"nauc_recall_at_10_max": 0.284127,
"nauc_recall_at_10_std": -0.03875,
"nauc_recall_at_10_diff1": 0.334214,
"nauc_recall_at_20_max": 0.290039,
"nauc_recall_at_20_std": 0.015398,
"nauc_recall_at_20_diff1": 0.30011,
"nauc_recall_at_100_max": 0.286998,
"nauc_recall_at_100_std": 0.132302,
"nauc_recall_at_100_diff1": 0.233188,
"nauc_recall_at_1000_max": 0.416109,
"nauc_recall_at_1000_std": 0.408188,
"nauc_recall_at_1000_diff1": 0.204341,
"nauc_precision_at_1_max": 0.385752,
"nauc_precision_at_1_std": -0.040464,
"nauc_precision_at_1_diff1": 0.52806,
"nauc_precision_at_3_max": 0.386608,
"nauc_precision_at_3_std": -0.00421,
"nauc_precision_at_3_diff1": 0.386592,
"nauc_precision_at_5_max": 0.370232,
"nauc_precision_at_5_std": 0.001644,
"nauc_precision_at_5_diff1": 0.350039,
"nauc_precision_at_10_max": 0.366455,
"nauc_precision_at_10_std": 0.029112,
"nauc_precision_at_10_diff1": 0.297162,
"nauc_precision_at_20_max": 0.350007,
"nauc_precision_at_20_std": 0.070674,
"nauc_precision_at_20_diff1": 0.254044,
"nauc_precision_at_100_max": 0.27252,
"nauc_precision_at_100_std": 0.160222,
"nauc_precision_at_100_diff1": 0.148347,
"nauc_precision_at_1000_max": 0.198375,
"nauc_precision_at_1000_std": 0.16985,
"nauc_precision_at_1000_diff1": 0.065617,
"nauc_mrr_at_1_max": 0.385752,
"nauc_mrr_at_1_std": -0.040464,
"nauc_mrr_at_1_diff1": 0.52806,
"nauc_mrr_at_3_max": 0.366945,
"nauc_mrr_at_3_std": -0.029993,
"nauc_mrr_at_3_diff1": 0.484221,
"nauc_mrr_at_5_max": 0.365061,
"nauc_mrr_at_5_std": -0.028935,
"nauc_mrr_at_5_diff1": 0.478225,
"nauc_mrr_at_10_max": 0.364478,
"nauc_mrr_at_10_std": -0.027304,
"nauc_mrr_at_10_diff1": 0.473609,
"nauc_mrr_at_20_max": 0.364233,
"nauc_mrr_at_20_std": -0.023302,
"nauc_mrr_at_20_diff1": 0.471052,
"nauc_mrr_at_100_max": 0.364592,
"nauc_mrr_at_100_std": -0.021667,
"nauc_mrr_at_100_diff1": 0.470421,
"nauc_mrr_at_1000_max": 0.364768,
"nauc_mrr_at_1000_std": -0.021747,
"nauc_mrr_at_1000_diff1": 0.470786,
"main_score": 0.27435,
"hf_subset": "default",
"languages": [
"cmn-Hans"
]
}
]
},
"evaluation_time": 2936.1834087371826,
"kg_co2_emissions": 0.0005700112053268032
}
158 changes: 158 additions & 0 deletions results/openai__text-embedding-3-large/2/MedicalQARetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
{
"dataset_revision": "ae763399273d8b20506b80cf6f6f9a31a6a2b238",
"task_name": "MedicalQARetrieval",
"mteb_version": "1.24.0",
"scores": {
"test": [
{
"ndcg_at_1": 0.66406,
"ndcg_at_3": 0.76001,
"ndcg_at_5": 0.78057,
"ndcg_at_10": 0.79657,
"ndcg_at_20": 0.804,
"ndcg_at_100": 0.80889,
"ndcg_at_1000": 0.81129,
"map_at_1": 0.66406,
"map_at_3": 0.73706,
"map_at_5": 0.74861,
"map_at_10": 0.75525,
"map_at_20": 0.75736,
"map_at_100": 0.75807,
"map_at_1000": 0.75816,
"recall_at_1": 0.66406,
"recall_at_3": 0.82617,
"recall_at_5": 0.87549,
"recall_at_10": 0.9248,
"recall_at_20": 0.95361,
"recall_at_100": 0.97949,
"recall_at_1000": 0.99854,
"precision_at_1": 0.66406,
"precision_at_3": 0.27539,
"precision_at_5": 0.1751,
"precision_at_10": 0.09248,
"precision_at_20": 0.04768,
"precision_at_100": 0.00979,
"precision_at_1000": 0.001,
"mrr_at_1": 0.664062,
"mrr_at_3": 0.736735,
"mrr_at_5": 0.748161,
"mrr_at_10": 0.754961,
"mrr_at_20": 0.757079,
"mrr_at_100": 0.757784,
"mrr_at_1000": 0.757873,
"nauc_ndcg_at_1_max": 0.433356,
"nauc_ndcg_at_1_std": -0.091583,
"nauc_ndcg_at_1_diff1": 0.694226,
"nauc_ndcg_at_3_max": 0.420168,
"nauc_ndcg_at_3_std": -0.131054,
"nauc_ndcg_at_3_diff1": 0.646435,
"nauc_ndcg_at_5_max": 0.41975,
"nauc_ndcg_at_5_std": -0.139749,
"nauc_ndcg_at_5_diff1": 0.645507,
"nauc_ndcg_at_10_max": 0.428545,
"nauc_ndcg_at_10_std": -0.150018,
"nauc_ndcg_at_10_diff1": 0.644121,
"nauc_ndcg_at_20_max": 0.427207,
"nauc_ndcg_at_20_std": -0.126902,
"nauc_ndcg_at_20_diff1": 0.646615,
"nauc_ndcg_at_100_max": 0.427837,
"nauc_ndcg_at_100_std": -0.117718,
"nauc_ndcg_at_100_diff1": 0.652459,
"nauc_ndcg_at_1000_max": 0.425608,
"nauc_ndcg_at_1000_std": -0.121263,
"nauc_ndcg_at_1000_diff1": 0.654148,
"nauc_map_at_1_max": 0.433356,
"nauc_map_at_1_std": -0.091583,
"nauc_map_at_1_diff1": 0.694226,
"nauc_map_at_3_max": 0.421986,
"nauc_map_at_3_std": -0.119741,
"nauc_map_at_3_diff1": 0.659279,
"nauc_map_at_5_max": 0.421911,
"nauc_map_at_5_std": -0.123421,
"nauc_map_at_5_diff1": 0.659557,
"nauc_map_at_10_max": 0.424924,
"nauc_map_at_10_std": -0.126781,
"nauc_map_at_10_diff1": 0.659324,
"nauc_map_at_20_max": 0.424638,
"nauc_map_at_20_std": -0.121343,
"nauc_map_at_20_diff1": 0.660062,
"nauc_map_at_100_max": 0.424713,
"nauc_map_at_100_std": -0.119872,
"nauc_map_at_100_diff1": 0.660823,
"nauc_map_at_1000_max": 0.424617,
"nauc_map_at_1000_std": -0.119899,
"nauc_map_at_1000_diff1": 0.660859,
"nauc_recall_at_1_max": 0.433356,
"nauc_recall_at_1_std": -0.091583,
"nauc_recall_at_1_diff1": 0.694226,
"nauc_recall_at_3_max": 0.414072,
"nauc_recall_at_3_std": -0.176215,
"nauc_recall_at_3_diff1": 0.595665,
"nauc_recall_at_5_max": 0.409939,
"nauc_recall_at_5_std": -0.226551,
"nauc_recall_at_5_diff1": 0.573048,
"nauc_recall_at_10_max": 0.470337,
"nauc_recall_at_10_std": -0.351935,
"nauc_recall_at_10_diff1": 0.520716,
"nauc_recall_at_20_max": 0.475081,
"nauc_recall_at_20_std": -0.14001,
"nauc_recall_at_20_diff1": 0.479932,
"nauc_recall_at_100_max": 0.561001,
"nauc_recall_at_100_std": 0.183469,
"nauc_recall_at_100_diff1": 0.513431,
"nauc_recall_at_1000_max": 0.565963,
"nauc_recall_at_1000_std": 0.391581,
"nauc_recall_at_1000_diff1": 0.322598,
"nauc_precision_at_1_max": 0.433356,
"nauc_precision_at_1_std": -0.091583,
"nauc_precision_at_1_diff1": 0.694226,
"nauc_precision_at_3_max": 0.414072,
"nauc_precision_at_3_std": -0.176215,
"nauc_precision_at_3_diff1": 0.595665,
"nauc_precision_at_5_max": 0.409939,
"nauc_precision_at_5_std": -0.226551,
"nauc_precision_at_5_diff1": 0.573048,
"nauc_precision_at_10_max": 0.470337,
"nauc_precision_at_10_std": -0.351935,
"nauc_precision_at_10_diff1": 0.520716,
"nauc_precision_at_20_max": 0.475081,
"nauc_precision_at_20_std": -0.14001,
"nauc_precision_at_20_diff1": 0.479932,
"nauc_precision_at_100_max": 0.561001,
"nauc_precision_at_100_std": 0.183469,
"nauc_precision_at_100_diff1": 0.513431,
"nauc_precision_at_1000_max": 0.565963,
"nauc_precision_at_1000_std": 0.391581,
"nauc_precision_at_1000_diff1": 0.322598,
"nauc_mrr_at_1_max": 0.434137,
"nauc_mrr_at_1_std": -0.08679,
"nauc_mrr_at_1_diff1": 0.694226,
"nauc_mrr_at_3_max": 0.422952,
"nauc_mrr_at_3_std": -0.115251,
"nauc_mrr_at_3_diff1": 0.659957,
"nauc_mrr_at_5_max": 0.423275,
"nauc_mrr_at_5_std": -0.118462,
"nauc_mrr_at_5_diff1": 0.660504,
"nauc_mrr_at_10_max": 0.426052,
"nauc_mrr_at_10_std": -0.122276,
"nauc_mrr_at_10_diff1": 0.659972,
"nauc_mrr_at_20_max": 0.425811,
"nauc_mrr_at_20_std": -0.116797,
"nauc_mrr_at_20_diff1": 0.660715,
"nauc_mrr_at_100_max": 0.425891,
"nauc_mrr_at_100_std": -0.115313,
"nauc_mrr_at_100_diff1": 0.661478,
"nauc_mrr_at_1000_max": 0.425795,
"nauc_mrr_at_1000_std": -0.115338,
"nauc_mrr_at_1000_diff1": 0.661514,
"main_score": 0.79657,
"hf_subset": "default",
"languages": [
"eng-Latn"
]
}
]
},
"evaluation_time": 81.16353392601013,
"kg_co2_emissions": 2.6804755836003245e-05
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"dataset_revision": "35191c8c0dca72d8ff3efcd72aa802307d469663",
"task_name": "MedrxivClusteringS2S.v2",
"mteb_version": "1.24.0",
"scores": {
"test": [
{
"v_measures": {
"Level 0": [
0.340695,
0.345533,
0.339033,
0.349097,
0.348423,
0.328494,
0.347611,
0.338524,
0.345661,
0.351792
]
},
"v_measure": 0.343486,
"v_measure_std": 0.006541,
"main_score": 0.343486,
"hf_subset": "default",
"languages": [
"eng-Latn"
]
}
]
},
"evaluation_time": 43.02136588096619,
"kg_co2_emissions": 9.90941361771497e-06
}
Loading
Loading