From 7e7d04ca8eecba0a79f514d919fed40dae44805f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sun, 22 Dec 2024 21:57:43 +0100 Subject: [PATCH 1/2] add MSMARCO eval split in MTEB English (classic) benchmark Fixes #1608 --- mteb/benchmarks/benchmarks.py | 148 +++++++++++++++++----------------- 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index e872143ee5..45a7aad108 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -9,7 +9,7 @@ from mteb.abstasks.AbsTask import AbsTask from mteb.load_results.benchmark_results import BenchmarkResults from mteb.load_results.load_results import load_results -from mteb.overview import get_tasks +from mteb.overview import MTEBTasks, get_tasks http_url_adapter = TypeAdapter(AnyUrl) UrlString = Annotated[ @@ -123,78 +123,80 @@ def load_results( MTEB_ENG_CLASSIC = Benchmark( name="MTEB(eng, classic)", - tasks=get_tasks( - tasks=[ - "AmazonCounterfactualClassification", - "AmazonPolarityClassification", - "AmazonReviewsClassification", - "ArguAna", - "ArxivClusteringP2P", - "ArxivClusteringS2S", - "AskUbuntuDupQuestions", - "BIOSSES", - "Banking77Classification", - "BiorxivClusteringP2P", - "BiorxivClusteringS2S", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", - "ClimateFEVER", - "DBPedia", - "EmotionClassification", - "FEVER", - "FiQA2018", - "HotpotQA", - "ImdbClassification", - "MSMARCO", - "MTOPDomainClassification", - "MTOPIntentClassification", - "MassiveIntentClassification", - "MassiveScenarioClassification", - "MedrxivClusteringP2P", - "MedrxivClusteringS2S", - "MindSmallReranking", - "NFCorpus", - "NQ", - "QuoraRetrieval", - "RedditClustering", - "RedditClusteringP2P", - "SCIDOCS", - "SICK-R", - "STS12", - "STS13", - "STS14", - "STS15", - "STS16", - "STS17", - "STS22", - "STSBenchmark", - "SciDocsRR", - "SciFact", - "SprintDuplicateQuestions", - "StackExchangeClustering", - "StackExchangeClusteringP2P", - "StackOverflowDupQuestions", - "SummEval", - "TRECCOVID", - "Touche2020", - "ToxicConversationsClassification", - "TweetSentimentExtractionClassification", - "TwentyNewsgroupsClustering", - "TwitterSemEval2015", - "TwitterURLCorpus", - ], - languages=["eng"], - eval_splits=["test"], + tasks=MTEBTasks( + get_tasks( + tasks=[ + "AmazonCounterfactualClassification", + "AmazonPolarityClassification", + "AmazonReviewsClassification", + "ArguAna", + "ArxivClusteringP2P", + "ArxivClusteringS2S", + "AskUbuntuDupQuestions", + "BIOSSES", + "Banking77Classification", + "BiorxivClusteringP2P", + "BiorxivClusteringS2S", + "CQADupstackAndroidRetrieval", + "CQADupstackEnglishRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackGisRetrieval", + "CQADupstackMathematicaRetrieval", + "CQADupstackPhysicsRetrieval", + "CQADupstackProgrammersRetrieval", + "CQADupstackStatsRetrieval", + "CQADupstackTexRetrieval", + "CQADupstackUnixRetrieval", + "CQADupstackWebmastersRetrieval", + "CQADupstackWordpressRetrieval", + "ClimateFEVER", + "DBPedia", + "EmotionClassification", + "FEVER", + "FiQA2018", + "HotpotQA", + "ImdbClassification", + "MTOPDomainClassification", + "MTOPIntentClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + "MedrxivClusteringP2P", + "MedrxivClusteringS2S", + "MindSmallReranking", + "NFCorpus", + "NQ", + "QuoraRetrieval", + "RedditClustering", + "RedditClusteringP2P", + "SCIDOCS", + "SICK-R", + "STS12", + "STS13", + "STS14", + "STS15", + "STS16", + "STS17", + "STS22", + "STSBenchmark", + "SciDocsRR", + "SciFact", + "SprintDuplicateQuestions", + "StackExchangeClustering", + "StackExchangeClusteringP2P", + "StackOverflowDupQuestions", + "SummEval", + "TRECCOVID", + "Touche2020", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "TwentyNewsgroupsClustering", + "TwitterSemEval2015", + "TwitterURLCorpus", + ], + languages=["eng"], + eval_splits=["test"], + ) + + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"]) ), description="The original English benchmarks by Muennighoff et al., (2023).", citation="""@inproceedings{muennighoff-etal-2023-mteb, From 376bc9799aa0fab3104314bf218820f29b2abb99 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sun, 22 Dec 2024 22:00:12 +0100 Subject: [PATCH 2/2] Add co-author Co-authored-by: aashka-trivedi --- mteb/benchmarks/benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 45a7aad108..b56b39a1b2 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -198,7 +198,7 @@ def load_results( ) + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"]) ), - description="The original English benchmarks by Muennighoff et al., (2023).", + description="The original English benchmark by Muennighoff et al., (2023).", citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", author = "Muennighoff, Niklas and