From 7e7d04ca8eecba0a79f514d919fed40dae44805f Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Sun, 22 Dec 2024 21:57:43 +0100
Subject: [PATCH 1/2] Add MSMARCO dev eval split to MTEB English (classic)
 benchmark

Fixes #1608
---
 mteb/benchmarks/benchmarks.py | 148 +++++++++++++++++-----------------
 1 file changed, 75 insertions(+), 73 deletions(-)

diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index e872143ee5..45a7aad108 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -9,7 +9,7 @@
 from mteb.abstasks.AbsTask import AbsTask
 from mteb.load_results.benchmark_results import BenchmarkResults
 from mteb.load_results.load_results import load_results
-from mteb.overview import get_tasks
+from mteb.overview import MTEBTasks, get_tasks
 
 http_url_adapter = TypeAdapter(AnyUrl)
 UrlString = Annotated[
@@ -123,78 +123,80 @@ def load_results(
 
 MTEB_ENG_CLASSIC = Benchmark(
     name="MTEB(eng, classic)",
-    tasks=get_tasks(
-        tasks=[
-            "AmazonCounterfactualClassification",
-            "AmazonPolarityClassification",
-            "AmazonReviewsClassification",
-            "ArguAna",
-            "ArxivClusteringP2P",
-            "ArxivClusteringS2S",
-            "AskUbuntuDupQuestions",
-            "BIOSSES",
-            "Banking77Classification",
-            "BiorxivClusteringP2P",
-            "BiorxivClusteringS2S",
-            "CQADupstackAndroidRetrieval",
-            "CQADupstackEnglishRetrieval",
-            "CQADupstackGamingRetrieval",
-            "CQADupstackGisRetrieval",
-            "CQADupstackMathematicaRetrieval",
-            "CQADupstackPhysicsRetrieval",
-            "CQADupstackProgrammersRetrieval",
-            "CQADupstackStatsRetrieval",
-            "CQADupstackTexRetrieval",
-            "CQADupstackUnixRetrieval",
-            "CQADupstackWebmastersRetrieval",
-            "CQADupstackWordpressRetrieval",
-            "ClimateFEVER",
-            "DBPedia",
-            "EmotionClassification",
-            "FEVER",
-            "FiQA2018",
-            "HotpotQA",
-            "ImdbClassification",
-            "MSMARCO",
-            "MTOPDomainClassification",
-            "MTOPIntentClassification",
-            "MassiveIntentClassification",
-            "MassiveScenarioClassification",
-            "MedrxivClusteringP2P",
-            "MedrxivClusteringS2S",
-            "MindSmallReranking",
-            "NFCorpus",
-            "NQ",
-            "QuoraRetrieval",
-            "RedditClustering",
-            "RedditClusteringP2P",
-            "SCIDOCS",
-            "SICK-R",
-            "STS12",
-            "STS13",
-            "STS14",
-            "STS15",
-            "STS16",
-            "STS17",
-            "STS22",
-            "STSBenchmark",
-            "SciDocsRR",
-            "SciFact",
-            "SprintDuplicateQuestions",
-            "StackExchangeClustering",
-            "StackExchangeClusteringP2P",
-            "StackOverflowDupQuestions",
-            "SummEval",
-            "TRECCOVID",
-            "Touche2020",
-            "ToxicConversationsClassification",
-            "TweetSentimentExtractionClassification",
-            "TwentyNewsgroupsClustering",
-            "TwitterSemEval2015",
-            "TwitterURLCorpus",
-        ],
-        languages=["eng"],
-        eval_splits=["test"],
+    tasks=MTEBTasks(
+        get_tasks(
+            tasks=[
+                "AmazonCounterfactualClassification",
+                "AmazonPolarityClassification",
+                "AmazonReviewsClassification",
+                "ArguAna",
+                "ArxivClusteringP2P",
+                "ArxivClusteringS2S",
+                "AskUbuntuDupQuestions",
+                "BIOSSES",
+                "Banking77Classification",
+                "BiorxivClusteringP2P",
+                "BiorxivClusteringS2S",
+                "CQADupstackAndroidRetrieval",
+                "CQADupstackEnglishRetrieval",
+                "CQADupstackGamingRetrieval",
+                "CQADupstackGisRetrieval",
+                "CQADupstackMathematicaRetrieval",
+                "CQADupstackPhysicsRetrieval",
+                "CQADupstackProgrammersRetrieval",
+                "CQADupstackStatsRetrieval",
+                "CQADupstackTexRetrieval",
+                "CQADupstackUnixRetrieval",
+                "CQADupstackWebmastersRetrieval",
+                "CQADupstackWordpressRetrieval",
+                "ClimateFEVER",
+                "DBPedia",
+                "EmotionClassification",
+                "FEVER",
+                "FiQA2018",
+                "HotpotQA",
+                "ImdbClassification",
+                "MTOPDomainClassification",
+                "MTOPIntentClassification",
+                "MassiveIntentClassification",
+                "MassiveScenarioClassification",
+                "MedrxivClusteringP2P",
+                "MedrxivClusteringS2S",
+                "MindSmallReranking",
+                "NFCorpus",
+                "NQ",
+                "QuoraRetrieval",
+                "RedditClustering",
+                "RedditClusteringP2P",
+                "SCIDOCS",
+                "SICK-R",
+                "STS12",
+                "STS13",
+                "STS14",
+                "STS15",
+                "STS16",
+                "STS17",
+                "STS22",
+                "STSBenchmark",
+                "SciDocsRR",
+                "SciFact",
+                "SprintDuplicateQuestions",
+                "StackExchangeClustering",
+                "StackExchangeClusteringP2P",
+                "StackOverflowDupQuestions",
+                "SummEval",
+                "TRECCOVID",
+                "Touche2020",
+                "ToxicConversationsClassification",
+                "TweetSentimentExtractionClassification",
+                "TwentyNewsgroupsClustering",
+                "TwitterSemEval2015",
+                "TwitterURLCorpus",
+            ],
+            languages=["eng"],
+            eval_splits=["test"],
+        )
+        + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"])
     ),
     description="The original English benchmarks by Muennighoff et al., (2023).",
     citation="""@inproceedings{muennighoff-etal-2023-mteb,

From 376bc9799aa0fab3104314bf218820f29b2abb99 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Sun, 22 Dec 2024 22:00:12 +0100
Subject: [PATCH 2/2] Add co-author

Also fix the wording of the benchmark description ("benchmarks" ->
"benchmark").

Co-authored-by: aashka-trivedi <aashka.trivedi@gmail.com>
---
 mteb/benchmarks/benchmarks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index 45a7aad108..b56b39a1b2 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -198,7 +198,7 @@ def load_results(
         )
         + get_tasks(tasks=["MSMARCO"], languages=["eng"], eval_splits=["dev"])
     ),
-    description="The original English benchmarks by Muennighoff et al., (2023).",
+    description="The original English benchmark by Muennighoff et al., (2023).",
     citation="""@inproceedings{muennighoff-etal-2023-mteb,
     title = "{MTEB}: Massive Text Embedding Benchmark",
     author = "Muennighoff, Niklas  and