diff --git a/datasets/super_glue/dataset_infos.json b/datasets/super_glue/dataset_infos.json
index 18229852f4a..a16b9d64dee 100644
--- a/datasets/super_glue/dataset_infos.json
+++ b/datasets/super_glue/dataset_infos.json
@@ -1 +1 @@
-{"boolq": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nBoolQ (Boolean Questions, Clark et al., 2019a) is a QA task where each example consists of a short\npassage and a yes/no question about the passage. The questions are provided anonymously and\nunsolicited by users of the Google search engine, and afterwards paired with a paragraph from a\nWikipedia article containing the answer. Following the original work, we evaluate with accuracy.", "citation": "@inproceedings{clark2019boolq,\n  title={BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions},\n  author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei, and Kwiatkowski, Tom and Collins, Michael, and Toutanova, Kristina},\n  booktitle={NAACL},\n  year={2019}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://github.com/google-research-datasets/boolean-questions", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "boolq", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 2107997, "num_examples": 3245, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 6179206, "num_examples": 9427, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 2118505, "num_examples": 3270, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip": {"num_bytes": 4118001, "checksum": "853fbe7922f70c59629f06a39e8d9ca440c3d740e760fd3b87a5ddf3dcba2436"}}, "download_size": 4118001, "dataset_size": 10405708, "size_in_bytes": 14523709}, "cb": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe CommitmentBank (De Marneffe et al., 2019) is a corpus of short texts in which at least\none sentence contains an embedded clause. Each of these embedded clauses is annotated with the\ndegree to which we expect that the person who wrote the text is committed to the truth of the clause.\nThe resulting task framed as three-class textual entailment on examples that are drawn from the Wall\nStreet Journal, fiction from the British National Corpus, and Switchboard. 
Each example consists\nof a premise containing an embedded clause and the corresponding hypothesis is the extraction of\nthat clause. We use a subset of the data that had inter-annotator agreement above 0.85. The data is\nimbalanced (relatively fewer neutral examples), so we evaluate using accuracy and F1, where for\nmulti-class F1 we compute the unweighted average of the F1 per class.", "citation": "@article{de marneff_simons_tonhauser_2019,\n  title={The CommitmentBank: Investigating projection in naturally occurring discourse},\n  journal={proceedings of Sinn und Bedeutung 23},\n  author={De Marneff, Marie-Catherine and Simons, Mandy and Tonhauser, Judith},\n  year={2019}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://github.com/mcdm/CommitmentBank", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "contradiction", "neutral"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "cb", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 93660, "num_examples": 250, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 87218, "num_examples": 250, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 21894, "num_examples": 56, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip": {"num_bytes": 75482, "checksum": "8d641383298d54554066ba1c93f56ae7410af75df621b90c63028806bbbbb535"}}, "download_size": 75482, "dataset_size": 202772, "size_in_bytes": 278254}, "copa": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Choice Of Plausible Alternatives (COPA, Roemmele et al., 2011) dataset is a causal\nreasoning task in which a system is given a premise sentence and two possible alternatives. The\nsystem must choose the alternative which has the more plausible causal relationship with the premise.\nThe method used for the construction of the alternatives ensures that the task requires causal reasoning\nto solve. 
Examples either deal with alternative possible causes or alternative possible effects of the\npremise sentence, accompanied by a simple question disambiguating between the two instance\ntypes for the model. All examples are handcrafted and focus on topics from online blogs and a\nphotography-related encyclopedia. Following the recommendation of the authors, we evaluate using\naccuracy.", "citation": "@inproceedings{roemmele2011choice,\n  title={Choice of plausible alternatives: An evaluation of commonsense causal reasoning},\n  author={Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},\n  booktitle={2011 AAAI Spring Symposium Series},\n  year={2011}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "http://people.ict.usc.edu/~gordon/copa.html", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "choice1": {"dtype": "string", "id": null, "_type": "Value"}, "choice2": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["choice1", "choice2"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "copa", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 60303, "num_examples": 500, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 49599, "num_examples": 400, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 12586, "num_examples": 100, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip": {"num_bytes": 43986, "checksum": "405906cddac74bc1e1ce8220f1107d1025b66a25ef10149d91b10bb30651125f"}}, "download_size": 43986, "dataset_size": 122488, "size_in_bytes": 166474}, "multirc": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Multi-Sentence Reading Comprehension dataset (MultiRC, Khashabi et al., 2018)\nis a true/false question-answering task. Each example consists of a context paragraph, a question\nabout that paragraph, and a list of possible answers to that question which must be labeled as true or\nfalse. Question-answering (QA) is a popular problem with many datasets. 
We use MultiRC because\nof a number of desirable properties: (i) each question can have multiple possible correct answers,\nso each question-answer pair must be evaluated independent of other pairs, (ii) the questions are\ndesigned such that answering each question requires drawing facts from multiple context sentences,\nand (iii) the question-answer pair format more closely matches the API of other SuperGLUE tasks\nthan span-based extractive QA does. The paragraphs are drawn from seven domains including news,\nfiction, and historical text.", "citation": "@inproceedings{MultiRC2018,\n    author = {Daniel Khashabi and Snigdha Chaturvedi and Michael Roth and Shyam Upadhyay and Dan Roth},\n    title = {Looking Beyond the Surface:A Challenge Set for Reading Comprehension over Multiple Sentences},\n    booktitle = {Proceedings of North American Chapter of the Association for Computational Linguistics (NAACL)},\n    year = {2018}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://cogcomp.org/multirc/", "license": "", "features": {"paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"paragraph": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "int32", "id": null, "_type": "Value"}, "answer": {"dtype": "int32", "id": null, "_type": "Value"}}, "label": {"num_classes": 2, "names": ["False", "True"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "multirc", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 14996451, "num_examples": 9693, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 46213579, "num_examples": 27243, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 7758918, "num_examples": 4848, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/MultiRC.zip": {"num_bytes": 1116225, "checksum": "b3cd440856e72eb166b2edcd37b798455f1ebd51f2c3de64c0c2a4e1971d2737"}}, "download_size": 1116225, "dataset_size": 68968948, "size_in_bytes": 70085173}, "record": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\n(Reading Comprehension with Commonsense Reasoning Dataset, Zhang et al., 2018) is a\nmultiple-choice QA task. Each example consists of a news article and a Cloze-style question about\nthe article in which one entity is masked out. 
The system must predict the masked out entity from a\ngiven list of possible entities in the provided passage, where the same entity may be expressed using\nmultiple different surface forms, all of which are considered correct. Articles are drawn from CNN\nand Daily Mail. Following the original work, we evaluate with max (over all mentions) token-level\nF1 and exact match (EM).", "citation": "@article{zhang2018record,\n  title={Record: Bridging the gap between human and machine commonsense reading comprehension},\n  author={Zhang, Sheng and Liu, Xiaodong and Liu, Jingjing and Gao, Jianfeng and Duh, Kevin and Van Durme, Benjamin},\n  journal={arXiv preprint arXiv:1810.12885},\n  year={2018}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://sheng-z.github.io/ReCoRD-explorer/", "license": "", "features": {"passage": {"dtype": "string", "id": null, "_type": "Value"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "entities": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "idx": {"passage": {"dtype": "int32", "id": null, "_type": "Value"}, "query": {"dtype": "int32", "id": null, "_type": "Value"}}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "record", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 13403490, "num_examples": 10000, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 139393747, "num_examples": 100730, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 13650007, "num_examples": 10000, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip": {"num_bytes": 51757880, "checksum": "30c7b651ab21b8bf8fab986495cd1084333010e040548f861b839eec0044ac18"}}, "download_size": 51757880, "dataset_size": 166447244, "size_in_bytes": 218205124}, "rte": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Recognizing Textual Entailment (RTE) datasets come from a series of annual competitions\non textual entailment, the problem of predicting whether a given premise sentence entails a given\nhypothesis sentence (also known as natural language inference, NLI). 
RTE was previously included\nin GLUE, and we use the same data and format as before: We merge data from RTE1 (Dagan\net al., 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli\net al., 2009). All datasets are combined and converted to two-class classification: entailment and\nnot_entailment. Of all the GLUE tasks, RTE was among those that benefited from transfer learning\nthe most, jumping from near random-chance performance (~56%) at the time of GLUE's launch to\n85% accuracy (Liu et al., 2019c) at the time of writing. Given the eight point gap with respect to\nhuman performance, however, the task is not yet solved by machines, and we expect the remaining\ngap to be difficult to close.", "citation": "@inproceedings{dagan2005pascal,\n  title={The PASCAL recognising textual entailment challenge},\n  author={Dagan, Ido and Glickman, Oren and Magnini, Bernardo},\n  booktitle={Machine Learning Challenges Workshop},\n  pages={177--190},\n  year={2005},\n  organization={Springer}\n}\n@inproceedings{bar2006second,\n  title={The second pascal recognising textual entailment challenge},\n  author={Bar-Haim, Roy and Dagan, Ido and Dolan, Bill and Ferro, Lisa and Giampiccolo, Danilo and Magnini, Bernardo and Szpektor, Idan},\n  booktitle={Proceedings of the second PASCAL challenges workshop on recognising textual entailment},\n  volume={6},\n  number={1},\n  pages={6--4},\n  year={2006},\n  organization={Venice}\n}\n@inproceedings{giampiccolo2007third,\n  title={The third pascal recognizing textual entailment challenge},\n  author={Giampiccolo, Danilo and Magnini, Bernardo and Dagan, Ido and Dolan, Bill},\n  booktitle={Proceedings of the ACL-PASCAL workshop on textual entailment and paraphrasing},\n  pages={1--9},\n  year={2007},\n  organization={Association for Computational Linguistics}\n}\n@inproceedings{bentivogli2009fifth,\n  title={The Fifth PASCAL Recognizing Textual Entailment Challenge.},\n  author={Bentivogli, Luisa and Clark, 
Peter and Dagan, Ido and Giampiccolo, Danilo},\n  booktitle={TAC},\n  year={2009}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://aclweb.org/aclwiki/Recognizing_Textual_Entailment", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "not_entailment"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "rte", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 975799, "num_examples": 3000, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 848745, "num_examples": 2490, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 90899, "num_examples": 277, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip": {"num_bytes": 750920, "checksum": "6310aab3f000424c9d0318a1ff20692e07c7f4aa15e8f17a5972ea0a35c398b9"}}, "download_size": 750920, "dataset_size": 1915443, "size_in_bytes": 2666363}, "wic": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public 
leaderboard.\n\nThe Word-in-Context (WiC, Pilehvar and Camacho-Collados, 2019) dataset supports a word\nsense disambiguation task cast as binary classification over sentence pairs. Given two sentences and a\npolysemous (sense-ambiguous) word that appears in both sentences, the task is to determine whether\nthe word is used with the same sense in both sentences. Sentences are drawn from WordNet (Miller,\n1995), VerbNet (Schuler, 2005), and Wiktionary. We follow the original work and evaluate using\naccuracy.", "citation": "@article{DBLP:journals/corr/abs-1808-09121,\n  author={Mohammad Taher Pilehvar and os{'{e}} Camacho{-}Collados},\n  title={WiC: 10, 000 Example Pairs for Evaluating Context-Sensitive Representations},\n  journal={CoRR},\n  volume={abs/1808.09121},\n  year={2018},\n  url={http://arxiv.org/abs/1808.09121},\n  archivePrefix={arXiv},\n  eprint={1808.09121},\n  timestamp={Mon, 03 Sep 2018 13:36:40 +0200},\n  biburl={https://dblp.org/rec/bib/journals/corr/abs-1808-09121},\n  bibsource={dblp computer science bibliography, https://dblp.org}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://pilehvar.github.io/wic/", "license": "", "features": {"word": {"dtype": "string", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "start1": {"dtype": "int32", "id": null, "_type": "Value"}, "start2": {"dtype": "int32", "id": null, "_type": "Value"}, "end1": {"dtype": "int32", "id": null, "_type": "Value"}, "end2": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "wic", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 180593, "num_examples": 1400, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 665183, "num_examples": 5428, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 82623, "num_examples": 638, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WiC.zip": {"num_bytes": 396213, "checksum": "ee7e67f4ae9eafbf533780faa198e62167f3cda54256cdf261877be3c0e90900"}}, "download_size": 396213, "dataset_size": 928399, "size_in_bytes": 1324612}, "wsc": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Winograd Schema Challenge (WSC, Levesque et al., 2012) is a reading comprehension\ntask in which a system must read a sentence with a pronoun and select the referent of that pronoun\nfrom a list of choices. 
Given the difficulty of this task and the headroom still left, we have included\nWSC in SuperGLUE and recast the dataset into its coreference form. The task is cast as a binary\nclassification problem, as opposed to N-multiple choice, in order to isolate the model's ability to\nunderstand the coreference links within a sentence as opposed to various other strategies that may\ncome into play in multiple choice conditions. With that in mind, we create a split with 65% negative\nmajority class in the validation set, reflecting the distribution of the hidden test set, and 52% negative\nclass in the training set. The training and validation examples are drawn from the original Winograd\nSchema dataset (Levesque et al., 2012), as well as those distributed by the affiliated organization\nCommonsense Reasoning. The test examples are derived from fiction books and have been shared\nwith us by the authors of the original dataset. Previously, a version of WSC recast as NLI as included\nin GLUE, known as WNLI. No substantial progress was made on WNLI, with many submissions\nopting to submit only majority class predictions. WNLI was made especially difficult due to an\nadversarial train/dev split: Premise sentences that appeared in the training set sometimes appeared\nin the development set with a different hypothesis and a flipped label. If a system memorized the\ntraining set without meaningfully generalizing, which was easy due to the small size of the training\nset, it could perform far below chance on the development set. We remove this adversarial design\nin the SuperGLUE version of WSC by ensuring that no sentences are shared between the training,\nvalidation, and test sets.\n\nHowever, the validation and test sets come from different domains, with the validation set consisting\nof ambiguous examples such that changing one non-noun phrase word will change the coreference\ndependencies in the sentence. 
The test set consists only of more straightforward examples, with a\nhigh number of noun phrases (and thus more choices for the model), but low to no ambiguity.", "citation": "@inproceedings{levesque2012winograd,\n  title={The winograd schema challenge},\n  author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora},\n  booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning},\n  year={2012}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "span1_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span2_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span1_text": {"dtype": "string", "id": null, "_type": "Value"}, "span2_text": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "wsc", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 31572, "num_examples": 146, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 89883, "num_examples": 554, "dataset_name": "super_glue"}, "validation": {"name": "validation", 
"num_bytes": 21637, "num_examples": 104, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip": {"num_bytes": 32751, "checksum": "2ed6dfa94556b4a128ff0441efe365b2e883124e7e6aa00fb8d3a6cb1fd520a9"}}, "download_size": 32751, "dataset_size": 143092, "size_in_bytes": 175843}, "wsc.fixed": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Winograd Schema Challenge (WSC, Levesque et al., 2012) is a reading comprehension\ntask in which a system must read a sentence with a pronoun and select the referent of that pronoun\nfrom a list of choices. Given the difficulty of this task and the headroom still left, we have included\nWSC in SuperGLUE and recast the dataset into its coreference form. The task is cast as a binary\nclassification problem, as opposed to N-multiple choice, in order to isolate the model's ability to\nunderstand the coreference links within a sentence as opposed to various other strategies that may\ncome into play in multiple choice conditions. With that in mind, we create a split with 65% negative\nmajority class in the validation set, reflecting the distribution of the hidden test set, and 52% negative\nclass in the training set. The training and validation examples are drawn from the original Winograd\nSchema dataset (Levesque et al., 2012), as well as those distributed by the affiliated organization\nCommonsense Reasoning. The test examples are derived from fiction books and have been shared\nwith us by the authors of the original dataset. Previously, a version of WSC recast as NLI as included\nin GLUE, known as WNLI. No substantial progress was made on WNLI, with many submissions\nopting to submit only majority class predictions. 
WNLI was made especially difficult due to an\nadversarial train/dev split: Premise sentences that appeared in the training set sometimes appeared\nin the development set with a different hypothesis and a flipped label. If a system memorized the\ntraining set without meaningfully generalizing, which was easy due to the small size of the training\nset, it could perform far below chance on the development set. We remove this adversarial design\nin the SuperGLUE version of WSC by ensuring that no sentences are shared between the training,\nvalidation, and test sets.\n\nHowever, the validation and test sets come from different domains, with the validation set consisting\nof ambiguous examples such that changing one non-noun phrase word will change the coreference\ndependencies in the sentence. The test set consists only of more straightforward examples, with a\nhigh number of noun phrases (and thus more choices for the model), but low to no ambiguity.\n\nThis version fixes issues where the spans are not actually substrings of the text.", "citation": "@inproceedings{levesque2012winograd,\n  title={The winograd schema challenge},\n  author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora},\n  booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning},\n  year={2012}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "span1_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span2_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span1_text": {"dtype": "string", "id": null, "_type": "Value"}, "span2_text": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "wsc.fixed", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 31568, "num_examples": 146, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 89883, "num_examples": 554, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 21637, "num_examples": 104, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip": {"num_bytes": 32751, "checksum": "2ed6dfa94556b4a128ff0441efe365b2e883124e7e6aa00fb8d3a6cb1fd520a9"}}, "download_size": 32751, "dataset_size": 143088, "size_in_bytes": 175839}, "axb": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nAn expert-constructed,\ndiagnostic dataset that automatically tests models for a broad range of linguistic, commonsense, and\nworld knowledge. 
Each example in this broad-coverage diagnostic is a sentence pair labeled with\na three-way entailment relation (entailment, neutral, or contradiction) and tagged with labels that\nindicate the phenomena that characterize the relationship between the two sentences. Submissions\nto the GLUE leaderboard are required to include predictions from the submission's MultiNLI\nclassifier on the diagnostic dataset, and analyses of the results were shown alongside the main\nleaderboard. Since this broad-coverage diagnostic task has proved difficult for top models, we retain\nit in SuperGLUE. However, since MultiNLI is not part of SuperGLUE, we collapse contradiction\nand neutral into a single not_entailment label, and request that submissions include predictions\non the resulting set from the model used for the RTE task.\n", "citation": "\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://gluebenchmark.com/diagnostics", "license": "", "features": {"sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "not_entailment"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "axb", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 238392, "num_examples": 1104, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip": {"num_bytes": 33950, "checksum": "43b45c55a273575c58a33cd68f10a971f83daa3aa223bfbc4077b92fbdf960b3"}}, "download_size": 33950, "dataset_size": 238392, "size_in_bytes": 272342}, "axg": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nWinogender is designed to measure gender\nbias in coreference resolution systems. We use the Diverse Natural Language Inference Collection\n(DNC; Poliak et al., 2018) version that casts Winogender as a textual entailment task. Each example\nconsists of a premise sentence with a male or female pronoun and a hypothesis giving a possible\nantecedent of the pronoun. Examples occur in minimal pairs, where the only difference between\nan example and its pair is the gender of the pronoun in the premise. Performance on Winogender\nis measured with both accuracy and the gender parity score: the percentage of minimal pairs for\nwhich the predictions are the same. 
We note that a system can trivially obtain a perfect gender parity\nscore by guessing the same class for all examples, so a high gender parity score is meaningless unless\naccompanied by high accuracy. As a diagnostic test of gender bias, we view the schemas as having high\npositive predictive value and low negative predictive value; that is, they may demonstrate the presence\nof gender bias in a system, but not prove its absence.\n", "citation": "@inproceedings{rudinger-EtAl:2018:N18,\n  author    = {Rudinger, Rachel  and  Naradowsky, Jason  and  Leonard, Brian  and  {Van Durme}, Benjamin},\n  title     = {Gender Bias in Coreference Resolution},\n  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2018},\n  address   = {New Orleans, Louisiana},\n  publisher = {Association for Computational Linguistics}\n}\n\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://github.com/rudinger/winogender-schemas", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "not_entailment"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "super_glue", "config_name": "axg", "version": {"version_str": "1.0.2", "description": null, "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 53581, "num_examples": 356, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip": {"num_bytes": 10413, "checksum": "2d4e00d3a7d23d2c3787ee4c1382cc81a72cb05a76fc9d78d142949247ed61b9"}}, "download_size": 10413, "dataset_size": 53581, "size_in_bytes": 63994}}
\ No newline at end of file
+{"boolq": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nBoolQ (Boolean Questions, Clark et al., 2019a) is a QA task where each example consists of a short\npassage and a yes/no question about the passage. The questions are provided anonymously and\nunsolicited by users of the Google search engine, and afterwards paired with a paragraph from a\nWikipedia article containing the answer. Following the original work, we evaluate with accuracy.", "citation": "@inproceedings{clark2019boolq,\n  title={BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions},\n  author={Clark, Christopher and Lee, Kenton and Chang, Ming-Wei, and Kwiatkowski, Tom and Collins, Michael, and Toutanova, Kristina},\n  booktitle={NAACL},\n  year={2019}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://github.com/google-research-datasets/boolean-questions", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "boolq", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 2107997, "num_examples": 3245, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 6179206, "num_examples": 9427, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 2118505, "num_examples": 3270, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip": {"num_bytes": 4118001, "checksum": "853fbe7922f70c59629f06a39e8d9ca440c3d740e760fd3b87a5ddf3dcba2436"}}, "download_size": 4118001, "post_processing_size": null, "dataset_size": 10405708, "size_in_bytes": 14523709}, "cb": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe CommitmentBank (De Marneffe et al., 2019) is a corpus of short texts in which at least\none sentence contains an embedded clause. 
Each of these embedded clauses is annotated with the\ndegree to which we expect that the person who wrote the text is committed to the truth of the clause.\nThe resulting task framed as three-class textual entailment on examples that are drawn from the Wall\nStreet Journal, fiction from the British National Corpus, and Switchboard. Each example consists\nof a premise containing an embedded clause and the corresponding hypothesis is the extraction of\nthat clause. We use a subset of the data that had inter-annotator agreement above 0.85. The data is\nimbalanced (relatively fewer neutral examples), so we evaluate using accuracy and F1, where for\nmulti-class F1 we compute the unweighted average of the F1 per class.", "citation": "@article{de marneff_simons_tonhauser_2019,\n  title={The CommitmentBank: Investigating projection in naturally occurring discourse},\n  journal={proceedings of Sinn und Bedeutung 23},\n  author={De Marneff, Marie-Catherine and Simons, Mandy and Tonhauser, Judith},\n  year={2019}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://github.com/mcdm/CommitmentBank", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 3, "names": ["entailment", "contradiction", "neutral"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "cb", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 93660, "num_examples": 250, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 87218, "num_examples": 250, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 21894, "num_examples": 56, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip": {"num_bytes": 75482, "checksum": "8d641383298d54554066ba1c93f56ae7410af75df621b90c63028806bbbbb535"}}, "download_size": 75482, "post_processing_size": null, "dataset_size": 202772, "size_in_bytes": 278254}, "copa": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Choice Of Plausible Alternatives (COPA, Roemmele et al., 2011) dataset is a causal\nreasoning task in which a system is given a premise sentence and two possible alternatives. The\nsystem must choose the alternative which has the more plausible causal relationship with the premise.\nThe method used for the construction of the alternatives ensures that the task requires causal reasoning\nto solve. 
Examples either deal with alternative possible causes or alternative possible effects of the\npremise sentence, accompanied by a simple question disambiguating between the two instance\ntypes for the model. All examples are handcrafted and focus on topics from online blogs and a\nphotography-related encyclopedia. Following the recommendation of the authors, we evaluate using\naccuracy.", "citation": "@inproceedings{roemmele2011choice,\n  title={Choice of plausible alternatives: An evaluation of commonsense causal reasoning},\n  author={Roemmele, Melissa and Bejan, Cosmin Adrian and Gordon, Andrew S},\n  booktitle={2011 AAAI Spring Symposium Series},\n  year={2011}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "http://people.ict.usc.edu/~gordon/copa.html", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "choice1": {"dtype": "string", "id": null, "_type": "Value"}, "choice2": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["choice1", "choice2"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "copa", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 60303, "num_examples": 500, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 49599, "num_examples": 400, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 12586, "num_examples": 100, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip": {"num_bytes": 43986, "checksum": "405906cddac74bc1e1ce8220f1107d1025b66a25ef10149d91b10bb30651125f"}}, "download_size": 43986, "post_processing_size": null, "dataset_size": 122488, "size_in_bytes": 166474}, "multirc": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Multi-Sentence Reading Comprehension dataset (MultiRC, Khashabi et al., 2018)\nis a true/false question-answering task. Each example consists of a context paragraph, a question\nabout that paragraph, and a list of possible answers to that question which must be labeled as true or\nfalse. Question-answering (QA) is a popular problem with many datasets. 
We use MultiRC because\nof a number of desirable properties: (i) each question can have multiple possible correct answers,\nso each question-answer pair must be evaluated independent of other pairs, (ii) the questions are\ndesigned such that answering each question requires drawing facts from multiple context sentences,\nand (iii) the question-answer pair format more closely matches the API of other SuperGLUE tasks\nthan span-based extractive QA does. The paragraphs are drawn from seven domains including news,\nfiction, and historical text.", "citation": "@inproceedings{MultiRC2018,\n    author = {Daniel Khashabi and Snigdha Chaturvedi and Michael Roth and Shyam Upadhyay and Dan Roth},\n    title = {Looking Beyond the Surface:A Challenge Set for Reading Comprehension over Multiple Sentences},\n    booktitle = {Proceedings of North American Chapter of the Association for Computational Linguistics (NAACL)},\n    year = {2018}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://cogcomp.org/multirc/", "license": "", "features": {"paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"paragraph": {"dtype": "int32", "id": null, "_type": "Value"}, "question": {"dtype": "int32", "id": null, "_type": "Value"}, "answer": {"dtype": "int32", "id": null, "_type": "Value"}}, "label": {"num_classes": 2, "names": ["False", "True"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "multirc", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 14996451, "num_examples": 9693, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 46213579, "num_examples": 27243, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 7758918, "num_examples": 4848, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/MultiRC.zip": {"num_bytes": 1116225, "checksum": "b3cd440856e72eb166b2edcd37b798455f1ebd51f2c3de64c0c2a4e1971d2737"}}, "download_size": 1116225, "post_processing_size": null, "dataset_size": 68968948, "size_in_bytes": 70085173}, "record": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\n(Reading Comprehension with Commonsense Reasoning Dataset, Zhang et al., 2018) is a\nmultiple-choice QA task. Each example consists of a news article and a Cloze-style question about\nthe article in which one entity is masked out. 
The system must predict the masked out entity from a\ngiven list of possible entities in the provided passage, where the same entity may be expressed using\nmultiple different surface forms, all of which are considered correct. Articles are drawn from CNN\nand Daily Mail. Following the original work, we evaluate with max (over all mentions) token-level\nF1 and exact match (EM).", "citation": "@article{zhang2018record,\n  title={Record: Bridging the gap between human and machine commonsense reading comprehension},\n  author={Zhang, Sheng and Liu, Xiaodong and Liu, Jingjing and Gao, Jianfeng and Duh, Kevin and Van Durme, Benjamin},\n  journal={arXiv preprint arXiv:1810.12885},\n  year={2018}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://sheng-z.github.io/ReCoRD-explorer/", "license": "", "features": {"passage": {"dtype": "string", "id": null, "_type": "Value"}, "query": {"dtype": "string", "id": null, "_type": "Value"}, "entities": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "entity_spans": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "start": {"dtype": "int32", "id": null, "_type": "Value"}, "end": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "idx": {"passage": {"dtype": "int32", "id": null, "_type": "Value"}, "query": {"dtype": "int32", "id": null, "_type": "Value"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "record", "version": {"version_str": "1.0.3", "description": null, "major": 1, "minor": 0, "patch": 3}, "splits": {"train": {"name": "train", "num_bytes": 179232052, "num_examples": 100730, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 17479084, "num_examples": 10000, "dataset_name": "super_glue"}, "test": {"name": "test", "num_bytes": 17200575, "num_examples": 10000, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip": {"num_bytes": 51757880, "checksum": "30c7b651ab21b8bf8fab986495cd1084333010e040548f861b839eec0044ac18"}}, "download_size": 51757880, "post_processing_size": null, "dataset_size": 213911711, "size_in_bytes": 265669591}, "rte": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public 
leaderboard.\n\nThe Recognizing Textual Entailment (RTE) datasets come from a series of annual competitions\non textual entailment, the problem of predicting whether a given premise sentence entails a given\nhypothesis sentence (also known as natural language inference, NLI). RTE was previously included\nin GLUE, and we use the same data and format as before: We merge data from RTE1 (Dagan\net al., 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli\net al., 2009). All datasets are combined and converted to two-class classification: entailment and\nnot_entailment. Of all the GLUE tasks, RTE was among those that benefited from transfer learning\nthe most, jumping from near random-chance performance (~56%) at the time of GLUE's launch to\n85% accuracy (Liu et al., 2019c) at the time of writing. Given the eight point gap with respect to\nhuman performance, however, the task is not yet solved by machines, and we expect the remaining\ngap to be difficult to close.", "citation": "@inproceedings{dagan2005pascal,\n  title={The PASCAL recognising textual entailment challenge},\n  author={Dagan, Ido and Glickman, Oren and Magnini, Bernardo},\n  booktitle={Machine Learning Challenges Workshop},\n  pages={177--190},\n  year={2005},\n  organization={Springer}\n}\n@inproceedings{bar2006second,\n  title={The second pascal recognising textual entailment challenge},\n  author={Bar-Haim, Roy and Dagan, Ido and Dolan, Bill and Ferro, Lisa and Giampiccolo, Danilo and Magnini, Bernardo and Szpektor, Idan},\n  booktitle={Proceedings of the second PASCAL challenges workshop on recognising textual entailment},\n  volume={6},\n  number={1},\n  pages={6--4},\n  year={2006},\n  organization={Venice}\n}\n@inproceedings{giampiccolo2007third,\n  title={The third pascal recognizing textual entailment challenge},\n  author={Giampiccolo, Danilo and Magnini, Bernardo and Dagan, Ido and Dolan, Bill},\n  booktitle={Proceedings of the ACL-PASCAL workshop on 
textual entailment and paraphrasing},\n  pages={1--9},\n  year={2007},\n  organization={Association for Computational Linguistics}\n}\n@inproceedings{bentivogli2009fifth,\n  title={The Fifth PASCAL Recognizing Textual Entailment Challenge.},\n  author={Bentivogli, Luisa and Clark, Peter and Dagan, Ido and Giampiccolo, Danilo},\n  booktitle={TAC},\n  year={2009}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://aclweb.org/aclwiki/Recognizing_Textual_Entailment", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "not_entailment"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "rte", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 975799, "num_examples": 3000, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 848745, "num_examples": 2490, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 90899, "num_examples": 277, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip": {"num_bytes": 750920, "checksum": "6310aab3f000424c9d0318a1ff20692e07c7f4aa15e8f17a5972ea0a35c398b9"}}, "download_size": 
750920, "post_processing_size": null, "dataset_size": 1915443, "size_in_bytes": 2666363}, "wic": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Word-in-Context (WiC, Pilehvar and Camacho-Collados, 2019) dataset supports a word\nsense disambiguation task cast as binary classification over sentence pairs. Given two sentences and a\npolysemous (sense-ambiguous) word that appears in both sentences, the task is to determine whether\nthe word is used with the same sense in both sentences. Sentences are drawn from WordNet (Miller,\n1995), VerbNet (Schuler, 2005), and Wiktionary. We follow the original work and evaluate using\naccuracy.", "citation": "@article{DBLP:journals/corr/abs-1808-09121,\n  author={Mohammad Taher Pilehvar and os{'{e}} Camacho{-}Collados},\n  title={WiC: 10, 000 Example Pairs for Evaluating Context-Sensitive Representations},\n  journal={CoRR},\n  volume={abs/1808.09121},\n  year={2018},\n  url={http://arxiv.org/abs/1808.09121},\n  archivePrefix={arXiv},\n  eprint={1808.09121},\n  timestamp={Mon, 03 Sep 2018 13:36:40 +0200},\n  biburl={https://dblp.org/rec/bib/journals/corr/abs-1808-09121},\n  bibsource={dblp computer science bibliography, https://dblp.org}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://pilehvar.github.io/wic/", "license": "", "features": {"word": {"dtype": "string", "id": null, "_type": "Value"}, "sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "start1": {"dtype": "int32", "id": null, "_type": "Value"}, "start2": {"dtype": "int32", "id": null, "_type": "Value"}, "end1": {"dtype": "int32", "id": null, "_type": "Value"}, "end2": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "wic", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 180593, "num_examples": 1400, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 665183, "num_examples": 5428, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 82623, "num_examples": 638, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WiC.zip": {"num_bytes": 396213, "checksum": "ee7e67f4ae9eafbf533780faa198e62167f3cda54256cdf261877be3c0e90900"}}, "download_size": 396213, "post_processing_size": null, "dataset_size": 928399, "size_in_bytes": 1324612}, "wsc": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Winograd Schema Challenge (WSC, Levesque et al., 2012) is a reading comprehension\ntask in which a system must read a sentence with a pronoun and select the referent of that pronoun\nfrom a list of 
choices. Given the difficulty of this task and the headroom still left, we have included\nWSC in SuperGLUE and recast the dataset into its coreference form. The task is cast as a binary\nclassification problem, as opposed to N-multiple choice, in order to isolate the model's ability to\nunderstand the coreference links within a sentence as opposed to various other strategies that may\ncome into play in multiple choice conditions. With that in mind, we create a split with 65% negative\nmajority class in the validation set, reflecting the distribution of the hidden test set, and 52% negative\nclass in the training set. The training and validation examples are drawn from the original Winograd\nSchema dataset (Levesque et al., 2012), as well as those distributed by the affiliated organization\nCommonsense Reasoning. The test examples are derived from fiction books and have been shared\nwith us by the authors of the original dataset. Previously, a version of WSC recast as NLI as included\nin GLUE, known as WNLI. No substantial progress was made on WNLI, with many submissions\nopting to submit only majority class predictions. WNLI was made especially difficult due to an\nadversarial train/dev split: Premise sentences that appeared in the training set sometimes appeared\nin the development set with a different hypothesis and a flipped label. If a system memorized the\ntraining set without meaningfully generalizing, which was easy due to the small size of the training\nset, it could perform far below chance on the development set. We remove this adversarial design\nin the SuperGLUE version of WSC by ensuring that no sentences are shared between the training,\nvalidation, and test sets.\n\nHowever, the validation and test sets come from different domains, with the validation set consisting\nof ambiguous examples such that changing one non-noun phrase word will change the coreference\ndependencies in the sentence. 
The test set consists only of more straightforward examples, with a\nhigh number of noun phrases (and thus more choices for the model), but low to no ambiguity.", "citation": "@inproceedings{levesque2012winograd,\n  title={The winograd schema challenge},\n  author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora},\n  booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning},\n  year={2012}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "span1_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span2_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span1_text": {"dtype": "string", "id": null, "_type": "Value"}, "span2_text": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "wsc", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 31572, "num_examples": 146, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 89883, "num_examples": 554, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 
21637, "num_examples": 104, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip": {"num_bytes": 32751, "checksum": "2ed6dfa94556b4a128ff0441efe365b2e883124e7e6aa00fb8d3a6cb1fd520a9"}}, "download_size": 32751, "post_processing_size": null, "dataset_size": 143092, "size_in_bytes": 175843}, "wsc.fixed": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nThe Winograd Schema Challenge (WSC, Levesque et al., 2012) is a reading comprehension\ntask in which a system must read a sentence with a pronoun and select the referent of that pronoun\nfrom a list of choices. Given the difficulty of this task and the headroom still left, we have included\nWSC in SuperGLUE and recast the dataset into its coreference form. The task is cast as a binary\nclassification problem, as opposed to N-multiple choice, in order to isolate the model's ability to\nunderstand the coreference links within a sentence as opposed to various other strategies that may\ncome into play in multiple choice conditions. With that in mind, we create a split with 65% negative\nmajority class in the validation set, reflecting the distribution of the hidden test set, and 52% negative\nclass in the training set. The training and validation examples are drawn from the original Winograd\nSchema dataset (Levesque et al., 2012), as well as those distributed by the affiliated organization\nCommonsense Reasoning. The test examples are derived from fiction books and have been shared\nwith us by the authors of the original dataset. Previously, a version of WSC recast as NLI as included\nin GLUE, known as WNLI. No substantial progress was made on WNLI, with many submissions\nopting to submit only majority class predictions. 
WNLI was made especially difficult due to an\nadversarial train/dev split: Premise sentences that appeared in the training set sometimes appeared\nin the development set with a different hypothesis and a flipped label. If a system memorized the\ntraining set without meaningfully generalizing, which was easy due to the small size of the training\nset, it could perform far below chance on the development set. We remove this adversarial design\nin the SuperGLUE version of WSC by ensuring that no sentences are shared between the training,\nvalidation, and test sets.\n\nHowever, the validation and test sets come from different domains, with the validation set consisting\nof ambiguous examples such that changing one non-noun phrase word will change the coreference\ndependencies in the sentence. The test set consists only of more straightforward examples, with a\nhigh number of noun phrases (and thus more choices for the model), but low to no ambiguity.\n\nThis version fixes issues where the spans are not actually substrings of the text.", "citation": "@inproceedings{levesque2012winograd,\n  title={The winograd schema challenge},\n  author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora},\n  booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning},\n  year={2012}\n}\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "span1_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span2_index": {"dtype": "int32", "id": null, "_type": "Value"}, "span1_text": {"dtype": "string", "id": null, "_type": "Value"}, "span2_text": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["False", "True"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "wsc.fixed", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 31568, "num_examples": 146, "dataset_name": "super_glue"}, "train": {"name": "train", "num_bytes": 89883, "num_examples": 554, "dataset_name": "super_glue"}, "validation": {"name": "validation", "num_bytes": 21637, "num_examples": 104, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip": {"num_bytes": 32751, "checksum": "2ed6dfa94556b4a128ff0441efe365b2e883124e7e6aa00fb8d3a6cb1fd520a9"}}, "download_size": 32751, "post_processing_size": null, "dataset_size": 143088, "size_in_bytes": 175839}, "axb": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nAn expert-constructed,\ndiagnostic dataset that automatically tests models for a broad range of linguistic, commonsense, and\nworld knowledge. 
Each example in this broad-coverage diagnostic is a sentence pair labeled with\na three-way entailment relation (entailment, neutral, or contradiction) and tagged with labels that\nindicate the phenomena that characterize the relationship between the two sentences. Submissions\nto the GLUE leaderboard are required to include predictions from the submission's MultiNLI\nclassifier on the diagnostic dataset, and analyses of the results were shown alongside the main\nleaderboard. Since this broad-coverage diagnostic task has proved difficult for top models, we retain\nit in SuperGLUE. However, since MultiNLI is not part of SuperGLUE, we collapse contradiction\nand neutral into a single not_entailment label, and request that submissions include predictions\non the resulting set from the model used for the RTE task.\n", "citation": "\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://gluebenchmark.com/diagnostics", "license": "", "features": {"sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "not_entailment"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "axb", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 238392, "num_examples": 1104, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip": {"num_bytes": 33950, "checksum": "43b45c55a273575c58a33cd68f10a971f83daa3aa223bfbc4077b92fbdf960b3"}}, "download_size": 33950, "post_processing_size": null, "dataset_size": 238392, "size_in_bytes": 272342}, "axg": {"description": "SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after\nGLUE with a new set of more difficult language understanding tasks, improved\nresources, and a new public leaderboard.\n\nWinogender is designed to measure gender\nbias in coreference resolution systems. We use the Diverse Natural Language Inference Collection\n(DNC; Poliak et al., 2018) version that casts Winogender as a textual entailment task. Each example\nconsists of a premise sentence with a male or female pronoun and a hypothesis giving a possible\nantecedent of the pronoun. Examples occur in minimal pairs, where the only difference between\nan example and its pair is the gender of the pronoun in the premise. Performance on Winogender\nis measured with both accuracy and the gender parity score: the percentage of minimal pairs for\nwhich the predictions are the same. 
We note that a system can trivially obtain a perfect gender parity\nscore by guessing the same class for all examples, so a high gender parity score is meaningless unless\naccompanied by high accuracy. As a diagnostic test of gender bias, we view the schemas as having high\npositive predictive value and low negative predictive value; that is, they may demonstrate the presence\nof gender bias in a system, but not prove its absence.\n", "citation": "@inproceedings{rudinger-EtAl:2018:N18,\n  author    = {Rudinger, Rachel  and  Naradowsky, Jason  and  Leonard, Brian  and  {Van Durme}, Benjamin},\n  title     = {Gender Bias in Coreference Resolution},\n  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2018},\n  address   = {New Orleans, Louisiana},\n  publisher = {Association for Computational Linguistics}\n}\n\n@article{wang2019superglue,\n  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},\n  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},\n  journal={arXiv preprint arXiv:1905.00537},\n  year={2019}\n}\n\nNote that each SuperGLUE dataset has its own citation. 
Please see the source to\nget the correct citation for each contained dataset.\n", "homepage": "https://github.com/rudinger/winogender-schemas", "license": "", "features": {"premise": {"dtype": "string", "id": null, "_type": "Value"}, "hypothesis": {"dtype": "string", "id": null, "_type": "Value"}, "idx": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["entailment", "not_entailment"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "super_glue", "config_name": "axg", "version": {"version_str": "1.0.2", "description": null, "major": 1, "minor": 0, "patch": 2}, "splits": {"test": {"name": "test", "num_bytes": 53581, "num_examples": 356, "dataset_name": "super_glue"}}, "download_checksums": {"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip": {"num_bytes": 10413, "checksum": "2d4e00d3a7d23d2c3787ee4c1382cc81a72cb05a76fc9d78d142949247ed61b9"}}, "download_size": 10413, "post_processing_size": null, "dataset_size": 53581, "size_in_bytes": 63994}}
\ No newline at end of file
diff --git a/datasets/super_glue/dummy/axb/1.0.2/dummy_data.zip b/datasets/super_glue/dummy/axb/1.0.3/dummy_data.zip
similarity index 100%
rename from datasets/super_glue/dummy/axb/1.0.2/dummy_data.zip
rename to datasets/super_glue/dummy/axb/1.0.3/dummy_data.zip
diff --git a/datasets/super_glue/dummy/axg/1.0.2/dummy_folder.zip b/datasets/super_glue/dummy/axg/1.0.3/dummy_folder.zip
similarity index 100%
rename from datasets/super_glue/dummy/axg/1.0.2/dummy_folder.zip
rename to datasets/super_glue/dummy/axg/1.0.3/dummy_folder.zip
diff --git a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/test.jsonl b/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/test.jsonl
deleted file mode 100644
index e70a260a261..00000000000
--- a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/test.jsonl
+++ /dev/null
@@ -1 +0,0 @@
-{"passage": "20 euro note -- Until now there has been only one complete series of euro notes; however a new series, similar to the current one, is being released. The European Central Bank will, in due time, announce when banknotes from the first series lose legal tender status.", "question": "is the first series 20 euro note still legal tender", "idx": 0}
diff --git a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/train.jsonl b/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/train.jsonl
deleted file mode 100644
index dfaf347b69e..00000000000
--- a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/train.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-{"question": "do iran and afghanistan speak the same language", "passage": "Persian language -- Persian (/\u02c8p\u025c\u02d0r\u0292\u0259n, -\u0283\u0259n/), also known by its endonym Farsi (\u0641\u0627\u0631\u0633\u06cc f\u0101rsi (f\u0252\u02d0\u027e\u02c8si\u02d0) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.", "idx": 0, "label": true}
-{"question": "do good samaritan laws protect those who help at an accident", "passage": "Good Samaritan law -- Good Samaritan laws offer legal protection to people who give reasonable assistance to those who are, or who they believe to be, injured, ill, in peril, or otherwise incapacitated. The protection is intended to reduce bystanders' hesitation to assist, for fear of being sued or prosecuted for unintentional injury or wrongful death. An example of such a law in common-law areas of Canada: a good Samaritan doctrine is a legal principle that prevents a rescuer who has voluntarily helped a victim in distress from being successfully sued for wrongdoing. Its purpose is to keep people from being reluctant to help a stranger in need for fear of legal repercussions should they make some mistake in treatment. By contrast, a duty to rescue law requires people to offer assistance and holds those who fail to do so liable.", "idx": 1, "label": true}
-{"question": "is windows movie maker part of windows essentials", "passage": "Windows Movie Maker -- Windows Movie Maker (formerly known as Windows Live Movie Maker in Windows 7) is a discontinued video editing software by Microsoft. It is a part of Windows Essentials software suite and offers the ability to create and edit videos as well as to publish them on OneDrive, Facebook, Vimeo, YouTube, and Flickr.", "idx": 2, "label": true}
diff --git a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/val.jsonl b/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/val.jsonl
deleted file mode 100644
index cf392a7a4b1..00000000000
--- a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data/BoolQ/val.jsonl
+++ /dev/null
@@ -1,2 +0,0 @@
-{"question": "does ethanol take more energy make that produces", "passage": "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical climate to grow productively, returns from 8 to 9 units of energy for each unit expended, as compared to corn, which only returns about 1.34 units of fuel energy for each unit of energy expended. A 2006 University of California Berkeley study, after analyzing six separate studies, concluded that producing ethanol from corn uses much less petroleum than producing gasoline.", "idx": 0, "label": false}
-{"question": "is house tax and property tax are same", "passage": "Property tax -- Property tax or 'house tax' is a local tax on buildings, along with appurtenant land. It is and imposed on the Possessor (not the custodian of property as per 1978, 44th amendment of constitution). It resembles the US-type wealth tax and differs from the excise-type UK rate. The tax power is vested in the states and is delegated to local bodies, specifying the valuation method, rate band, and collection procedures. The tax base is the annual rental value (ARV) or area-based rating. Owner-occupied and other properties not producing rent are assessed on cost and then converted into ARV by applying a percentage of cost, usually four percent. Vacant land is generally exempt. Central government properties are exempt. Instead a 'service charge' is permissible under executive order. Properties of foreign missions also enjoy tax exemption without requiring reciprocity. The tax is usually accompanied by service taxes, e.g., water tax, drainage tax, conservancy (sanitation) tax, lighting tax, all using the same tax base. The rate structure is flat on rural (panchayat) properties, but in the urban (municipal) areas it is mildly progressive with about 80% of assessments falling in the first two brackets.", "idx": 1, "label": true}
diff --git a/datasets/super_glue/dummy/boolq/1.0.2/dummy_data.zip b/datasets/super_glue/dummy/boolq/1.0.3/dummy_data.zip
similarity index 100%
rename from datasets/super_glue/dummy/boolq/1.0.2/dummy_data.zip
rename to datasets/super_glue/dummy/boolq/1.0.3/dummy_data.zip
diff --git a/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/test.jsonl b/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/test.jsonl
deleted file mode 100644
index a9d8d03b13b..00000000000
--- a/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/test.jsonl
+++ /dev/null
@@ -1 +0,0 @@
-{"premise": "Polly had to think quickly. They were still close enough to shore for him to return her to the police if she admitted she was not an experienced ocean sailor.", "hypothesis": "Polly was not an experienced ocean sailor", "idx": 0}
diff --git a/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/train.jsonl b/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/train.jsonl
deleted file mode 100644
index 43968583fb0..00000000000
--- a/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/train.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-{"premise": "It was a complex language. Not written down but handed down. One might say it was peeled down.", "hypothesis": "the language was peeled down", "label": "entailment", "idx": 0}
-{"premise": "It is part of their religion, a religion I do not scoff at as it holds many elements which match our own even though it lacks the truth of ours. At one of their great festivals they have the ritual of driving out the devils from their bodies. First the drummers come on - I may say that no women are allowed to take part in this ritual and the ladies here will perhaps agree with me that they are fortunate in that omission.", "hypothesis": "no women are allowed to take part in this ritual", "label": "entailment", "idx": 1}
-{"premise": "The Paris to Rouen railway was being extended to Le Havre, and the line cut straight through Dr Flaubert's land. Part of it was to be compulsorily purchased. You could say that Gustave was shepherded into creative retreat at Croisset by epilepsy.", "hypothesis": "Gustave was shepherded into creative retreat at Croisset by epilepsy", "label": "entailment", "idx": 2}
diff --git a/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/val.jsonl b/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/val.jsonl
deleted file mode 100644
index 0d802bd3d59..00000000000
--- a/datasets/super_glue/dummy/cb/1.0.2/dummy_data/CB/val.jsonl
+++ /dev/null
@@ -1,2 +0,0 @@
-{"premise": "Valence the void-brain, Valence the virtuous valet. Why couldn't the figger choose his own portion of titanic anatomy to shaft? Did he think he was helping?", "hypothesis": "Valence was helping", "label": "contradiction", "idx": 0}
-{"premise": "``Who knows? The point is, do we go with it or not?'' Do we assume there is a shipment?", "hypothesis": "there is a shipment", "label": "neutral", "idx": 1}
diff --git a/datasets/super_glue/dummy/copa/1.0.2/dummy_data/COPA.zip b/datasets/super_glue/dummy/copa/1.0.3/dummy_data/COPA.zip
similarity index 100%
rename from datasets/super_glue/dummy/copa/1.0.2/dummy_data/COPA.zip
rename to datasets/super_glue/dummy/copa/1.0.3/dummy_data/COPA.zip
diff --git a/datasets/super_glue/dummy/multirc/1.0.2/dummy_folder.zip b/datasets/super_glue/dummy/multirc/1.0.3/dummy_folder.zip
similarity index 100%
rename from datasets/super_glue/dummy/multirc/1.0.2/dummy_folder.zip
rename to datasets/super_glue/dummy/multirc/1.0.3/dummy_folder.zip
diff --git a/datasets/super_glue/dummy/record/1.0.2/dummy_data.zip b/datasets/super_glue/dummy/record/1.0.3/dummy_data.zip
similarity index 100%
rename from datasets/super_glue/dummy/record/1.0.2/dummy_data.zip
rename to datasets/super_glue/dummy/record/1.0.3/dummy_data.zip
diff --git a/datasets/super_glue/dummy/rte/1.0.2/dummy_data.zip b/datasets/super_glue/dummy/rte/1.0.3/dummy_data.zip
similarity index 100%
rename from datasets/super_glue/dummy/rte/1.0.2/dummy_data.zip
rename to datasets/super_glue/dummy/rte/1.0.3/dummy_data.zip
diff --git a/datasets/super_glue/dummy/wic/1.0.2/dummy_data.zip b/datasets/super_glue/dummy/wic/1.0.3/dummy_data.zip
similarity index 100%
rename from datasets/super_glue/dummy/wic/1.0.2/dummy_data.zip
rename to datasets/super_glue/dummy/wic/1.0.3/dummy_data.zip
diff --git a/datasets/super_glue/dummy/wsc/1.0.2/dummy_data.zip b/datasets/super_glue/dummy/wsc/1.0.3/dummy_data.zip
similarity index 100%
rename from datasets/super_glue/dummy/wsc/1.0.2/dummy_data.zip
rename to datasets/super_glue/dummy/wsc/1.0.3/dummy_data.zip
diff --git a/datasets/super_glue/super_glue.py b/datasets/super_glue/super_glue.py
index a778dfe3ace..18f830a2dbb 100644
--- a/datasets/super_glue/super_glue.py
+++ b/datasets/super_glue/super_glue.py
@@ -296,12 +296,13 @@ def __init__(self, features, data_url, citation, url, label_classes=("False", "T
           **kwargs: keyword arguments forwarded to super.
         """
         # Version history:
+        # 1.0.3: Added entity spans (start/end positions in the passage) to ReCoRD.
         # 1.0.2: Fixed non-nondeterminism in ReCoRD.
         # 1.0.1: Change from the pre-release trial version of SuperGLUE (v1.9) to
         #        the full release (v2.0).
         # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
         # 0.0.2: Initial version.
-        super(SuperGlueConfig, self).__init__(version=datasets.Version("1.0.2"), **kwargs)
+        super(SuperGlueConfig, self).__init__(version=datasets.Version("1.0.3"), **kwargs)
         self.features = features
         self.label_classes = label_classes
         self.data_url = data_url
@@ -355,7 +356,7 @@ class SuperGlue(datasets.GeneratorBasedBuilder):
             # Note that entities and answers will be a sequences of strings. Query
             # will contain @placeholder as a substring, which represents the word
             # to be substituted in.
-            features=["passage", "query", "entities", "answers"],
+            features=["passage", "query", "entities", "entity_spans", "answers"],
             data_url="https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip",
             citation=_RECORD_CITATION,
             url="https://sheng-z.github.io/ReCoRD-explorer/",
@@ -453,6 +454,14 @@ def _info(self):
         if self.config.name == "record":
             # Entities are the set of possible choices for the placeholder.
             features["entities"] = datasets.features.Sequence(datasets.Value("string"))
+            # Text and character span of each entity in the passage text; "end" is exclusive.
+            features["entity_spans"] = datasets.features.Sequence(
+                {
+                    "text": datasets.Value("string"),
+                    "start": datasets.Value("int32"),
+                    "end": datasets.Value("int32"),
+                }
+            )
             # Answers are the subset of entities that are correct.
             features["answers"] = datasets.features.Sequence(datasets.Value("string"))
         else:
@@ -523,11 +532,13 @@ def _generate_examples(self, data_file, split):
                             }
                 elif self.config.name == "record":
                     passage = row["passage"]
+                    entity_texts, entity_spans = _get_record_entities(passage)
                     for qa in row["qas"]:
                         yield qa["idx"], {
                             "passage": passage["text"],
                             "query": qa["query"],
-                            "entities": _get_record_entities(passage),
+                            "entities": entity_texts,
+                            "entity_spans": entity_spans,
                             "answers": _get_record_answers(qa),
                             "idx": {"passage": row["idx"], "query": qa["idx"]},
                         }
@@ -603,10 +614,25 @@ def _cast_label(label):
 def _get_record_entities(passage):
-    """Returns the unique set of entities."""
+    """Returns the unique entity texts and the entity spans of a ReCoRD passage.
+
+    Args:
+      passage: dict with a "text" string and an "entities" list of dicts holding
+        "start" and "end" offsets into that text ("end" is treated as inclusive).
+
+    Returns:
+      A tuple `(entity_texts, entity_spans)`: the sorted list of unique entity
+      strings, and a list of {"text", "start", "end"} dicts ordered by start
+      index, where "end" is exclusive so that text[start:end] == entity text.
+    """
     text = passage["text"]
-    entities = set()
+    entity_spans = []
     for entity in passage["entities"]:
-        entities.add(text[entity["start"] : entity["end"] + 1])
-    return sorted(entities)
+        entity_text = text[entity["start"] : entity["end"] + 1]
+        entity_spans.append({"text": entity_text, "start": entity["start"], "end": entity["end"] + 1})
+    entity_spans.sort(key=lambda e: e["start"])  # order spans by position in the passage
+    # Sort the unique texts: a bare set iterates in hash order, which would bring back the
+    # non-determinism fixed in 1.0.2 and change the type from the list returned before.
+    entity_texts = sorted({e["text"] for e in entity_spans})
+    return entity_texts, entity_spans
 
 
 def _get_record_answers(qa):