huggingface · lhoestq · Dec 6, 2021 · Nov 13, 2021 · Dec 2, 2021 · Dec 2, 2021
diff --git a/datasets/definite_pronoun_resolution/dataset_infos.json b/datasets/definite_pronoun_resolution/dataset_infos.json
@@ -1 +1 @@
-{"plain_text": {"description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.\n", "citation": "@inproceedings{rahman2012resolving,\n  title={Resolving complex cases of definite pronouns: the winograd schema challenge},\n  author={Rahman, Altaf and Ng, Vincent},\n  booktitle={Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},\n  pages={777--789},\n  year={2012},\n  organization={Association for Computational Linguistics}\n}", "homepage": "http://www.hlt.utdallas.edu/~vince/data/emnlp12/", "license": "", "features": {"sentence": {"dtype": "string", "id": null, "_type": "Value"}, "pronoun": {"dtype": "string", "id": null, "_type": "Value"}, "candidates": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": 2, "id": null, "_type": "Sequence"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": {"input": "sentence", "output": "label"}, "builder_name": "definite_pronoun_resolution", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 72187, "num_examples": 564, "dataset_name": 
"definite_pronoun_resolution"}, "train": {"name": "train", "num_bytes": 172672, "num_examples": 1322, "dataset_name": "definite_pronoun_resolution"}}, "download_checksums": {"http://www.hlt.utdallas.edu/~vince/data/emnlp12/train.c.txt": {"num_bytes": 160409, "checksum": "bc0a54b3ca1009d8d5b2ca5a221086aee2b0fc5cd03b22b9dfa9cdf44c629cec"}, "http://www.hlt.utdallas.edu/~vince/data/emnlp12/test.c.txt": {"num_bytes": 67044, "checksum": "cf1cf025e1d59a5b363e6dcfbd5a13aae8a9830831ac16fd7f865aca7a1559d8"}}, "download_size": 227453, "dataset_size": 244859, "size_in_bytes": 472312}}
+{"plain_text": {"description": "Composed by 30 students from one of the author's undergraduate classes. These\nsentence pairs cover topics ranging from real events (e.g., Iran's plan to\nattack the Saudi ambassador to the U.S.) to events/characters in movies (e.g.,\nBatman) and purely imaginary situations, largely reflecting the pop culture as\nperceived by the American kids born in the early 90s. Each annotated example\nspans four lines: the first line contains the sentence, the second line contains\nthe target pronoun, the third line contains the two candidate antecedents, and\nthe fourth line contains the correct antecedent. If the target pronoun appears\nmore than once in the sentence, its first occurrence is the one to be resolved.\n", "citation": "@inproceedings{rahman2012resolving,\n  title={Resolving complex cases of definite pronouns: the winograd schema challenge},\n  author={Rahman, Altaf and Ng, Vincent},\n  booktitle={Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},\n  pages={777--789},\n  year={2012},\n  organization={Association for Computational Linguistics}\n}", "homepage": "http://www.hlt.utdallas.edu/~vince/data/emnlp12/", "license": "", "features": {"sentence": {"dtype": "string", "id": null, "_type": "Value"}, "pronoun": {"dtype": "string", "id": null, "_type": "Value"}, "candidates": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": 2, "id": null, "_type": "Sequence"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "sentence", "output": "label"}, "task_templates": null, "builder_name": "definite_pronoun_resolution", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 71691, "num_examples": 564, 
"dataset_name": "definite_pronoun_resolution"}, "train": {"name": "train", "num_bytes": 171511, "num_examples": 1322, "dataset_name": "definite_pronoun_resolution"}}, "download_checksums": {"https://s3.amazonaws.com/datasets.huggingface.co/definite_pronoun_resolution/train.c.txt": {"num_bytes": 160408, "checksum": "c310158d0cbac1a556e3284e6c167f4478271d8d50b0b9d15dfe428c905a0867"}, "https://s3.amazonaws.com/datasets.huggingface.co/definite_pronoun_resolution/test.c.txt": {"num_bytes": 67044, "checksum": "cf1cf025e1d59a5b363e6dcfbd5a13aae8a9830831ac16fd7f865aca7a1559d8"}}, "download_size": 227452, "post_processing_size": null, "dataset_size": 243202, "size_in_bytes": 470654}}
diff --git a/datasets/definite_pronoun_resolution/definite_pronoun_resolution.py b/datasets/definite_pronoun_resolution/definite_pronoun_resolution.py
@@ -42,7 +42,8 @@
 more than once in the sentence, its first occurrence is the one to be resolved.
 """
 
-_DATA_URL_PATTERN = "http://www.hlt.utdallas.edu/~vince/data/emnlp12/{}.c.txt"
+
+_DATA_URL_PATTERN = "https://s3.amazonaws.com/datasets.huggingface.co/definite_pronoun_resolution/{}.c.txt"
 
 
 class DefinitePronounResolution(datasets.GeneratorBasedBuilder):

diff --git a/datasets/jeopardy/dataset_infos.json b/datasets/jeopardy/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "\nDataset containing 216,930 Jeopardy questions, answers and other data.\n\nThe json file is an unordered list of questions where each question has\n'category' : the question category, e.g. \"HISTORY\"\n'value' : integer $ value of the question as string, e.g. \"200\"\nNote: This is \"None\" for Final Jeopardy! and Tiebreaker questions\n'question' : text of question\nNote: This sometimes contains hyperlinks and other things messy text such as when there's a picture or video question\n'answer' : text of answer\n'round' : one of \"Jeopardy!\",\"Double Jeopardy!\",\"Final Jeopardy!\" or \"Tiebreaker\"\nNote: Tiebreaker questions do happen but they're very rare (like once every 20 years)\n'show_number' : int of show number, e.g '4680'\n'air_date' : string of the show air date in format YYYY-MM-DD\n", "citation": "\n", "homepage": "https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "air_date": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "value": {"dtype": "int32", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "round": {"dtype": "string", "id": null, "_type": "Value"}, "show_number": {"dtype": "int32", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "jeopardy", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 36132636, "num_examples": 216930, "dataset_name": "jeopardy"}}, "download_checksums": {"http://skeeto.s3.amazonaws.com/share/JEOPARDY_QUESTIONS1.json.gz": {"num_bytes": 12721082, "checksum": "e031eeab6eecc1f36efe7addb01cef2cd3623da16cd0e5de7c4661cf11dbd4f0"}}, "download_size": 12721082, "dataset_size": 
36132636, "size_in_bytes": 48853718}}
+{"default": {"description": "\nDataset containing 216,930 Jeopardy questions, answers and other data.\n\nThe json file is an unordered list of questions where each question has\n'category' : the question category, e.g. \"HISTORY\"\n'value' : integer $ value of the question as string, e.g. \"200\"\nNote: This is \"None\" for Final Jeopardy! and Tiebreaker questions\n'question' : text of question\nNote: This sometimes contains hyperlinks and other things messy text such as when there's a picture or video question\n'answer' : text of answer\n'round' : one of \"Jeopardy!\",\"Double Jeopardy!\",\"Final Jeopardy!\" or \"Tiebreaker\"\nNote: Tiebreaker questions do happen but they're very rare (like once every 20 years)\n'show_number' : int of show number, e.g '4680'\n'air_date' : string of the show air date in format YYYY-MM-DD\n", "citation": "\n", "homepage": "https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/", "license": "", "features": {"category": {"dtype": "string", "id": null, "_type": "Value"}, "air_date": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "value": {"dtype": "int32", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "round": {"dtype": "string", "id": null, "_type": "Value"}, "show_number": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "jeopardy", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 35916080, "num_examples": 216930, "dataset_name": "jeopardy"}}, "download_checksums": {"https://drive.google.com/uc?export=download&id=0BwT5wj_P7BKXb2hfM3d2RHU1ckE": {"num_bytes": 55554625, "checksum": "e93247fed3511e025b9f4c7d3a6ab3334c77546401fb1d64f4d558af09506308"}}, "download_size": 55554625, 
"post_processing_size": null, "dataset_size": 35916080, "size_in_bytes": 91470705}}
diff --git a/datasets/jeopardy/jeopardy.py b/datasets/jeopardy/jeopardy.py
@@ -27,7 +27,7 @@
 'air_date' : string of the show air date in format YYYY-MM-DD
 """
 _URL = "https://www.reddit.com/r/datasets/comments/1uyd0t/200000_jeopardy_questions_in_a_json_file/"
-_DATA_URL = "http://skeeto.s3.amazonaws.com/share/JEOPARDY_QUESTIONS1.json.gz"
+_DATA_URL = "https://drive.google.com/uc?export=download&id=0BwT5wj_P7BKXb2hfM3d2RHU1ckE"
 _DATA_FILE = "JEOPARDY_QUESTIONS1.json"
 
 

diff --git a/datasets/wiki_auto/README.md b/datasets/wiki_auto/README.md
@@ -86,18 +86,17 @@ The data in all of the configurations looks a little different.
 A `manual` config instance consists of a sentence from the Simple English Wikipedia article, one from the linked English Wikipedia article, IDs for each of them, and a label indicating whether they are  aligned. Sentences on either side can be repeated so that the aligned sentences are in the same instances. For example:
 ```
 {'alignment_label': 1,
- 'normal_sentence': 'The Local Government Act 1985 is an Act of Parliament in the United Kingdom.',
  'normal_sentence_id': '0_66252-1-0-0',
- 'simple_sentence': 'The Local Government Act 1985 was an Act of Parliament in the United Kingdom.',
- 'simple_sentence_id': '0_66252-0-0-0'}
+ 'simple_sentence_id': '0_66252-0-0-0',
+ 'normal_sentence': 'The Local Government Act 1985 is an Act of Parliament in the United Kingdom.', 'simple_sentence': 'The Local Government Act 1985 was an Act of Parliament in the United Kingdom', 'gleu_score': 0.800000011920929}
 ```
 Is followed by
 ```
 {'alignment_label': 0,
- 'normal_sentence': 'Its main effect was to abolish the six county councils of the metropolitan counties that had been set up in 1974, 11 years earlier, by the Local Government Act 1972, along with the Greater London Council that had been established in 1965.',
  'normal_sentence_id': '0_66252-1-0-1',
- 'simple_sentence': 'The Local Government Act 1985 was an Act of Parliament in the United Kingdom.',
- 'simple_sentence_id': '0_66252-0-0-0'}
+ 'simple_sentence_id': '0_66252-0-0-0',
+ 'normal_sentence': 'Its main effect was to abolish the six county councils of the metropolitan counties that had been set up in 1974, 11 years earlier, by the Local Government Act 1972, along with the Greater London Council that had been established in 1965.',
+ 'simple_sentence': 'The Local Government Act 1985 was an Act of Parliament in the United Kingdom', 'gleu_score': 0.08641975373029709}
 ```
 
 The `auto` config shows a pair of an English and corresponding Simple English Wikipedia as an instance, with an alignment at the paragraph and sentence level:
@@ -146,9 +145,10 @@ The data has the following field:
 - `normal_sentence_id`: a unique ID for each English Wikipedia sentence. The last two dash-separated numbers correspond to the paragraph number in the article and the sentence number in the paragraph.
 - `simple_sentence`: a sentence from Simple English Wikipedia.
 - `simple_sentence_id`: a unique ID for each Simple English Wikipedia sentence. The last two dash-separated numbers correspond to the paragraph number in the article and the sentence number in the paragraph.
-- `alignment_label`: signifies whether a pair of sentences is aligned: labels are `1:aligned` and `0:notAligned`
+- `alignment_label`: signifies whether a pair of sentences is aligned: labels are `2:partialAligned`, `1:aligned` and `0:notAligned`
 - `paragraph_alignment`: a first step of alignment mapping English and Simple English paragraphs from linked articles
 - `sentence_alignment`: the full alignment mapping English and Simple English sentences from linked articles
+- `gleu_score`: the sentence-level GLEU (Google-BLEU) score for each pair.
 
 ### Data Splits