Merge branch 'master' into refac-dataset-builder-preparation
Showing 16 changed files with 483 additions and 22 deletions.
@@ -1 +1 @@
{"nq_open": {"description": "The NQ-Open task, introduced by Lee et.al. 2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.\n", "citation": "@article{doi:10.1162/tacl_a_00276,\n author = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav},\n title = {Natural Questions: A Benchmark for Question Answering Research},\n journal = {Transactions of the Association for Computational Linguistics},\n volume = {7},\n number = {},\n pages = {453-466},\n year = {2019},\n doi = {10.1162/tacl_a_00276},\n URL = {\n https://doi.org/10.1162/tacl_a_00276\n },\n eprint = {\n https://doi.org/10.1162/tacl_a_00276\n },\n abstract = { We present the Natural Questions corpus, a question answering data set. Questions consist of real anonymized, aggregated queries issued to the Google search engine. An annotator is presented with a question along with a Wikipedia page from the top 5 search results, and annotates a long answer (typically a paragraph) and a short answer (one or more entities) if present on the page, or marks null if no long/short answer is present. The public release consists of 307,373 training examples with single annotations; 7,830 examples with 5-way annotations for development data; and a further 7,842 examples with 5-way annotated sequestered as test data. We present experiments validating quality of the data. We also describe analysis of 25-way annotations on 302 examples, giving insights into human variability on the annotation task. We introduce robust metrics for the purposes of evaluating question answering systems; demonstrate high human upper bounds on these metrics; and establish baseline results using competitive methods drawn from related literature. }\n}\n\n@inproceedings{lee-etal-2019-latent,\n title = \"Latent Retrieval for Weakly Supervised Open Domain Question Answering\",\n author = \"Lee, Kenton and\n Chang, Ming-Wei and\n Toutanova, Kristina\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1612\",\n doi = \"10.18653/v1/P19-1612\",\n pages = \"6086--6096\",\n abstract = \"Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. 
On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.\",\n}\n", "homepage": "https://efficientqa.github.io/", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "builder_name": "nq_open", "config_name": "nq_open", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6651344, "num_examples": 87925, "dataset_name": "nq_open"}, "validation": {"name": "validation", "num_bytes": 156124, "num_examples": 1800, "dataset_name": "nq_open"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.efficientqa.dev.1.1.jsonl": {"num_bytes": 194998, "checksum": "0ad45e2fa9fd32d10b3cf08c63e6d22f97ba1a0ce44ea98040f4c18bce7e268d"}, "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.train.jsonl": {"num_bytes": 8522298, "checksum": "0c93aaf13b12f855628489665e4ef85ffb8573ad444646f11f4fabdc57a0aabf"}}, "download_size": 8717296, "post_processing_size": null, "dataset_size": 6807468, "size_in_bytes": 15524764}}
{"nq_open": {"description": "The NQ-Open task, introduced by Lee et.al. 2019,\nis an open domain question answering benchmark that is derived from Natural Questions.\nThe goal is to predict an English answer string for an input English question.\nAll questions can be answered using the contents of English Wikipedia.\n", "citation": "@article{doi:10.1162/tacl_a_00276,\n author = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav},\n title = {Natural Questions: A Benchmark for Question Answering Research},\n journal = {Transactions of the Association for Computational Linguistics},\n volume = {7},\n number = {},\n pages = {453-466},\n year = {2019},\n doi = {10.1162/tacl_a_00276},\n URL = {\n https://doi.org/10.1162/tacl_a_00276\n },\n eprint = {\n https://doi.org/10.1162/tacl_a_00276\n },\n abstract = { We present the Natural Questions corpus, a question answering data set. Questions consist of real anonymized, aggregated queries issued to the Google search engine. An annotator is presented with a question along with a Wikipedia page from the top 5 search results, and annotates a long answer (typically a paragraph) and a short answer (one or more entities) if present on the page, or marks null if no long/short answer is present. The public release consists of 307,373 training examples with single annotations; 7,830 examples with 5-way annotations for development data; and a further 7,842 examples with 5-way annotated sequestered as test data. We present experiments validating quality of the data. We also describe analysis of 25-way annotations on 302 examples, giving insights into human variability on the annotation task. We introduce robust metrics for the purposes of evaluating question answering systems; demonstrate high human upper bounds on these metrics; and establish baseline results using competitive methods drawn from related literature. }\n}\n\n@inproceedings{lee-etal-2019-latent,\n title = \"Latent Retrieval for Weakly Supervised Open Domain Question Answering\",\n author = \"Lee, Kenton and\n Chang, Ming-Wei and\n Toutanova, Kristina\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1612\",\n doi = \"10.18653/v1/P19-1612\",\n pages = \"6086--6096\",\n abstract = \"Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. 
On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.\",\n}\n", "homepage": "https://efficientqa.github.io/", "license": "", "features": {"question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "nq_open", "config_name": "nq_open", "version": {"version_str": "2.0.0", "description": "", "major": 2, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6651344, "num_examples": 87925, "dataset_name": "nq_open"}, "validation": {"name": "validation", "num_bytes": 313841, "num_examples": 3610, "dataset_name": "nq_open"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.dev.jsonl": {"num_bytes": 391316, "checksum": "f15567f38099f3615f5b8a685c0aef449c11ad90d3da3735e8d1b98115b40616"}, "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.train.jsonl": {"num_bytes": 8522298, "checksum": "0c93aaf13b12f855628489665e4ef85ffb8573ad444646f11f4fabdc57a0aabf"}}, "download_size": 8913614, "post_processing_size": null, "dataset_size": 6965185, "size_in_bytes": 15878799}}
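
The metadata change above bumps the `nq_open` config from version 1.0.0 to 2.0.0 and swaps the EfficientQA dev file (`NQ-open.efficientqa.dev.1.1.jsonl`) for the full NQ-open dev set (`NQ-open.dev.jsonl`), growing the validation split from 1,800 to 3,610 examples. The snippet below is a minimal sketch, not part of this change, showing how the resulting splits and features could be inspected; it assumes the Hugging Face `datasets` library and that `load_dataset` resolves to the updated builder.

```python
# Minimal sketch: inspect the nq_open splits and features recorded in the 2.0.0 metadata.
from datasets import load_dataset

nq = load_dataset("nq_open")

# The 2.0.0 metadata lists 87,925 training and 3,610 validation examples.
print({split: ds.num_rows for split, ds in nq.items()})

# Each example pairs a "question" string with a sequence of acceptable "answer" strings.
example = nq["validation"][0]
print(example["question"], example["answer"])
```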
@@ -0,0 +1,194 @@
---
annotations_creators:
- machine-generated
- expert-generated
language_creators:
- found
languages:
- sv-SE
licenses:
- cc-by-sa-4.0
multilinguality:
- monolingual
size_categories:
- 100K<n<1M
source_datasets:
- original
task_categories:
- structure-prediction
task_ids:
- named-entity-recognition
pretty_name: SwedMedNER
---

# Dataset Card for swedish_medical_ner

## Table of Contents
- [Dataset Description](#dataset-description)
  - [Dataset Summary](#dataset-summary)
  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
  - [Languages](#languages)
- [Dataset Structure](#dataset-structure)
  - [Data Instances](#data-instances)
  - [Data Fields](#data-fields)
  - [Data Splits](#data-splits)
- [Dataset Creation](#dataset-creation)
  - [Curation Rationale](#curation-rationale)
  - [Source Data](#source-data)
  - [Annotations](#annotations)
  - [Personal and Sensitive Information](#personal-and-sensitive-information)
- [Considerations for Using the Data](#considerations-for-using-the-data)
  - [Social Impact of Dataset](#social-impact-of-dataset)
  - [Discussion of Biases](#discussion-of-biases)
  - [Other Known Limitations](#other-known-limitations)
- [Additional Information](#additional-information)
  - [Dataset Curators](#dataset-curators)
  - [Licensing Information](#licensing-information)
  - [Citation Information](#citation-information)
  - [Contributions](#contributions)

## Dataset Description

- **Repository:** https://github.com/olofmogren/biomedical-ner-data-swedish
- **Paper:** [Named Entity Recognition in Swedish Health Records with Character-Based Deep Bidirectional LSTMs](https://aclanthology.org/W16-5104.pdf)
- **Point of Contact:** [Olof Mogren](olof@mogren.one)

### Dataset Summary

SwedMedNER is a named entity recognition dataset of medical text in Swedish. It consists of three subsets, each derived from a different source: the Swedish Wikipedia (a.k.a. wiki), Läkartidningen (a.k.a. lt), and 1177 Vårdguiden (a.k.a. 1177). The Swedish Wikipedia and Läkartidningen subsets together contain over 790,000 sequences of 60 characters each, while the 1177 Vårdguiden subset is manually annotated and contains 927 sentences with 2,740 annotations, of which 1,574 are _disorder and finding_, 546 are _pharmaceutical drug_, and 620 are _body structure_.

Texts from both the Swedish Wikipedia and Läkartidningen were automatically annotated using a list of medical seed terms. Sentences from 1177 Vårdguiden were manually annotated.

### Supported Tasks and Leaderboards

Medical named entity recognition (NER).

### Languages

Swedish (SV).

## Dataset Structure

### Data Instances

Annotated example sentences are shown below:

```
( Förstoppning ) är ett vanligt problem hos äldre.
[ Cox-hämmare ] finns även som gel och sprej.
[ Medicinen ] kan också göra att man blöder lättare eftersom den påverkar { blodets } förmåga att levra sig.
```

The entity tags are as follows (a small parsing sketch follows this list):
- Parentheses, (): Disorder and Finding
- Brackets, []: Pharmaceutical Drug
- Curly brackets, {}: Body Structure
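
Purely for illustration, the bracket notation can be read back into (type, text) pairs; the sketch below is not part of the dataset tooling.

```python
# Illustrative parser for the bracket notation above (not part of the dataset tooling).
import re

PATTERNS = {
    "Disorder and Finding": r"\(\s*([^)]+?)\s*\)",
    "Pharmaceutical Drug": r"\[\s*([^\]]+?)\s*\]",
    "Body Structure": r"\{\s*([^}]+?)\s*\}",
}

def parse(sentence):
    """Return (entity_type, entity_text) pairs found in an annotated sentence."""
    return [
        (entity_type, match.group(1))
        for entity_type, pattern in PATTERNS.items()
        for match in re.finditer(pattern, sentence)
    ]

print(parse("( Förstoppning ) är ett vanligt problem hos äldre."))
# [('Disorder and Finding', 'Förstoppning')]
```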

Loading the dataset and inspecting an example:

```
In: data = load_dataset('./datasets/swedish_medical_ner', "wiki")
In: data['train']
Out:
Dataset({
    features: ['sid', 'sentence', 'entities'],
    num_rows: 48720
})
In: data['train'][0]['sentence']
Out: '{kropp} beskrivs i till exempel människokroppen, anatomi och f'
In: data['train'][0]['entities']
Out: {'start': [0], 'end': [7], 'text': ['kropp'], 'type': [2]}
```

### Data Fields

- `sentence`
- `entities`
  - `start`: the start index
  - `end`: the end index
  - `text`: the text of the entity
  - `type`: the entity type: Disorder and Finding (0), Pharmaceutical Drug (1) or Body Structure (2)
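
For illustration, the numeric `type` labels can be mapped back to their names and the offsets used to slice the sentence. This is a minimal sketch based on the `wiki` example shown above; note that in that example the recorded offsets span the surrounding bracket characters as well.

```python
# Minimal sketch (assumes the dataset loads under the name used above; adjust the
# path/config if loading a local copy of the builder script).
from datasets import load_dataset

TYPE_NAMES = {0: "Disorder and Finding", 1: "Pharmaceutical Drug", 2: "Body Structure"}

data = load_dataset("swedish_medical_ner", "wiki")
example = data["train"][0]
sentence = example["sentence"]

# The parallel lists in `entities` describe one annotated span each.
for start, end, text, type_id in zip(
    example["entities"]["start"],
    example["entities"]["end"],
    example["entities"]["text"],
    example["entities"]["type"],
):
    # Slicing with the offsets recovers the span; in the example above the span
    # includes the curly brackets around the entity text.
    print(TYPE_NAMES[type_id], repr(text), repr(sentence[start:end]))
```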

### Data Splits

In the original paper, the authors used the text from Läkartidningen for model training, the Swedish Wikipedia for validation, and 1177.se for the final model evaluation.
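
A sketch of how that arrangement might be reproduced with the three subsets; the configuration names `lt` and `1177` are assumed from the aliases in the summary above (only `wiki` appears in the loading example) and may need adjusting.

```python
# Illustrative only: reproduce the train/validation/test arrangement described above.
# The configuration names are assumed from the aliases in the dataset summary.
from datasets import load_dataset

train_data = load_dataset("swedish_medical_ner", "lt")    # Läkartidningen: training
val_data = load_dataset("swedish_medical_ner", "wiki")    # Swedish Wikipedia: validation
test_data = load_dataset("swedish_medical_ner", "1177")   # 1177 Vårdguiden: final evaluation
```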

## Dataset Creation

### Curation Rationale

### Source Data

- Swedish Wikipedia;
- Läkartidningen - contains articles from the Swedish journal for medical professionals;
- 1177.se - a website provided by the Swedish public health care authorities, containing information, counselling, and other health-care services.

#### Initial Data Collection and Normalization

[More Information Needed]

#### Who are the source language producers?

[More Information Needed]

### Annotations

#### Annotation process

- A list of seed terms was extracted using SweMeSH and SNOMED CT;
- The following predefined categories were used for the extraction: disorder & finding (sjukdom & symtom), pharmaceutical drug (läkemedel) and body structure (kroppsdel);
- For the _Swedish Wikipedia_, an initial list of medical-domain articles was selected manually. These source articles, as well as their linked articles, were downloaded and automatically annotated by finding the aforementioned seed terms within a context window of 60 characters (see the sketch after this list);
- Articles from the _Läkartidningen_ corpus were downloaded and automatically annotated by finding the aforementioned seed terms within a context window of 60 characters;
- 15 documents from _1177.se_ were downloaded in May 2016 and then manually annotated with the seed terms as support, resulting in 2,740 annotations.
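
The automatic annotation of the Wikipedia and Läkartidningen texts can be pictured roughly as seed-term matching over a 60-character window. The sketch below is not the authors' pipeline; the seed terms and their categories are made-up placeholders.

```python
# Rough sketch of seed-term matching with a 60-character context window.
# NOT the authors' pipeline: the seed terms below are hypothetical placeholders.
import re

SEED_TERMS = {
    "förstoppning": "Disorder and Finding",  # hypothetical seed term
    "ibuprofen": "Pharmaceutical Drug",      # hypothetical seed term
}

def annotate(text: str, window: int = 60):
    """Yield (category, context) pairs for every seed-term match in `text`."""
    lowered = text.lower()
    for term, category in SEED_TERMS.items():
        for match in re.finditer(re.escape(term), lowered):
            center = (match.start() + match.end()) // 2
            left = max(0, center - window // 2)
            yield category, text[left:left + window]

for category, context in annotate("Förstoppning är ett vanligt problem hos äldre."):
    print(category, repr(context))
```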

#### Who are the annotators?

[More Information Needed]

### Personal and Sensitive Information

[More Information Needed]

## Considerations for Using the Data

### Social Impact of Dataset

[More Information Needed]

### Discussion of Biases

[More Information Needed]

### Other Known Limitations

[More Information Needed]

## Additional Information

### Dataset Curators

- Simon Almgren, simonwalmgren@gmail.com
- Sean Pavlov, sean.pavlov@gmail.com
- Olof Mogren, olof@mogren.one

All three are affiliated with Chalmers University of Technology.

### Licensing Information

This dataset is released under the [Creative Commons Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0)](http://creativecommons.org/licenses/by-sa/4.0/).

### Citation Information

```bibtex
@inproceedings{almgrenpavlovmogren2016bioner,
  title={Named Entity Recognition in Swedish Medical Journals with Deep Bidirectional Character-Based LSTMs},
  author={Simon Almgren and Sean Pavlov and Olof Mogren},
  booktitle={Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM 2016)},
  pages={1},
  year={2016}
}
```

### Contributions

Thanks to [@bwang482](https://github.com/bwang482) for adding this dataset.