From 93255c57c0d56f4dc32f2ec6847da27feb3ef9d2 Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin"
Date: Tue, 5 Oct 2021 19:56:55 +0800
Subject: [PATCH 1/5] Fix typo (#3023)

---
 src/datasets/fingerprint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py
index 1c5362afee8..d8cd45a1732 100644
--- a/src/datasets/fingerprint.py
+++ b/src/datasets/fingerprint.py
@@ -164,7 +164,7 @@ def proxy(func):
 
 
 class Hasher:
-    """Hasher that accepts python objets as inputs."""
+    """Hasher that accepts python objects as inputs."""
 
     dispatch: Dict = {}
 

From fdc02f3377bf44ab10ba3402e3dc387690ed6bfe Mon Sep 17 00:00:00 2001
From: Bo Wang <6764450+bwang482@users.noreply.github.com>
Date: Tue, 5 Oct 2021 08:13:33 -0400
Subject: [PATCH 2/5] add swedish_medical_ner dataset (#2940)

* add swedish_medical_ner dataset

* update swedish_medical_ner

* Apply suggestions from code review

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/swedish_medical_ner/README.md        | 194 +++++++++++++++++
 .../swedish_medical_ner/dataset_infos.json    |   1 +
 .../dummy/1177/1.0.0/dummy_data.zip           | Bin 0 -> 1256 bytes
 .../dummy/lt/1.0.0/dummy_data.zip             | Bin 0 -> 1256 bytes
 .../dummy/wiki/1.0.0/dummy_data.zip           | Bin 0 -> 1256 bytes
 .../swedish_medical_ner.py                    | 202 ++++++++++++++++++
 6 files changed, 397 insertions(+)
 create mode 100644 datasets/swedish_medical_ner/README.md
 create mode 100644 datasets/swedish_medical_ner/dataset_infos.json
 create mode 100644 datasets/swedish_medical_ner/dummy/1177/1.0.0/dummy_data.zip
 create mode 100644 datasets/swedish_medical_ner/dummy/lt/1.0.0/dummy_data.zip
 create mode 100644 datasets/swedish_medical_ner/dummy/wiki/1.0.0/dummy_data.zip
 create mode 100644 datasets/swedish_medical_ner/swedish_medical_ner.py

diff --git a/datasets/swedish_medical_ner/README.md b/datasets/swedish_medical_ner/README.md
new file mode 100644
index 00000000000..7c4b4910ae8
--- /dev/null
+++ b/datasets/swedish_medical_ner/README.md
@@ -0,0 +1,194 @@
+---
+annotations_creators:
+- machine-generated
+- expert-generated
+language_creators:
+- found
+languages:
+- sv-SE
+licenses:
+- cc-by-sa-4.0
+multilinguality:
+- monolingual
+size_categories:
+- 100K<n<1M
+[... remainder of the dataset card omitted ...]

diff --git a/datasets/swedish_medical_ner/dataset_infos.json b/datasets/swedish_medical_ner/dataset_infos.json
new file mode 100644
[single-line JSON dataset metadata omitted]

diff --git a/datasets/swedish_medical_ner/dummy/1177/1.0.0/dummy_data.zip b/datasets/swedish_medical_ner/dummy/1177/1.0.0/dummy_data.zip
new file mode 100644
GIT binary patch
literal 1256
[base85-encoded zip payload omitted]

diff --git a/datasets/swedish_medical_ner/dummy/lt/1.0.0/dummy_data.zip b/datasets/swedish_medical_ner/dummy/lt/1.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..961df338b37cf495b2713fd60ce5f6cf14acf639
GIT binary patch
literal 1256
[base85-encoded zip payload omitted]

diff --git a/datasets/swedish_medical_ner/dummy/wiki/1.0.0/dummy_data.zip b/datasets/swedish_medical_ner/dummy/wiki/1.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..961df338b37cf495b2713fd60ce5f6cf14acf639
GIT binary patch
literal 1256
[base85-encoded zip payload omitted]

diff --git a/datasets/swedish_medical_ner/swedish_medical_ner.py b/datasets/swedish_medical_ner/swedish_medical_ner.py
new file mode 100644
index 00000000000..a5055107531
--- /dev/null
+++ b/datasets/swedish_medical_ner/swedish_medical_ner.py
@@ -0,0 +1,202 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SwedMedNER: A Named Entity Recognition Dataset on medical texts in Swedish""" + + +import re + +import datasets + + +_CITATION = """\ +@inproceedings{almgrenpavlovmogren2016bioner, + title={Named Entity Recognition in Swedish Medical Journals with Deep Bidirectional Character-Based LSTMs}, + author={Simon Almgren, Sean Pavlov, Olof Mogren}, + booktitle={Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM 2016)}, + pages={1}, + year={2016} +} +""" + + +_DESCRIPTION = """\ +SwedMedNER is a dataset for training and evaluating Named Entity Recognition systems on medical texts in Swedish. +It is derived from medical articles on the Swedish Wikipedia, Läkartidningen, and 1177 Vårdguiden. +""" + + +_LICENSE = """\ +Creative Commons Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0) +See http://creativecommons.org/licenses/by-sa/4.0/ for the summary of the license. +""" + + +_URL = "https://github.com/olofmogren/biomedical-ner-data-swedish" + + +_DATA_URL = "https://raw.githubusercontent.com/olofmogren/biomedical-ner-data-swedish/master/" + + +class SwedishMedicalNerConfig(datasets.BuilderConfig): + """BuilderConfig for SwedMedNER""" + + def __init__(self, **kwargs): + """ + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(SwedishMedicalNerConfig, self).__init__(**kwargs) + + +class SwedishMedicalNer(datasets.GeneratorBasedBuilder): + """SwedMedNER: A Named Entity Recognition Dataset on medical texts in Swedish""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name="wiki", version=VERSION, description="The Swedish Wikipedia part of the dataset"), + datasets.BuilderConfig(name="lt", version=VERSION, description="The Läkartidningen part of the dataset"), + datasets.BuilderConfig(name="1177", version=VERSION, description="The 1177 Vårdguiden part of the dataset"), + ] + + def _info(self): + if self.config.name == "wiki": + features = datasets.Features( + { + "sid": datasets.Value("string"), + "sentence": datasets.Value("string"), + "entities": datasets.Sequence( + { + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "text": datasets.Value("string"), + "type": datasets.ClassLabel( + names=["Disorder and Finding", "Pharmaceutical Drug", "Body Structure"] + ), + } + ), + } + ) + elif self.config.name == "lt": + features = datasets.Features( + { + "sid": datasets.Value("string"), + "sentence": datasets.Value("string"), + "entities": datasets.Sequence( + { + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "text": datasets.Value("string"), + "type": datasets.ClassLabel( + names=["Disorder and Finding", "Pharmaceutical Drug", "Body Structure"] + ), + } + ), + } + ) + elif self.config.name == "1177": + features = datasets.Features( + { + "sid": datasets.Value("string"), + "sentence": datasets.Value("string"), + "entities": datasets.Sequence( + { + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "text": datasets.Value("string"), + "type": datasets.ClassLabel( + names=["Disorder and Finding", 
"Pharmaceutical Drug", "Body Structure"] + ), + } + ), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_URL, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + urls_to_download = { + "wiki": _DATA_URL + "Wiki_annotated_60.txt", + "lt": _DATA_URL + "LT_annotated_60.txt", + "1177": _DATA_URL + "1177_annotated_sentences.txt", + } + downloaded_files = dl_manager.download_and_extract(urls_to_download) + + if self.config.name == "wiki": + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["wiki"]}) + ] + elif self.config.name == "lt": + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["lt"]}) + ] + elif self.config.name == "1177": + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["1177"]}) + ] + + def _generate_examples(self, filepath): + """Yields examples as (key, example) tuples.""" + + def find_type(s, e): + if (s == "(") and (e == ")"): + return "Disorder and Finding" + elif (s == "[") and (e == "]"): + return "Pharmaceutical Drug" + elif (s == "{") and (e == "}"): + return "Body Structure" + + pattern = r"\[([^\[\]()]+)\]|\(([^\[\]()]+)\)|\{([^\[\]()]+)\}" + with open(filepath, encoding="utf-8") as f: + for id_, row in enumerate(f): + sentence = row.replace("\n", "") + + if self.config.name == "1177": + targets = [ + { + "start": m.start(0), + "end": m.end(0), + "text": sentence[m.start(0) + 2 : m.end(0) - 2], + "type": find_type(sentence[m.start(0)], sentence[m.end(0) - 1]), + } + for m in re.finditer(pattern, sentence) + ] + yield id_, { + "sid": self.config.name + "_" + str(id_), + "sentence": sentence, + "entities": targets if targets else [], + } + else: + targets = [ + { + "start": m.start(0), + "end": m.end(0), + "text": sentence[m.start(0) + 1 : m.end(0) - 1], + "type": find_type(sentence[m.start(0)], sentence[m.end(0) - 1]), + } + for m in re.finditer(pattern, sentence) + ] + yield id_, { + "sid": self.config.name + "_" + str(id_), + "sentence": sentence, + "entities": targets if targets else [], + } From 2814fbd0e18150be409f10804670e98d9ecb87d4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 5 Oct 2021 15:46:38 +0200 Subject: [PATCH 3/5] Extend support for streaming datasets that use glob.glob (#3015) * Test xglob * Implement xglob * Use xglob to patch glob.glob * Rename output_path to output_paths in glob tests * Create fixture to mock fsspec * Use fixture to mock fsspec in tests * Remove unused import * Pass recursive parameter to glob --- src/datasets/streaming.py | 2 + .../utils/streaming_download_manager.py | 25 ++++++++ tests/test_streaming_download_manager.py | 64 ++++++++++++++----- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py index 02d613eae24..60621c8c240 100644 --- a/src/datasets/streaming.py +++ b/src/datasets/streaming.py @@ -7,6 +7,7 @@ from .utils.patching import patch_submodule from .utils.streaming_download_manager import ( xdirname, + xglob, xjoin, xopen, xpandas_read_csv, @@ -47,6 +48,7 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str, patch_submodule(module, "open", partial(xopen, use_auth_token=use_auth_token)).start() else: patch_submodule(module, "open", xopen).start() + 
patch_submodule(module, "glob.glob", xglob).start()
     # allow to navigate in remote zip files
     patch_submodule(module, "os.path.join", xjoin).start()
     patch_submodule(module, "os.path.dirname", xdirname).start()

diff --git a/src/datasets/utils/streaming_download_manager.py b/src/datasets/utils/streaming_download_manager.py
index 0c64a009310..097977b8e70 100644
--- a/src/datasets/utils/streaming_download_manager.py
+++ b/src/datasets/utils/streaming_download_manager.py
@@ -1,3 +1,4 @@
+import glob
 import os
 import re
 import time
@@ -180,6 +181,30 @@ def xpathopen(path: Path, *args, **kwargs):
     return xopen(_as_posix(path), *args, **kwargs)
 
 
+def xglob(urlpath, *, recursive=False):
+    """Extend `glob.glob` function to support remote files.
+
+    Args:
+        urlpath (:obj:`str`): URL path with shell-style wildcard patterns.
+        recursive (:obj:`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
+            directories or subdirectories.
+
+    Returns:
+        :obj:`list` of :obj:`str`
+    """
+    main_hop, *rest_hops = urlpath.split("::")
+    if is_local_path(main_hop):
+        return glob.glob(main_hop, recursive=recursive)
+    else:
+        fs, *_ = fsspec.get_fs_token_paths(urlpath)
+        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
+        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
+        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
+        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
+        globbed_paths = fs.glob(main_hop)
+        return ["::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops) for globbed_path in globbed_paths]
+
+
 def xpathglob(path, pattern):
     """Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.
 
diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py
index 386e5e863c8..38e746cc464 100644
--- a/tests/test_streaming_download_manager.py
+++ b/tests/test_streaming_download_manager.py
@@ -1,7 +1,6 @@
 import os
 import re
 from pathlib import Path
-from unittest.mock import patch
 
 import pytest
 from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
@@ -12,6 +11,7 @@
     StreamingDownloadManager,
     _as_posix,
     _get_extraction_protocol,
+    xglob,
     xjoin,
     xopen,
     xpathglob,
@@ -117,6 +117,13 @@ def _open(
     )
 
 
+@pytest.fixture
+def mock_fsspec(monkeypatch):
+    dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy()
+    dummy_registry["mock"] = DummyTestFS
+    monkeypatch.setattr("datasets.utils.streaming_download_manager.fsspec.registry.target", dummy_registry)
+
+
 def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
     """Path(...)
on a URL path like zip://file.txt::http://host.com/data.zip
    converts the :// to :/
 
@@ -219,6 +226,41 @@ def test_xopen_remote():
         assert list(f) == TEST_URL_CONTENT.splitlines(keepends=True)
 
 
+@pytest.mark.parametrize(
+    "input_path, expected_paths",
+    [
+        ("tmp_path/*.txt", ["file1.txt", "file2.txt"]),
+        ("mock://*", ["mock://glob_test", "mock://misc", "mock://top_level"]),
+        ("mock://top_*", ["mock://top_level"]),
+        (
+            "mock://top_level/second_level/date=2019-10-0[1-4]",
+            [
+                "mock://top_level/second_level/date=2019-10-01",
+                "mock://top_level/second_level/date=2019-10-02",
+                "mock://top_level/second_level/date=2019-10-04",
+            ],
+        ),
+        (
+            "mock://top_level/second_level/date=2019-10-0[1-4]/*",
+            [
+                "mock://top_level/second_level/date=2019-10-01/a.parquet",
+                "mock://top_level/second_level/date=2019-10-01/b.parquet",
+                "mock://top_level/second_level/date=2019-10-02/a.parquet",
+                "mock://top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+    ],
+)
+def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):
+    if input_path.startswith("tmp_path"):
+        input_path = input_path.replace("/", os.sep).replace("tmp_path", str(tmp_path))
+        expected_paths = [str(tmp_path / file) for file in expected_paths]
+        for file in ["file1.txt", "file2.txt", "README.md"]:
+            (tmp_path / file).touch()
+    output_paths = sorted(xglob(input_path))
+    assert output_paths == expected_paths
+
+
 @pytest.mark.parametrize(
     "input_path, pattern, expected_paths",
     [
@@ -246,20 +288,16 @@ def test_xopen_remote():
         ),
     ],
 )
-def test_xpathglob(input_path, pattern, expected_paths, tmp_path):
+def test_xpathglob(input_path, pattern, expected_paths, tmp_path, mock_fsspec):
     if input_path == "tmp_path":
         input_path = tmp_path
         expected_paths = [tmp_path / file for file in expected_paths]
         for file in ["file1.txt", "file2.txt", "README.md"]:
             (tmp_path / file).touch()
-        output_path = sorted(xpathglob(input_path, pattern))
     else:
-        dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy()
-        dummy_registry["mock"] = DummyTestFS
         expected_paths = [Path(file) for file in expected_paths]
-        with patch.dict(datasets.utils.streaming_download_manager.fsspec.registry.target, dummy_registry):
-            output_path = sorted(xpathglob(Path(input_path), pattern))
-    assert output_path == expected_paths
+    output_paths = sorted(xpathglob(Path(input_path), pattern))
+    assert output_paths == expected_paths
 
 
 @pytest.mark.parametrize(
@@ -306,7 +344,7 @@ def test_xpathglob(input_path, pattern, expected_paths, tmp_path):
         ),
     ],
 )
-def test_xpathrglob(input_path, pattern, expected_paths, tmp_path):
+def test_xpathrglob(input_path, pattern, expected_paths, tmp_path, mock_fsspec):
     if input_path == "tmp_path":
         input_path = tmp_path
         dir_path = tmp_path / "dir"
         dir_path.mkdir()
         expected_paths = [dir_path / file for file in expected_paths]
         for file in ["file1.txt", "file2.txt", "README.md"]:
             (dir_path / file).touch()
-        output_path = sorted(xpathrglob(input_path, pattern))
     else:
-        dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy()
-        dummy_registry["mock"] = DummyTestFS
         expected_paths = [Path(file) for file in expected_paths]
-        with patch.dict(datasets.utils.streaming_download_manager.fsspec.registry.target, dummy_registry):
-            output_path = sorted(xpathrglob(Path(input_path), pattern))
-    assert output_path == expected_paths
+    output_paths = sorted(xpathrglob(Path(input_path), pattern))
+    assert output_paths == expected_paths
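# Usage sketch for the xglob helper added in this patch; the URLs and file
# names below are hypothetical, not taken from the diff. Local patterns are
# delegated to glob.glob, while remote fsspec URLs go through fs.glob and any
# "::" hop chain is re-appended to every match.
#
#   from datasets.utils.streaming_download_manager import xglob
#
#   # local path: behaves exactly like glob.glob
#   xglob("/tmp/data/*.txt")
#
#   # remote pattern inside a zip served over http; each result keeps the
#   # "::https://..." hop so it can later be opened with xopen
#   xglob("zip://*.jsonl::https://host.com/archive.zip")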
 @pytest.mark.parametrize(

From 83bc8a2f01a993233b7d7806c3bda5b6a657ef40 Mon Sep 17 00:00:00 2001
From: Colin Raffel
Date: Tue, 5 Oct 2021 10:56:44 -0400
Subject: [PATCH 4/5] Use standard open-domain validation split in nq_open (#3029)

* Use standard open-domain validation split in nq_open

* Update dataset_info.json

* Move and update dummy_data.zip

* Add pretty name
---
 datasets/nq_open/README.md                         |   3 ++-
 datasets/nq_open/dataset_infos.json                |   2 +-
 .../nq_open/dummy/nq_open/1.0.0/dummy_data.zip     | Bin 904 -> 0 bytes
 .../nq_open/dummy/nq_open/2.0.0/dummy_data.zip     | Bin 0 -> 1387 bytes
 datasets/nq_open/nq_open.py                        |   4 ++--
 5 files changed, 5 insertions(+), 4 deletions(-)
 delete mode 100644 datasets/nq_open/dummy/nq_open/1.0.0/dummy_data.zip
 create mode 100644 datasets/nq_open/dummy/nq_open/2.0.0/dummy_data.zip

diff --git a/datasets/nq_open/README.md b/datasets/nq_open/README.md
index f87d76b871d..92dfe3f54a1 100644
--- a/datasets/nq_open/README.md
+++ b/datasets/nq_open/README.md
@@ -9,6 +9,7 @@ licenses:
 - cc-by-sa-3.0
 multilinguality:
 - monolingual
+pretty_name: NQ-Open
 size_categories:
 - 10K<n<100K
[... remainder of this diff omitted ...]

diff --git a/datasets/nq_open/dataset_infos.json b/datasets/nq_open/dataset_infos.json
[single-line JSON metadata diff omitted]

diff --git a/datasets/nq_open/dummy/nq_open/1.0.0/dummy_data.zip b/datasets/nq_open/dummy/nq_open/1.0.0/dummy_data.zip
deleted file mode 100644
GIT binary patch
[904-byte base85-encoded zip payload omitted]

diff --git a/datasets/nq_open/dummy/nq_open/2.0.0/dummy_data.zip b/datasets/nq_open/dummy/nq_open/2.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..8d47eea0612e23736ddee4658de673a70f822a47
GIT binary patch
literal 1387
[base85-encoded zip payload omitted]

diff --git a/datasets/nq_open/nq_open.py b/datasets/nq_open/nq_open.py
index 111d7d23283..3fe997d5d52 100644
--- a/datasets/nq_open/nq_open.py
+++ b/datasets/nq_open/nq_open.py
@@ -65,7 +65,7 @@
 """
 
 _URLS = {
-    "dev": "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.efficientqa.dev.1.1.jsonl",
+    "dev": "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.dev.jsonl",
     "train": "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.train.jsonl",
 }
 
@@ -87,7 +87,7 @@ class NQOpen(datasets.GeneratorBasedBuilder):
     BUILDER_CONFIGS = [
         NQOpenConfig(
             name="nq_open",
-            version=datasets.Version("1.0.0", ""),
+            version=datasets.Version("2.0.0", ""),
             description="NQ_Open open domain question answering dataset.",
         ),
     ]

From 9379a5ac78ef5da1170d0b36d532d8620a7c0a78 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Tue, 5 Oct 2021 19:54:56 +0200
Subject: [PATCH 5/5] Actual "proper" install of ruamel.yaml in the windows CI (#3033)

* ruamel-yaml

* fix
---
 .circleci/config.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ef36247feb6..f59d6bd5940 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -43,11 +43,12 @@ jobs:
       - checkout
       - run: conda update conda
       - run: conda install python=3.6 --yes
+      - run: Remove-Item c:\tools\miniconda3\lib\site-packages\ruamel* -Recurse -Force -Confirm:$false
+      - run: pip install ruamel.yaml
       - run: conda install pytorch --yes
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
-      - run: pip install --ignore-installed ruamel-yaml
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow --upgrade
@@ -63,11 +64,12 @@
       - checkout
       - run: conda update conda
       - run: conda install python=3.6 --yes
+      - run: Remove-Item c:\tools\miniconda3\lib\site-packages\ruamel* -Recurse -Force -Confirm:$false
+      - run: pip install ruamel.yaml
       - run: conda install pytorch --yes
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
-      - run: pip install --ignore-installed ruamel-yaml
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow==1.0.0