From 93255c57c0d56f4dc32f2ec6847da27feb3ef9d2 Mon Sep 17 00:00:00 2001
From: "Li-Huai (Allan) Lin"
Date: Tue, 5 Oct 2021 19:56:55 +0800
Subject: [PATCH 1/5] Fix typo (#3023)

---
 src/datasets/fingerprint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py
index 1c5362afee8..d8cd45a1732 100644
--- a/src/datasets/fingerprint.py
+++ b/src/datasets/fingerprint.py
@@ -164,7 +164,7 @@ def proxy(func):
 
 
 class Hasher:
-    """Hasher that accepts python objets as inputs."""
+    """Hasher that accepts python objects as inputs."""
 
     dispatch: Dict = {}
 

From fdc02f3377bf44ab10ba3402e3dc387690ed6bfe Mon Sep 17 00:00:00 2001
From: Bo Wang <6764450+bwang482@users.noreply.github.com>
Date: Tue, 5 Oct 2021 08:13:33 -0400
Subject: [PATCH 2/5] add swedish_medical_ner dataset (#2940)

* add swedish_medical_ner dataset

* update swedish_medical_ner

* Apply suggestions from code review

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/swedish_medical_ner/README.md        | 194 +++++++++++++++++
 .../swedish_medical_ner/dataset_infos.json    |   1 +
 .../dummy/1177/1.0.0/dummy_data.zip           | Bin 0 -> 1256 bytes
 .../dummy/lt/1.0.0/dummy_data.zip             | Bin 0 -> 1256 bytes
 .../dummy/wiki/1.0.0/dummy_data.zip           | Bin 0 -> 1256 bytes
 .../swedish_medical_ner.py                    | 202 ++++++++++++++++++
 6 files changed, 397 insertions(+)
 create mode 100644 datasets/swedish_medical_ner/README.md
 create mode 100644 datasets/swedish_medical_ner/dataset_infos.json
 create mode 100644 datasets/swedish_medical_ner/dummy/1177/1.0.0/dummy_data.zip
 create mode 100644 datasets/swedish_medical_ner/dummy/lt/1.0.0/dummy_data.zip
 create mode 100644 datasets/swedish_medical_ner/dummy/wiki/1.0.0/dummy_data.zip
 create mode 100644 datasets/swedish_medical_ner/swedish_medical_ner.py

diff --git a/datasets/swedish_medical_ner/README.md b/datasets/swedish_medical_ner/README.md
new file mode 100644
index 00000000000..7c4b4910ae8
--- /dev/null
+++ b/datasets/swedish_medical_ner/README.md
@@ -0,0 +1,194 @@
+---
+annotations_creators:
+- machine-generated
+- expert-generated
+language_creators:
+- found
+languages:
+- sv-SE
+licenses:
+- cc-by-sa-4.0
+multilinguality:
+- monolingual
+size_categories:
+- 100K<n<1M
+[... remainder of the dataset card omitted ...]

diff --git a/datasets/swedish_medical_ner/dataset_infos.json b/datasets/swedish_medical_ner/dataset_infos.json
new file mode 100644
[single-line JSON dataset metadata omitted]

diff --git a/datasets/swedish_medical_ner/dummy/1177/1.0.0/dummy_data.zip b/datasets/swedish_medical_ner/dummy/1177/1.0.0/dummy_data.zip
new file mode 100644
GIT binary patch
literal 1256
[base85-encoded zip payload omitted]

diff --git a/datasets/swedish_medical_ner/dummy/lt/1.0.0/dummy_data.zip b/datasets/swedish_medical_ner/dummy/lt/1.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..961df338b37cf495b2713fd60ce5f6cf14acf639
GIT binary patch
literal 1256
[base85-encoded zip payload omitted]

diff --git a/datasets/swedish_medical_ner/dummy/wiki/1.0.0/dummy_data.zip b/datasets/swedish_medical_ner/dummy/wiki/1.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..961df338b37cf495b2713fd60ce5f6cf14acf639
GIT binary patch
literal 1256
[base85-encoded zip payload omitted]

diff --git a/datasets/swedish_medical_ner/swedish_medical_ner.py b/datasets/swedish_medical_ner/swedish_medical_ner.py
new file mode 100644
index 00000000000..a5055107531
--- /dev/null
+++ b/datasets/swedish_medical_ner/swedish_medical_ner.py
@@ -0,0 +1,202 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SwedMedNER: A Named Entity Recognition Dataset on medical texts in Swedish""" + + +import re + +import datasets + + +_CITATION = """\ +@inproceedings{almgrenpavlovmogren2016bioner, + title={Named Entity Recognition in Swedish Medical Journals with Deep Bidirectional Character-Based LSTMs}, + author={Simon Almgren, Sean Pavlov, Olof Mogren}, + booktitle={Proceedings of the Fifth Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM 2016)}, + pages={1}, + year={2016} +} +""" + + +_DESCRIPTION = """\ +SwedMedNER is a dataset for training and evaluating Named Entity Recognition systems on medical texts in Swedish. +It is derived from medical articles on the Swedish Wikipedia, Läkartidningen, and 1177 Vårdguiden. +""" + + +_LICENSE = """\ +Creative Commons Attribution-ShareAlike 4.0 International Public License (CC BY-SA 4.0) +See http://creativecommons.org/licenses/by-sa/4.0/ for the summary of the license. +""" + + +_URL = "https://github.com/olofmogren/biomedical-ner-data-swedish" + + +_DATA_URL = "https://raw.githubusercontent.com/olofmogren/biomedical-ner-data-swedish/master/" + + +class SwedishMedicalNerConfig(datasets.BuilderConfig): + """BuilderConfig for SwedMedNER""" + + def __init__(self, **kwargs): + """ + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(SwedishMedicalNerConfig, self).__init__(**kwargs) + + +class SwedishMedicalNer(datasets.GeneratorBasedBuilder): + """SwedMedNER: A Named Entity Recognition Dataset on medical texts in Swedish""" + + VERSION = datasets.Version("1.0.0") + + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name="wiki", version=VERSION, description="The Swedish Wikipedia part of the dataset"), + datasets.BuilderConfig(name="lt", version=VERSION, description="The Läkartidningen part of the dataset"), + datasets.BuilderConfig(name="1177", version=VERSION, description="The 1177 Vårdguiden part of the dataset"), + ] + + def _info(self): + if self.config.name == "wiki": + features = datasets.Features( + { + "sid": datasets.Value("string"), + "sentence": datasets.Value("string"), + "entities": datasets.Sequence( + { + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "text": datasets.Value("string"), + "type": datasets.ClassLabel( + names=["Disorder and Finding", "Pharmaceutical Drug", "Body Structure"] + ), + } + ), + } + ) + elif self.config.name == "lt": + features = datasets.Features( + { + "sid": datasets.Value("string"), + "sentence": datasets.Value("string"), + "entities": datasets.Sequence( + { + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "text": datasets.Value("string"), + "type": datasets.ClassLabel( + names=["Disorder and Finding", "Pharmaceutical Drug", "Body Structure"] + ), + } + ), + } + ) + elif self.config.name == "1177": + features = datasets.Features( + { + "sid": datasets.Value("string"), + "sentence": datasets.Value("string"), + "entities": datasets.Sequence( + { + "start": datasets.Value("int32"), + "end": datasets.Value("int32"), + "text": datasets.Value("string"), + "type": datasets.ClassLabel( + names=["Disorder and Finding", 
"Pharmaceutical Drug", "Body Structure"] + ), + } + ), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_URL, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + urls_to_download = { + "wiki": _DATA_URL + "Wiki_annotated_60.txt", + "lt": _DATA_URL + "LT_annotated_60.txt", + "1177": _DATA_URL + "1177_annotated_sentences.txt", + } + downloaded_files = dl_manager.download_and_extract(urls_to_download) + + if self.config.name == "wiki": + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["wiki"]}) + ] + elif self.config.name == "lt": + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["lt"]}) + ] + elif self.config.name == "1177": + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["1177"]}) + ] + + def _generate_examples(self, filepath): + """Yields examples as (key, example) tuples.""" + + def find_type(s, e): + if (s == "(") and (e == ")"): + return "Disorder and Finding" + elif (s == "[") and (e == "]"): + return "Pharmaceutical Drug" + elif (s == "{") and (e == "}"): + return "Body Structure" + + pattern = r"\[([^\[\]()]+)\]|\(([^\[\]()]+)\)|\{([^\[\]()]+)\}" + with open(filepath, encoding="utf-8") as f: + for id_, row in enumerate(f): + sentence = row.replace("\n", "") + + if self.config.name == "1177": + targets = [ + { + "start": m.start(0), + "end": m.end(0), + "text": sentence[m.start(0) + 2 : m.end(0) - 2], + "type": find_type(sentence[m.start(0)], sentence[m.end(0) - 1]), + } + for m in re.finditer(pattern, sentence) + ] + yield id_, { + "sid": self.config.name + "_" + str(id_), + "sentence": sentence, + "entities": targets if targets else [], + } + else: + targets = [ + { + "start": m.start(0), + "end": m.end(0), + "text": sentence[m.start(0) + 1 : m.end(0) - 1], + "type": find_type(sentence[m.start(0)], sentence[m.end(0) - 1]), + } + for m in re.finditer(pattern, sentence) + ] + yield id_, { + "sid": self.config.name + "_" + str(id_), + "sentence": sentence, + "entities": targets if targets else [], + } From 2814fbd0e18150be409f10804670e98d9ecb87d4 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 5 Oct 2021 15:46:38 +0200 Subject: [PATCH 3/5] Extend support for streaming datasets that use glob.glob (#3015) * Test xglob * Implement xglob * Use xglob to patch glob.glob * Rename output_path to output_paths in glob tests * Create fixture to mock fsspec * Use fixture to mock fsspec in tests * Remove unused import * Pass recursive parameter to glob --- src/datasets/streaming.py | 2 + .../utils/streaming_download_manager.py | 25 ++++++++ tests/test_streaming_download_manager.py | 64 ++++++++++++++----- 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/src/datasets/streaming.py b/src/datasets/streaming.py index 02d613eae24..60621c8c240 100644 --- a/src/datasets/streaming.py +++ b/src/datasets/streaming.py @@ -7,6 +7,7 @@ from .utils.patching import patch_submodule from .utils.streaming_download_manager import ( xdirname, + xglob, xjoin, xopen, xpandas_read_csv, @@ -47,6 +48,7 @@ def extend_module_for_streaming(module_path, use_auth_token: Optional[Union[str, patch_submodule(module, "open", partial(xopen, use_auth_token=use_auth_token)).start() else: patch_submodule(module, "open", xopen).start() + 
patch_submodule(module, "glob.glob", xglob).start()
     # allow to navigate in remote zip files
     patch_submodule(module, "os.path.join", xjoin).start()
     patch_submodule(module, "os.path.dirname", xdirname).start()

diff --git a/src/datasets/utils/streaming_download_manager.py b/src/datasets/utils/streaming_download_manager.py
index 0c64a009310..097977b8e70 100644
--- a/src/datasets/utils/streaming_download_manager.py
+++ b/src/datasets/utils/streaming_download_manager.py
@@ -1,3 +1,4 @@
+import glob
 import os
 import re
 import time
@@ -180,6 +181,30 @@ def xpathopen(path: Path, *args, **kwargs):
     return xopen(_as_posix(path), *args, **kwargs)
 
 
+def xglob(urlpath, *, recursive=False):
+    """Extend `glob.glob` function to support remote files.
+
+    Args:
+        urlpath (:obj:`str`): URL path with shell-style wildcard patterns.
+        recursive (:obj:`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
+            directories or subdirectories.
+
+    Returns:
+        :obj:`list` of :obj:`str`
+    """
+    main_hop, *rest_hops = urlpath.split("::")
+    if is_local_path(main_hop):
+        return glob.glob(main_hop, recursive=recursive)
+    else:
+        fs, *_ = fsspec.get_fs_token_paths(urlpath)
+        # - If there's no "*" in the pattern, get_fs_token_paths() doesn't do any pattern matching
+        #   so to be able to glob patterns like "[0-9]", we have to call `fs.glob`.
+        # - Also "*" in get_fs_token_paths() only matches files: we have to call `fs.glob` to match directories.
+        # - If there is "**" in the pattern, `fs.glob` must be called anyway.
+        globbed_paths = fs.glob(main_hop)
+        return ["::".join([f"{fs.protocol}://{globbed_path}"] + rest_hops) for globbed_path in globbed_paths]
+
+
 def xpathglob(path, pattern):
     """Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths and remote URLs.
 
diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py
index 386e5e863c8..38e746cc464 100644
--- a/tests/test_streaming_download_manager.py
+++ b/tests/test_streaming_download_manager.py
@@ -1,7 +1,6 @@
 import os
 import re
 from pathlib import Path
-from unittest.mock import patch
 
 import pytest
 from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
@@ -12,6 +11,7 @@
     StreamingDownloadManager,
     _as_posix,
     _get_extraction_protocol,
+    xglob,
     xjoin,
     xopen,
     xpathglob,
@@ -117,6 +117,13 @@ def _open(
     )
 
 
+@pytest.fixture
+def mock_fsspec(monkeypatch):
+    dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy()
+    dummy_registry["mock"] = DummyTestFS
+    monkeypatch.setattr("datasets.utils.streaming_download_manager.fsspec.registry.target", dummy_registry)
+
+
 def _readd_double_slash_removed_by_path(path_as_posix: str) -> str:
     """Path(...)
on a URL path like zip://file.txt::http://host.com/data.zip
    converts the :// to :/
 
@@ -219,6 +226,41 @@ def test_xopen_remote():
         assert list(f) == TEST_URL_CONTENT.splitlines(keepends=True)
 
 
+@pytest.mark.parametrize(
+    "input_path, expected_paths",
+    [
+        ("tmp_path/*.txt", ["file1.txt", "file2.txt"]),
+        ("mock://*", ["mock://glob_test", "mock://misc", "mock://top_level"]),
+        ("mock://top_*", ["mock://top_level"]),
+        (
+            "mock://top_level/second_level/date=2019-10-0[1-4]",
+            [
+                "mock://top_level/second_level/date=2019-10-01",
+                "mock://top_level/second_level/date=2019-10-02",
+                "mock://top_level/second_level/date=2019-10-04",
+            ],
+        ),
+        (
+            "mock://top_level/second_level/date=2019-10-0[1-4]/*",
+            [
+                "mock://top_level/second_level/date=2019-10-01/a.parquet",
+                "mock://top_level/second_level/date=2019-10-01/b.parquet",
+                "mock://top_level/second_level/date=2019-10-02/a.parquet",
+                "mock://top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+    ],
+)
+def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):
+    if input_path.startswith("tmp_path"):
+        input_path = input_path.replace("/", os.sep).replace("tmp_path", str(tmp_path))
+        expected_paths = [str(tmp_path / file) for file in expected_paths]
+        for file in ["file1.txt", "file2.txt", "README.md"]:
+            (tmp_path / file).touch()
+    output_paths = sorted(xglob(input_path))
+    assert output_paths == expected_paths
+
+
 @pytest.mark.parametrize(
     "input_path, pattern, expected_paths",
     [
@@ -246,20 +288,16 @@ def test_xopen_remote():
         ),
     ],
 )
-def test_xpathglob(input_path, pattern, expected_paths, tmp_path):
+def test_xpathglob(input_path, pattern, expected_paths, tmp_path, mock_fsspec):
     if input_path == "tmp_path":
         input_path = tmp_path
         expected_paths = [tmp_path / file for file in expected_paths]
         for file in ["file1.txt", "file2.txt", "README.md"]:
             (tmp_path / file).touch()
-        output_path = sorted(xpathglob(input_path, pattern))
     else:
-        dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy()
-        dummy_registry["mock"] = DummyTestFS
         expected_paths = [Path(file) for file in expected_paths]
-        with patch.dict(datasets.utils.streaming_download_manager.fsspec.registry.target, dummy_registry):
-            output_path = sorted(xpathglob(Path(input_path), pattern))
-    assert output_path == expected_paths
+    output_paths = sorted(xpathglob(Path(input_path), pattern))
+    assert output_paths == expected_paths
 
 
 @pytest.mark.parametrize(
@@ -306,7 +344,7 @@ def test_xpathglob(input_path, pattern, expected_paths, tmp_path):
         ),
     ],
 )
-def test_xpathrglob(input_path, pattern, expected_paths, tmp_path):
+def test_xpathrglob(input_path, pattern, expected_paths, tmp_path, mock_fsspec):
     if input_path == "tmp_path":
         input_path = tmp_path
         dir_path = tmp_path / "dir"
         dir_path.mkdir()
         expected_paths = [dir_path / file for file in expected_paths]
         for file in ["file1.txt", "file2.txt", "README.md"]:
             (dir_path / file).touch()
-        output_path = sorted(xpathrglob(input_path, pattern))
     else:
-        dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy()
-        dummy_registry["mock"] = DummyTestFS
         expected_paths = [Path(file) for file in expected_paths]
-        with patch.dict(datasets.utils.streaming_download_manager.fsspec.registry.target, dummy_registry):
-            output_path = sorted(xpathrglob(Path(input_path), pattern))
-    assert output_path == expected_paths
+    output_paths = sorted(xpathrglob(Path(input_path), pattern))
+    assert output_paths == expected_paths
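# Usage sketch for the xglob helper added in this patch; the URLs and file
# names below are hypothetical, not taken from the diff. Local patterns are
# delegated to glob.glob, while remote fsspec URLs go through fs.glob and any
# "::" hop chain is re-appended to every match.
#
#   from datasets.utils.streaming_download_manager import xglob
#
#   # local path: behaves exactly like glob.glob
#   xglob("/tmp/data/*.txt")
#
#   # remote pattern inside a zip served over http; each result keeps the
#   # "::https://..." hop so it can later be opened with xopen
#   xglob("zip://*.jsonl::https://host.com/archive.zip")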
 @pytest.mark.parametrize(

From 83bc8a2f01a993233b7d7806c3bda5b6a657ef40 Mon Sep 17 00:00:00 2001
From: Colin Raffel
Date: Tue, 5 Oct 2021 10:56:44 -0400
Subject: [PATCH 4/5] Use standard open-domain validation split in nq_open (#3029)

* Use standard open-domain validation split in nq_open

* Update dataset_info.json

* Move and update dummy_data.zip

* Add pretty name
---
 datasets/nq_open/README.md                         |   3 ++-
 datasets/nq_open/dataset_infos.json                |   2 +-
 .../nq_open/dummy/nq_open/1.0.0/dummy_data.zip     | Bin 904 -> 0 bytes
 .../nq_open/dummy/nq_open/2.0.0/dummy_data.zip     | Bin 0 -> 1387 bytes
 datasets/nq_open/nq_open.py                        |   4 ++--
 5 files changed, 5 insertions(+), 4 deletions(-)
 delete mode 100644 datasets/nq_open/dummy/nq_open/1.0.0/dummy_data.zip
 create mode 100644 datasets/nq_open/dummy/nq_open/2.0.0/dummy_data.zip

diff --git a/datasets/nq_open/README.md b/datasets/nq_open/README.md
index f87d76b871d..92dfe3f54a1 100644
--- a/datasets/nq_open/README.md
+++ b/datasets/nq_open/README.md
@@ -9,6 +9,7 @@ licenses:
 - cc-by-sa-3.0
 multilinguality:
 - monolingual
+pretty_name: NQ-Open
 size_categories:
 - 10K<n<100K
[... remainder of this diff omitted ...]

diff --git a/datasets/nq_open/dataset_infos.json b/datasets/nq_open/dataset_infos.json
[single-line JSON metadata diff omitted]

diff --git a/datasets/nq_open/dummy/nq_open/1.0.0/dummy_data.zip b/datasets/nq_open/dummy/nq_open/1.0.0/dummy_data.zip
deleted file mode 100644
GIT binary patch
[904-byte base85-encoded zip payload omitted]

diff --git a/datasets/nq_open/dummy/nq_open/2.0.0/dummy_data.zip b/datasets/nq_open/dummy/nq_open/2.0.0/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..8d47eea0612e23736ddee4658de673a70f822a47
GIT binary patch
literal 1387
[base85-encoded zip payload omitted]

diff --git a/datasets/nq_open/nq_open.py b/datasets/nq_open/nq_open.py
index 111d7d23283..3fe997d5d52 100644
--- a/datasets/nq_open/nq_open.py
+++ b/datasets/nq_open/nq_open.py
@@ -65,7 +65,7 @@
 """
 
 _URLS = {
-    "dev": "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.efficientqa.dev.1.1.jsonl",
+    "dev": "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.dev.jsonl",
     "train": "https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/nq_open/NQ-open.train.jsonl",
 }
 
@@ -87,7 +87,7 @@ class NQOpen(datasets.GeneratorBasedBuilder):
     BUILDER_CONFIGS = [
         NQOpenConfig(
             name="nq_open",
-            version=datasets.Version("1.0.0", ""),
+            version=datasets.Version("2.0.0", ""),
             description="NQ_Open open domain question answering dataset.",
         ),
     ]

From 9379a5ac78ef5da1170d0b36d532d8620a7c0a78 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Tue, 5 Oct 2021 19:54:56 +0200
Subject: [PATCH 5/5] Actual "proper" install of ruamel.yaml in the windows CI (#3033)

* ruamel-yaml

* fix
---
 .circleci/config.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ef36247feb6..f59d6bd5940 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -43,11 +43,12 @@ jobs:
       - checkout
       - run: conda update conda
       - run: conda install python=3.6 --yes
+      - run: Remove-Item c:\tools\miniconda3\lib\site-packages\ruamel* -Recurse -Force -Confirm:$false
+      - run: pip install ruamel.yaml
       - run: conda install pytorch --yes
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
-      - run: pip install --ignore-installed ruamel-yaml
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow --upgrade
@@ -63,11 +64,12 @@
       - checkout
       - run: conda update conda
       - run: conda install python=3.6 --yes
+      - run: Remove-Item c:\tools\miniconda3\lib\site-packages\ruamel* -Recurse -Force -Confirm:$false
+      - run: pip install ruamel.yaml
       - run: conda install pytorch --yes
       - run: pip install virtualenv
       - run: python -m virtualenv venv --system-site-packages
       - run: "& venv/Scripts/activate.ps1"
-      - run: pip install --ignore-installed ruamel-yaml
       - run: pip install .[tests]
       - run: pip install -r additional-tests-requirements.txt --no-deps
       - run: pip install pyarrow==1.0.0