diff --git a/.circleci/config.yml b/.circleci/config.yml index ef36247feb6..f59d6bd5940 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -43,11 +43,12 @@ jobs: - checkout - run: conda update conda - run: conda install python=3.6 --yes + - run: Remove-Item c:\tools\miniconda3\lib\site-packages\ruamel* -Recurse -Force -Confirm:$false + - run: pip install ruamel.yaml - run: conda install pytorch --yes - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" - - run: pip install --ignore-installed ruamel-yaml - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow --upgrade @@ -63,11 +64,12 @@ jobs: - checkout - run: conda update conda - run: conda install python=3.6 --yes + - run: Remove-Item c:\tools\miniconda3\lib\site-packages\ruamel* -Recurse -Force -Confirm:$false + - run: pip install ruamel.yaml - run: conda install pytorch --yes - run: pip install virtualenv - run: python -m virtualenv venv --system-site-packages - run: "& venv/Scripts/activate.ps1" - - run: pip install --ignore-installed ruamel-yaml - run: pip install .[tests] - run: pip install -r additional-tests-requirements.txt --no-deps - run: pip install pyarrow==1.0.0 diff --git a/datasets/nq_open/README.md b/datasets/nq_open/README.md index f87d76b871d..92dfe3f54a1 100644 --- a/datasets/nq_open/README.md +++ b/datasets/nq_open/README.md @@ -9,6 +9,7 @@ licenses: - cc-by-sa-3.0 multilinguality: - monolingual +pretty_name: NQ-Open size_categories: - 10K str: """Path(...) on an url path like zip://file.txt::http://host.com/data.zip converts the :// to :/ @@ -219,6 +226,41 @@ def test_xopen_remote(): assert list(f) == TEST_URL_CONTENT.splitlines(keepends=True) +@pytest.mark.parametrize( + "input_path, expected_paths", + [ + ("tmp_path/*.txt", ["file1.txt", "file2.txt"]), + ("mock://*", ["mock://glob_test", "mock://misc", "mock://top_level"]), + ("mock://top_*", ["mock://top_level"]), + ( + "mock://top_level/second_level/date=2019-10-0[1-4]", + [ + "mock://top_level/second_level/date=2019-10-01", + "mock://top_level/second_level/date=2019-10-02", + "mock://top_level/second_level/date=2019-10-04", + ], + ), + ( + "mock://top_level/second_level/date=2019-10-0[1-4]/*", + [ + "mock://top_level/second_level/date=2019-10-01/a.parquet", + "mock://top_level/second_level/date=2019-10-01/b.parquet", + "mock://top_level/second_level/date=2019-10-02/a.parquet", + "mock://top_level/second_level/date=2019-10-04/a.parquet", + ], + ), + ], +) +def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec): + if input_path.startswith("tmp_path"): + input_path = input_path.replace("/", os.sep).replace("tmp_path", str(tmp_path)) + expected_paths = [str(tmp_path / file) for file in expected_paths] + for file in ["file1.txt", "file2.txt", "README.md"]: + (tmp_path / file).touch() + output_paths = sorted(xglob(input_path)) + assert output_paths == expected_paths + + @pytest.mark.parametrize( "input_path, pattern, expected_paths", [ @@ -246,20 +288,16 @@ def test_xopen_remote(): ), ], ) -def test_xpathglob(input_path, pattern, expected_paths, tmp_path): +def test_xpathglob(input_path, pattern, expected_paths, tmp_path, mock_fsspec): if input_path == "tmp_path": input_path = tmp_path expected_paths = [tmp_path / file for file in expected_paths] for file in ["file1.txt", "file2.txt", "README.md"]: (tmp_path / file).touch() - output_path = sorted(xpathglob(input_path, pattern)) else: - dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy() - dummy_registry["mock"] = DummyTestFS expected_paths = [Path(file) for file in expected_paths] - with patch.dict(datasets.utils.streaming_download_manager.fsspec.registry.target, dummy_registry): - output_path = sorted(xpathglob(Path(input_path), pattern)) - assert output_path == expected_paths + output_paths = sorted(xpathglob(Path(input_path), pattern)) + assert output_paths == expected_paths @pytest.mark.parametrize( @@ -306,7 +344,7 @@ def test_xpathglob(input_path, pattern, expected_paths, tmp_path): ), ], ) -def test_xpathrglob(input_path, pattern, expected_paths, tmp_path): +def test_xpathrglob(input_path, pattern, expected_paths, tmp_path, mock_fsspec): if input_path == "tmp_path": input_path = tmp_path dir_path = tmp_path / "dir" @@ -314,14 +352,10 @@ def test_xpathrglob(input_path, pattern, expected_paths, tmp_path): expected_paths = [dir_path / file for file in expected_paths] for file in ["file1.txt", "file2.txt", "README.md"]: (dir_path / file).touch() - output_path = sorted(xpathrglob(input_path, pattern)) else: - dummy_registry = datasets.utils.streaming_download_manager.fsspec.registry.target.copy() - dummy_registry["mock"] = DummyTestFS expected_paths = [Path(file) for file in expected_paths] - with patch.dict(datasets.utils.streaming_download_manager.fsspec.registry.target, dummy_registry): - output_path = sorted(xpathrglob(Path(input_path), pattern)) - assert output_path == expected_paths + output_paths = sorted(xpathrglob(Path(input_path), pattern)) + assert output_paths == expected_paths @pytest.mark.parametrize(