From 680d31a72dbe8188cc1a81a2eb8eb4140a9caa37 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 31 Aug 2023 10:38:22 +0200
Subject: [PATCH 1/3] Test split order in DataFilesDict

---
 tests/test_data_files.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 34bfb26332e..d617a9c7c19 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -1,3 +1,4 @@
+import copy
 import os
 from pathlib import Path, PurePath
 from typing import List
@@ -385,6 +386,13 @@ def test_DataFilesList_from_patterns_raises_FileNotFoundError(complex_data_dir):
         DataFilesList.from_patterns(["file_that_doesnt_exist.txt"], complex_data_dir)
 
 
+class TestDataFilesDict:
+    def test_key_order_after_copy(self):
+        data_files = DataFilesDict({"train": "train.csv", "test": "test.csv"})
+        copied_data_files = copy.deepcopy(data_files)
+        assert list(copied_data_files.keys()) == list(data_files.keys())  # test split order with list()
+
+
 @pytest.mark.parametrize("pattern", _TEST_PATTERNS)
 def test_DataFilesDict_from_patterns_in_dataset_repository(
     hub_dataset_repo_path, hub_dataset_repo_patterns_results, pattern

From 416e9b777ba457acd8371154d8bb9c714a347f96 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 31 Aug 2023 10:39:10 +0200
Subject: [PATCH 2/3] Remove key sorting in DataFilesDict

---
 src/datasets/data_files.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index fd1eb739c99..77c3ae18284 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -682,17 +682,6 @@ def from_patterns(
             )
         return out
 
-    def __reduce__(self):
-        """
-        To make sure the order of the keys doesn't matter when pickling and hashing:
-
-        >>> from datasets.data_files import DataFilesDict
-        >>> from datasets.fingerprint import Hasher
-        >>> assert Hasher.hash(DataFilesDict(a=[], b=[])) == Hasher.hash(DataFilesDict(b=[], a=[]))
-
-        """
-        return DataFilesDict, (dict(sorted(self.items())),)
-
     def filter_extensions(self, extensions: List[str]) -> "DataFilesDict":
         out = type(self)()
         for key, data_files_list in self.items():

From 0f3b6eaf69d3352394d3bf3c4d6ed01dd2af5860 Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Thu, 31 Aug 2023 11:21:31 +0200
Subject: [PATCH 3/3] Fix test_cache_dir_for_data_files

---
 tests/test_builder.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/test_builder.py b/tests/test_builder.py
index 45b7574ffc7..54bae47ae08 100644
--- a/tests/test_builder.py
+++ b/tests/test_builder.py
@@ -759,10 +759,6 @@ def test_cache_dir_for_data_files(self):
                 cache_dir=tmp_dir, data_files={"train": [dummy_data1], "test": dummy_data2}
             )
             self.assertEqual(builder.cache_dir, other_builder.cache_dir)
-            other_builder = DummyGeneratorBasedBuilder(
-                cache_dir=tmp_dir, data_files={"test": dummy_data2, "train": dummy_data1}
-            )
-            self.assertEqual(builder.cache_dir, other_builder.cache_dir)
             other_builder = DummyGeneratorBasedBuilder(
                 cache_dir=tmp_dir, data_files={"train": dummy_data1, "validation": dummy_data2}
             )