From d8da2e7b30f4067fa415360c621173e3c7807d65 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 17 Jul 2023 17:26:01 +0200
Subject: [PATCH 1/9] rename pattern to path

---
 docs/source/repository_structure.mdx | 21 +++++++++------------
 src/datasets/arrow_dataset.py        |  6 +++---
 src/datasets/data_files.py           | 21 +++++++++++++++------
 src/datasets/dataset_dict.py         |  4 ++--
 src/datasets/utils/metadata.py       |  8 ++++----
 tests/test_upstream_hub.py           | 18 +++++++++---------
 6 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index a9ace91d45c..84a7ec62e00 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -24,7 +24,7 @@ In this simple case, you'll get a dataset with two splits: `train` (containing e
 
 ## Splits
 
-If you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md using glob patterns.
+If you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md.
 
 For example, given a repository like this one:
 
@@ -45,9 +45,9 @@ configs:
 - config_name: default
   data_files:
   - split: train
-    pattern: "directory1/*.csv"
+    path: "directory1/*.csv"
   - split: test
-    pattern: "directory2/*.csv"
+    path: "directory2/*.csv"
 ---
 ```
 
@@ -55,7 +55,7 @@ configs:
 
 Note that `config_name` field is required even if you have a single configuration.
 
-Having several patterns per split is also supported:
+Having several paths per split is also supported:
 
 ```
 my_dataset_repository/
@@ -74,11 +74,11 @@ configs:
 - config_name: default
   data_files:
   - split: train
-    pattern:
+    paths:
     - "directory1/*.csv"
     - "directory1bis/*.csv"
   - split: test
-    pattern:
+    paths:
     - "directory2/*.csv"
 ---
 ```
@@ -89,14 +89,11 @@ configs:
 - config_name: default
   data_files:
   - split: random
-    pattern:
-    - "directory1bis/*.csv"
+    path: "directory1bis/*.csv"
   - split: train
-    pattern:
-    - "directory1/*.csv"
+    path: "directory1/*.csv"
   - split: test
-    pattern:
-    - "directory2/*.csv"
+    path: "directory2/*.csv"
 ---
 ```
 
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 7bee3ea2ab1..c9e98bcdf0a 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5512,7 +5512,7 @@ def push_to_hub(
             }
             default_metadata_configs_to_dump = {
                 "data_files": [
-                    {"split": _resolved_split, "pattern": f"data/{_resolved_split}-*"}
+                    {"split": _resolved_split, "path": f"data/{_resolved_split}-*"}
                     for _resolved_split in _resolved_splits
                 ]
             }
@@ -5530,13 +5530,13 @@ def push_to_hub(
                 "data_files": [
                     {
                         "split": _split,
-                        "pattern": _pattern[0] if isinstance(_pattern, list) and len(_pattern) == 1 else _pattern,
+                        "path" if len(_pattern) == 1 else "paths": _pattern[0] if len(_pattern) == 1 else _pattern,
                     }
                     for _split, _pattern in data_files_to_dump.items()
                 ]
             }
         else:
-            metadata_config_to_dump = {"data_files": [{"split": split, "pattern": f"{data_dir}/{split}-*"}]}
+            metadata_config_to_dump = {"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"}]}
         # push to the deprecated dataset_infos.json
         if config.DATASETDICT_INFOS_FILENAME in repo_files:
             download_config = DownloadConfig()
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 2ebaa14fd63..3d1c659dda7 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -99,17 +99,26 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
     elif isinstance(patterns, list):
         if any(isinstance(pattern, dict) for pattern in patterns):
             for pattern in patterns:
-                if not isinstance(pattern, dict) or sorted(pattern) != ["pattern", "split"]:
-                    raise ValueError(
-                        f"Expected each pattern in a list of patterns to be a string or a list, but got {pattern}"
+                if (
+                    not isinstance(pattern, dict)
+                    or sorted(pattern) != ["path", "split"]
+                    or sorted(pattern) != ["paths", "split"]
+                ):
+                    raise ValueError(f"Expected each split to have 'path' or 'paths', but got {pattern}")
+                if "path" in pattern:
+                    if not isinstance(pattern["path"], str):
+                        raise TypeError(
+                            f"Expected 'path' to be a string, but got {type(pattern['path'])} in {pattern}"
+                        )
+                elif not isinstance(pattern["paths"], list):
+                    raise TypeError(
+                        f"Expected 'paths' to be a list of strings, but got {type(pattern['paths'])} in {pattern}"
                     )
             splits = [pattern["split"] for pattern in patterns]
             if len(set(splits)) != len(splits):
                 raise ValueError(f"Some splits are duplicated in data_files: {splits}")
             return {
-                str(pattern["split"]): pattern["pattern"]
-                if isinstance(pattern["pattern"], list)
-                else [pattern["pattern"]]
+                str(pattern["split"]): [pattern["path"]] if "path" in pattern else [pattern["paths"]]
                 for pattern in patterns
             }
         else:
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index ffdb74b17a8..dc9cc7dd848 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -1660,7 +1660,7 @@ def push_to_hub(
         info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes
 
         metadata_config_to_dump = {
-            "data_files": [{"split": split, "pattern": f"{data_dir}/{split}-*"} for split in self.keys()],
+            "data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()],
         }
 
         api = HfApi(endpoint=config.HF_ENDPOINT)
@@ -1715,7 +1715,7 @@ def push_to_hub(
             }
             default_metadata_configs_to_dump = {
                 "data_files": [
-                    {"split": _resolved_split, "pattern": f"data/{_resolved_split}-*"}
+                    {"split": _resolved_split, "path": f"data/{_resolved_split}-*"}
                     for _resolved_split in _resolved_splits
                 ]
             }
diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index 5df937b49c0..97137560505 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -128,7 +128,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
     yaml_error_message = textwrap.dedent(
         f"""
         Expected data_files in YAML to be either a string or a list of strings
-        or a list of dicts with two keys: 'split' and 'pattern', but got {yaml_data_files}
+        or a list of dicts with two keys: 'split' and 'path', but got {yaml_data_files}
         Examples of data_files in YAML:
 
            data_files: data.csv
@@ -141,9 +141,9 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
 
            data_files:
            - split: train
-             pattern: train/*
+             path: train/*
            - split: test
-             pattern: test/*
+             path: test/*
         """
     )
     if not isinstance(yaml_data_files, (list, str)):
@@ -154,7 +154,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
                 isinstance(yaml_data_files_item, dict)
                 and sorted(yaml_data_files_item)
                 == [
-                    "pattern",
+                    "path",
                     "split",
                 ]
             ):
diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py
index a2f5f832833..7ab1de5d2ec 100644
--- a/tests/test_upstream_hub.py
+++ b/tests/test_upstream_hub.py
@@ -614,19 +614,19 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(self, temp
             {
                 "config_name": "config1",
                 "data_files": [
-                    {"split": "train", "pattern": "config1/train-*"},
+                    {"split": "train", "path": "config1/train-*"},
"path": "config1/train-*"}, ], }, { "config_name": "config2", "data_files": [ - {"split": "train", "pattern": "config2/train-*"}, + {"split": "train", "path": "config2/train-*"}, ], }, { "config_name": "default", "data_files": [ - {"split": "train", "pattern": "data/train-*"}, + {"split": "train", "path": "data/train-*"}, ], }, ] @@ -743,22 +743,22 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(self, { "config_name": "config1", "data_files": [ - {"split": "train", "pattern": "config1/train-*"}, - {"split": "random", "pattern": "config1/random-*"}, + {"split": "train", "path": "config1/train-*"}, + {"split": "random", "path": "config1/random-*"}, ], }, { "config_name": "config2", "data_files": [ - {"split": "train", "pattern": "config2/train-*"}, - {"split": "random", "pattern": "config2/random-*"}, + {"split": "train", "path": "config2/train-*"}, + {"split": "random", "path": "config2/random-*"}, ], }, { "config_name": "default", "data_files": [ - {"split": "train", "pattern": "data/train-*"}, - {"split": "random", "pattern": "data/random-*"}, + {"split": "train", "path": "data/train-*"}, + {"split": "random", "path": "data/random-*"}, ], }, ] From 5be59becaa65f1fa08129091b8c778823e4a50ac Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 17 Jul 2023 17:40:51 +0200 Subject: [PATCH 2/9] docs --- docs/source/repository_structure.mdx | 47 +++++++++++++--------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx index 84a7ec62e00..cea65f3986c 100644 --- a/docs/source/repository_structure.mdx +++ b/docs/source/repository_structure.mdx @@ -31,10 +31,8 @@ For example, given a repository like this one: ``` my_dataset_repository/ ├── README.md -├── directory1/ -│ └── bees.csv -└── directory2/ - └── more_bees.csv +├── data.csv +└── holdout.csv ``` You can define your splits by adding the `configs` field in the YAML block at the top of your README.md: @@ -45,27 +43,23 @@ configs: - config_name: default data_files: - split: train - path: "directory1/*.csv" + path: "data.csv" - split: test - path: "directory2/*.csv" + path: "holdout.csv" --- ``` - -Note that `config_name` field is required even if you have a single configuration. - -Having several paths per split is also supported: +You can select multiple files per split using a list of paths: ``` my_dataset_repository/ ├── README.md -├── directory1/ -│ └── bees.csv -├── directory1bis/ -│ └── more_bees.csv -└── directory2/ - └── even_more_bees.csv +├── data/ +│ ├── abc.csv +│ └── def.csv +└── holdout/ + └── ghi.csv ``` ```yaml @@ -75,28 +69,31 @@ configs: data_files: - split: train paths: - - "directory1/*.csv" - - "directory1bis/*.csv" + - "data/abc.csv" + - "data/def.csv" - split: test - paths: - - "directory2/*.csv" + path: "holdout/ghi.csv" --- ``` -Custom split names are also supported: +Or you can use glob patterns to not have to manually list all the files: + ```yaml +--- configs: - config_name: default data_files: - - split: random - path: "directory1bis/*.csv" - split: train - path: "directory1/*.csv" + path: "data/*.csv" - split: test - path: "directory2/*.csv" + path: "holdout/*.csv" --- ``` + +Note that `config_name` field is required even if you have a single configuration. + + ## Configurations Your dataset might have several subsets of data that you want to be able to load separately. 
From 4904f14459c862f0ab525ec034a636177be5dee4 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Mon, 17 Jul 2023 17:53:17 +0200
Subject: [PATCH 3/9] better _raise_if_data_files_field_not_valid

---
 src/datasets/data_files.py     | 20 ++++++--------------
 src/datasets/utils/metadata.py | 23 ++++++++++++++++-------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 3d1c659dda7..d79bbe235cf 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -99,21 +99,13 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
     elif isinstance(patterns, list):
         if any(isinstance(pattern, dict) for pattern in patterns):
             for pattern in patterns:
-                if (
-                    not isinstance(pattern, dict)
-                    or sorted(pattern) != ["path", "split"]
-                    or sorted(pattern) != ["paths", "split"]
+                if not (
+                    isinstance(pattern, dict)
+                    and len(pattern) == 2
+                    and "split" in pattern
+                    and (isinstance(pattern.get("path"), str) or isinstance(pattern.get("paths"), list))
                 ):
-                    raise ValueError(f"Expected each split to have 'path' or 'paths', but got {pattern}")
-                if "path" in pattern:
-                    if not isinstance(pattern["path"], str):
-                        raise TypeError(
-                            f"Expected 'path' to be a string, but got {type(pattern['path'])} in {pattern}"
-                        )
-                elif not isinstance(pattern["paths"], list):
-                    raise TypeError(
-                        f"Expected 'paths' to be a list of strings, but got {type(pattern['paths'])} in {pattern}"
-                    )
+                    raise ValueError(f"Expected each split to have a 'path' or a list of 'paths', but got {pattern}")
             splits = [pattern["split"] for pattern in patterns]
             if len(set(splits)) != len(splits):
                 raise ValueError(f"Some splits are duplicated in data_files: {splits}")
diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index 97137560505..a8d2aa6e9cf 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -128,7 +128,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
     yaml_error_message = textwrap.dedent(
         f"""
         Expected data_files in YAML to be either a string or a list of strings
-        or a list of dicts with two keys: 'split' and 'path', but got {yaml_data_files}
+        or a list of dicts with two keys: 'split' and 'path' (or 'paths'), but got {yaml_data_files}
         Examples of data_files in YAML:
 
            data_files: data.csv
@@ -144,19 +144,28 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
              path: train/*
            - split: test
              path: test/*
+
+           data_files:
+           - split: train
+             paths:
+             - train/part1/*
+             - train/part2/*
+           - split: test
+             path: test/*
         """
     )
     if not isinstance(yaml_data_files, (list, str)):
         raise ValueError(yaml_error_message)
     if isinstance(yaml_data_files, list):
         for yaml_data_files_item in yaml_data_files:
-            if not isinstance(yaml_data_files_item, str) and not (
+            if not isinstance(yaml_data_files_item, str) or (
                 isinstance(yaml_data_files_item, dict)
-                and sorted(yaml_data_files_item)
-                == [
-                    "path",
-                    "split",
-                ]
+                and len(yaml_data_files_item) == 2
+                and "split" in yaml_data_files_item
+                and (
+                    isinstance(yaml_data_files_item.get("path"), str)
+                    or isinstance(yaml_data_files_item.get("paths"), list)
+                )
             ):
                 raise ValueError(yaml_error_message)

From 6ea38fc40ee2b10d3b5c6df09b09ad05e02a2cff Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Tue, 18 Jul 2023 11:14:55 +0200
Subject: [PATCH 4/9] Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/repository_structure.mdx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index cea65f3986c..9aefe508aba 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -76,7 +76,7 @@ configs:
 ---
 ```
 
-Or you can use glob patterns to not have to manually list all the files:
+Or you can use glob patterns to automatically list all the files:
 
 ```yaml
 ---
@@ -91,7 +91,9 @@ configs:
 ```
 
+
 Note that `config_name` field is required even if you have a single configuration.
+
 
 ## Configurations

From d7298d4d1b169442a8d0bc8c1667298bb89ca501 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 18 Jul 2023 14:39:59 +0200
Subject: [PATCH 5/9] fix check

---
 src/datasets/utils/metadata.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index a8d2aa6e9cf..bc178cb8fcf 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -158,13 +158,16 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
         raise ValueError(yaml_error_message)
     if isinstance(yaml_data_files, list):
         for yaml_data_files_item in yaml_data_files:
-            if not isinstance(yaml_data_files_item, str) or (
-                isinstance(yaml_data_files_item, dict)
-                and len(yaml_data_files_item) == 2
-                and "split" in yaml_data_files_item
-                and (
-                    isinstance(yaml_data_files_item.get("path"), str)
-                    or isinstance(yaml_data_files_item.get("paths"), list)
+            if (
+                not isinstance(yaml_data_files_item, (str, dict))
+                or isinstance(yaml_data_files_item, dict)
+                and not (
+                    len(yaml_data_files_item) == 2
+                    and "split" in yaml_data_files_item
+                    and (
+                        isinstance(yaml_data_files_item.get("path"), str)
+                        or isinstance(yaml_data_files_item.get("paths"), list)
+                    )
                 )
             ):
                 raise ValueError(yaml_error_message)

From d6d2ba47759d8acbf3d750b1cc4d89b195b1f9c9 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 18 Jul 2023 17:03:16 +0200
Subject: [PATCH 6/9] fix

---
 src/datasets/arrow_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index c9e98bcdf0a..97acac4e7c4 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5525,7 +5525,7 @@ def push_to_hub(
             else:
                 data_files_to_dump = {}
             # add the new split
-            data_files_to_dump[split] = f"{data_dir}/{split}-*"
+            data_files_to_dump[split] = [f"{data_dir}/{split}-*"]
             metadata_config_to_dump = {
                 "data_files": [
                     {

From 8c9c24d1d90f0c2db043ae2bc39f7c292454a58c Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 18 Jul 2023 17:52:15 +0200
Subject: [PATCH 7/9] only "path" (removed plural)

---
 docs/source/repository_structure.mdx | 2 +-
 src/datasets/arrow_dataset.py        | 2 +-
 src/datasets/data_files.py           | 6 +++---
 src/datasets/utils/metadata.py       | 9 +++------
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index 9aefe508aba..18c8ae559ff 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -68,7 +68,7 @@ configs:
 - config_name: default
   data_files:
   - split: train
-    paths:
+    path:
     - "data/abc.csv"
     - "data/def.csv"
   - split: test
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 97acac4e7c4..1693d229727 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5530,7 +5530,7 @@ def push_to_hub(
                 "data_files": [
                     {
                         "split": _split,
-                        "path" if len(_pattern) == 1 else "paths": _pattern[0] if len(_pattern) == 1 else _pattern,
+                        "path": _pattern[0] if len(_pattern) == 1 else _pattern,
                     }
                     for _split, _pattern in data_files_to_dump.items()
                 ]
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index d79bbe235cf..9cb04e43837 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -103,14 +103,14 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
                     isinstance(pattern, dict)
                     and len(pattern) == 2
                     and "split" in pattern
-                    and (isinstance(pattern.get("path"), str) or isinstance(pattern.get("paths"), list))
+                    and isinstance(pattern.get("path"), (str, list))
                 ):
-                    raise ValueError(f"Expected each split to have a 'path' or a list of 'paths', but got {pattern}")
+                    raise ValueError(f"Expected each split to have a 'path' (which can be a list), but got {pattern}")
             splits = [pattern["split"] for pattern in patterns]
             if len(set(splits)) != len(splits):
                 raise ValueError(f"Some splits are duplicated in data_files: {splits}")
             return {
-                str(pattern["split"]): [pattern["path"]] if "path" in pattern else [pattern["paths"]]
+                str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
                 for pattern in patterns
             }
         else:
diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index bc178cb8fcf..b2db9d687b3 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -128,7 +128,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
     yaml_error_message = textwrap.dedent(
         f"""
         Expected data_files in YAML to be either a string or a list of strings
-        or a list of dicts with two keys: 'split' and 'path' (or 'paths'), but got {yaml_data_files}
+        or a list of dicts with two keys: 'split' and 'path', but got {yaml_data_files}
         Examples of data_files in YAML:
 
            data_files: data.csv
@@ -147,7 +147,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
 
            data_files:
            - split: train
-             paths:
+             path:
              - train/part1/*
              - train/part2/*
            - split: test
@@ -164,10 +164,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
                 and not (
                     len(yaml_data_files_item) == 2
                     and "split" in yaml_data_files_item
-                    and (
-                        isinstance(yaml_data_files_item.get("path"), str)
-                        or isinstance(yaml_data_files_item.get("paths"), list)
-                    )
+                    and isinstance(yaml_data_files_item.get("path"), (str, list))
                 )
             ):
                 raise ValueError(yaml_error_message)

From f87d6e6394bf4b390ccc82235eb7667f874e5d43 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Tue, 18 Jul 2023 18:21:09 +0200
Subject: [PATCH 8/9] Apply suggestions from code review

Co-authored-by: Polina Kazakova
---
 docs/source/repository_structure.mdx | 2 +-
 src/datasets/data_files.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index 18c8ae559ff..80b5da95d28 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -76,7 +76,7 @@ configs:
 ---
 ```
 
-Or you can use glob patterns to automatically list all the files:
+Or you can use glob patterns to automatically list all the files you need:
 
 ```yaml
 ---
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 9cb04e43837..9a1df27a58d 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -105,7 +105,7 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
                     and "split" in pattern
                     and isinstance(pattern.get("path"), (str, list))
                 ):
-                    raise ValueError(f"Expected each split to have a 'path' (which can be a list), but got {pattern}")
+                    raise ValueError(f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}")
             splits = [pattern["split"] for pattern in patterns]
             if len(set(splits)) != len(splits):
                 raise ValueError(f"Some splits are duplicated in data_files: {splits}")

From 8f6fa96ae5de873a49ef28739e8f64edf8b18cae Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 18 Jul 2023 18:35:50 +0200
Subject: [PATCH 9/9] style

---
 src/datasets/data_files.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 9a1df27a58d..384740bc991 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -105,7 +105,9 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
                     and "split" in pattern
                     and isinstance(pattern.get("path"), (str, list))
                 ):
-                    raise ValueError(f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}")
+                    raise ValueError(
+                        f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
+                    )
             splits = [pattern["split"] for pattern in patterns]
             if len(set(splits)) != len(splits):
                 raise ValueError(f"Some splits are duplicated in data_files: {splits}")
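
For reference, the convention this series lands on: each `data_files` entry in the README.md YAML carries a `split` key plus a single `path` key whose value is either one string or a list of strings, with glob patterns allowed. A minimal sketch of a README.md header in the final syntax — the file and directory names here are illustrative, taken loosely from the docs examples above:

```yaml
---
configs:
- config_name: default
  data_files:
  - split: train
    path: "data/*.csv"
  - split: test
    path:
    - "holdout/ghi.csv"
    - "holdout/jkl.csv"
---
```

And a quick sketch of how `sanitize_patterns` normalizes such entries after the final patch, assuming a `datasets` version that includes this series; the expected output follows from the `return` expression above, where a bare string `path` is wrapped in a one-element list:

```python
from datasets.data_files import sanitize_patterns

# A list of {"split", "path"} dicts is normalized to a dict mapping each
# split name to a list of patterns; a plain string "path" gets wrapped.
patterns = sanitize_patterns(
    [
        {"split": "train", "path": "data/*.csv"},
        {"split": "test", "path": ["holdout/ghi.csv", "holdout/jkl.csv"]},
    ]
)
assert patterns == {
    "train": ["data/*.csv"],
    "test": ["holdout/ghi.csv", "holdout/jkl.csv"],
}
```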