Rename "pattern" to "path" in YAML data_files configs #6044

Merged · 9 commits · Jul 19, 2023
56 changes: 26 additions & 30 deletions docs/source/repository_structure.mdx
@@ -24,17 +24,15 @@ In this simple case, you'll get a dataset with two splits: `train` (containing e

## Splits

If you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md using glob patterns.
If you have multiple files and want to define which file goes into which split, you can use the YAML `configs` field at the top of your README.md.

For example, given a repository like this one:

```
my_dataset_repository/
├── README.md
├── directory1/
│ └── bees.csv
└── directory2/
└── more_bees.csv
├── data.csv
└── holdout.csv
```

You can define your splits by adding the `configs` field in the YAML block at the top of your README.md:
@@ -45,27 +43,23 @@ configs:
- config_name: default
data_files:
- split: train
pattern: "directory1/*.csv"
path: "data.csv"
- split: test
pattern: "directory2/*.csv"
path: "holdout.csv"
---
```

<Tip warning={true}>
Note that the `config_name` field is required even if you have a single configuration.
</Tip>

Having several patterns per split is also supported:
You can select multiple files per split using a list of paths:

```
my_dataset_repository/
├── README.md
├── directory1/
│ └── bees.csv
├── directory1bis/
│ └── more_bees.csv
└── directory2/
└── even_more_bees.csv
├── data/
│ ├── abc.csv
│ └── def.csv
└── holdout/
└── ghi.csv
```

```yaml
@@ -74,32 +68,34 @@ configs:
- config_name: default
data_files:
- split: train
pattern:
- "directory1/*.csv"
- "directory1bis/*.csv"
path:
- "data/abc.csv"
- "data/def.csv"
- split: test
pattern:
- "directory2/*.csv"
path: "holdout/ghi.csv"
---
```

Custom split names are also supported:
Or you can use glob patterns to automatically list all the files you need:

```yaml
---
configs:
- config_name: default
data_files:
- split: random
pattern:
- "directory1bis/*.csv"
- split: train
pattern:
- "directory1/*.csv"
path: "data/*.csv"
- split: test
pattern:
- "directory2/*.csv"
path: "holdout/*.csv"
---
```

<Tip warning={true}>

Note that the `config_name` field is required even if you have a single configuration.

</Tip>

## Configurations

Your dataset might have several subsets of data that you want to be able to load separately. In that case you can define a list of configurations inside the `configs` field in YAML:
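With any of the configurations above in README.md, the splits load directly by name. The snippet below is a minimal usage sketch, not part of this diff; `user/my_dataset_repository` is a placeholder repository id, and it assumes a `datasets` version that understands the `path` key:

```python
from datasets import load_dataset

# Hypothetical repository id, used for illustration only.
repo_id = "user/my_dataset_repository"

# The YAML `configs` block maps split names to file paths or glob patterns,
# so each split can be requested directly by name.
train_ds = load_dataset(repo_id, split="train")
test_ds = load_dataset(repo_id, split="test")
print(train_ds, test_ds)
```
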
8 changes: 4 additions & 4 deletions src/datasets/arrow_dataset.py
@@ -5512,7 +5512,7 @@ def push_to_hub(
}
default_metadata_configs_to_dump = {
"data_files": [
{"split": _resolved_split, "pattern": f"data/{_resolved_split}-*"}
{"split": _resolved_split, "path": f"data/{_resolved_split}-*"}
for _resolved_split in _resolved_splits
]
}
@@ -5525,18 +5525,18 @@ def push_to_hub(
else:
data_files_to_dump = {}
# add the new split
data_files_to_dump[split] = f"{data_dir}/{split}-*"
data_files_to_dump[split] = [f"{data_dir}/{split}-*"]
metadata_config_to_dump = {
"data_files": [
{
"split": _split,
"pattern": _pattern[0] if isinstance(_pattern, list) and len(_pattern) == 1 else _pattern,
"path": _pattern[0] if len(_pattern) == 1 else _pattern,
}
for _split, _pattern in data_files_to_dump.items()
]
}
else:
metadata_config_to_dump = {"data_files": [{"split": split, "pattern": f"{data_dir}/{split}-*"}]}
metadata_config_to_dump = {"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"}]}
# push to the deprecated dataset_infos.json
if config.DATASETDICT_INFOS_FILENAME in repo_files:
download_config = DownloadConfig()
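To make the dump logic above concrete: when a split resolves to a single pattern, the new `path` key is written as a plain string, otherwise as a list. The following standalone sketch mirrors the comprehension in the diff; the input mapping is made up for illustration:

```python
# Made-up mapping from split names to lists of glob patterns, mirroring
# `data_files_to_dump` in the diff above.
data_files_to_dump = {
    "train": ["data/train-*"],
    "test": ["data/test-0000-*", "data/test-0001-*"],
}

metadata_config_to_dump = {
    "data_files": [
        {
            "split": _split,
            # a single pattern is dumped as a string, several patterns as a list
            "path": _pattern[0] if len(_pattern) == 1 else _pattern,
        }
        for _split, _pattern in data_files_to_dump.items()
    ]
}

print(metadata_config_to_dump)
# {'data_files': [{'split': 'train', 'path': 'data/train-*'},
#                 {'split': 'test', 'path': ['data/test-0000-*', 'data/test-0001-*']}]}
```
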
13 changes: 8 additions & 5 deletions src/datasets/data_files.py
@@ -99,17 +99,20 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
elif isinstance(patterns, list):
if any(isinstance(pattern, dict) for pattern in patterns):
for pattern in patterns:
if not isinstance(pattern, dict) or sorted(pattern) != ["pattern", "split"]:
if not (
isinstance(pattern, dict)
and len(pattern) == 2
and "split" in pattern
and isinstance(pattern.get("path"), (str, list))
):
raise ValueError(
f"Expected each pattern in a list of patterns to be a string or a list, but got {pattern}"
f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
)
splits = [pattern["split"] for pattern in patterns]
if len(set(splits)) != len(splits):
raise ValueError(f"Some splits are duplicated in data_files: {splits}")
return {
str(pattern["split"]): pattern["pattern"]
if isinstance(pattern["pattern"], list)
else [pattern["pattern"]]
str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
for pattern in patterns
}
else:
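For a quick sanity check of the stricter validation above, `sanitize_patterns` can be called directly. This usage sketch assumes a `datasets` install that already includes this change:

```python
from datasets.data_files import sanitize_patterns

# Each dict entry now uses the 'path' key; string values are normalized to lists.
print(sanitize_patterns([
    {"split": "train", "path": "data/*.csv"},
    {"split": "test", "path": ["holdout/a.csv", "holdout/b.csv"]},
]))
# Expected, per the code above:
# {'train': ['data/*.csv'], 'test': ['holdout/a.csv', 'holdout/b.csv']}

# The old 'pattern' key is no longer accepted and raises a ValueError:
try:
    sanitize_patterns([{"split": "train", "pattern": "data/*.csv"}])
except ValueError as err:
    print(err)
```
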
4 changes: 2 additions & 2 deletions src/datasets/dataset_dict.py
@@ -1660,7 +1660,7 @@ def push_to_hub(
info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes

metadata_config_to_dump = {
"data_files": [{"split": split, "pattern": f"{data_dir}/{split}-*"} for split in self.keys()],
"data_files": [{"split": split, "path": f"{data_dir}/{split}-*"} for split in self.keys()],
}

api = HfApi(endpoint=config.HF_ENDPOINT)
@@ -1715,7 +1715,7 @@ def push_to_hub(
}
default_metadata_configs_to_dump = {
"data_files": [
{"split": _resolved_split, "pattern": f"data/{_resolved_split}-*"}
{"split": _resolved_split, "path": f"data/{_resolved_split}-*"}
for _resolved_split in _resolved_splits
]
}
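For context, the metadata being built above is what `push_to_hub` writes into the dataset card. A hedged end-to-end sketch follows; the repository id is a placeholder and the call requires Hub credentials:

```python
from datasets import Dataset, DatasetDict

dset = DatasetDict({
    "train": Dataset.from_dict({"x": [1, 2, 3]}),
    "test": Dataset.from_dict({"x": [4, 5]}),
})

# Uploads Parquet shards under data/ and writes a YAML `configs` block to README.md
# with one {"split": ..., "path": "data/<split>-*"} entry per split, as shown above.
# "user/my_dataset_repository" is a hypothetical repo id; run `huggingface-cli login` first.
dset.push_to_hub("user/my_dataset_repository")
```
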
29 changes: 19 additions & 10 deletions src/datasets/utils/metadata.py
@@ -128,7 +128,7 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):
yaml_error_message = textwrap.dedent(
f"""
Expected data_files in YAML to be either a string or a list of strings
or a list of dicts with two keys: 'split' and 'pattern', but got {yaml_data_files}
or a list of dicts with two keys: 'split' and 'path', but got {yaml_data_files}
Examples of data_files in YAML:

data_files: data.csv
@@ -141,22 +141,31 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict):

data_files:
- split: train
pattern: train/*
path: train/*
- split: test
pattern: test/*
path: test/*

data_files:
- split: train
path:
- train/part1/*
- train/part2/*
- split: test
path: test/*
"""
)
if not isinstance(yaml_data_files, (list, str)):
raise ValueError(yaml_error_message)
if isinstance(yaml_data_files, list):
for yaml_data_files_item in yaml_data_files:
if not isinstance(yaml_data_files_item, str) and not (
isinstance(yaml_data_files_item, dict)
and sorted(yaml_data_files_item)
== [
"pattern",
"split",
]
if (
not isinstance(yaml_data_files_item, (str, dict))
or isinstance(yaml_data_files_item, dict)
and not (
len(yaml_data_files_item) == 2
and "split" in yaml_data_files_item
and isinstance(yaml_data_files_item.get("path"), (str, list))
)
):
raise ValueError(yaml_error_message)

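The private helper above can be exercised directly to see which shapes are accepted and rejected. This is only an illustration of the validation logic and assumes a `datasets` version containing this change:

```python
from datasets.utils.metadata import _raise_if_data_files_field_not_valid

# Accepted: each dict has exactly the keys 'split' and 'path',
# and 'path' is a string or a list of strings.
_raise_if_data_files_field_not_valid(
    {"data_files": [
        {"split": "train", "path": ["train/part1/*", "train/part2/*"]},
        {"split": "test", "path": "test/*"},
    ]}
)

# Rejected after this PR: the old 'pattern' key triggers the ValueError above.
try:
    _raise_if_data_files_field_not_valid({"data_files": [{"split": "train", "pattern": "train/*"}]})
except ValueError as err:
    print("rejected:", type(err).__name__)
```
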
18 changes: 9 additions & 9 deletions tests/test_upstream_hub.py
@@ -614,19 +614,19 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(self, temp
{
"config_name": "config1",
"data_files": [
{"split": "train", "pattern": "config1/train-*"},
{"split": "train", "path": "config1/train-*"},
],
},
{
"config_name": "config2",
"data_files": [
{"split": "train", "pattern": "config2/train-*"},
{"split": "train", "path": "config2/train-*"},
],
},
{
"config_name": "default",
"data_files": [
{"split": "train", "pattern": "data/train-*"},
{"split": "train", "path": "data/train-*"},
],
},
]
@@ -743,22 +743,22 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(self,
{
"config_name": "config1",
"data_files": [
{"split": "train", "pattern": "config1/train-*"},
{"split": "random", "pattern": "config1/random-*"},
{"split": "train", "path": "config1/train-*"},
{"split": "random", "path": "config1/random-*"},
],
},
{
"config_name": "config2",
"data_files": [
{"split": "train", "pattern": "config2/train-*"},
{"split": "random", "pattern": "config2/random-*"},
{"split": "train", "path": "config2/train-*"},
{"split": "random", "path": "config2/random-*"},
],
},
{
"config_name": "default",
"data_files": [
{"split": "train", "pattern": "data/train-*"},
{"split": "random", "pattern": "data/random-*"},
{"split": "train", "path": "data/train-*"},
{"split": "random", "path": "data/random-*"},
],
},
]