Skip to content

Commit

Permalink
docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Dec 7, 2022
1 parent 598b9da commit 24e24bf
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 8 deletions.
15 changes: 10 additions & 5 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,7 +1204,7 @@ def save_to_disk(
of the dataset directory where the dataset will be saved to.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be saved to disk. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
(like `"50MB"`).
num_shards (:obj:`int`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Expand All @@ -1220,7 +1220,9 @@ def save_to_disk(
Example:
```py
>>> saved_ds = ds.save_to_disk("path/to/dataset/directory")
>>> ds.save_to_disk("path/to/dataset/directory")
>>> ds.save_to_disk("path/to/dataset/directory", max_shard_size="1GB")
>>> ds.save_to_disk("path/to/dataset/directory", num_shards=1024)
```
"""
if max_shard_size is not None and num_shards is not None:
Expand Down Expand Up @@ -4452,7 +4454,7 @@ def _push_parquet_shards_to_hub(
in your repository, which defaults to `"main"`.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
(like `"50MB"`).
num_shards (:obj:`int`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Expand Down Expand Up @@ -4659,7 +4661,7 @@ def push_to_hub(
in your repository, which defaults to `"main"`.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
(like `"50MB"`).
num_shards (:obj:`int`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Expand All @@ -4675,7 +4677,10 @@ def push_to_hub(
Example:
```python
>>> dataset.push_to_hub("<organization>/<dataset_id>", split="evaluation")
>>> dataset.push_to_hub("<organization>/<dataset_id>")
>>> dataset.push_to_hub("<organization>/<dataset_id>", split="validation")
>>> dataset.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
>>> dataset.push_to_hub("<organization>/<dataset_id>", num_shards=1024)
```
"""
if shard_size != "deprecated":
Expand Down
34 changes: 32 additions & 2 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,8 @@ def save_to_disk(
self,
dataset_dict_path: PathLike,
fs="deprecated",
num_shards: Optional[Dict[str, int]] = None,
max_shard_size: Optional[Union[str, int]] = None,
num_shards: Optional[Union[int, Dict[str, int]]] = None,
num_proc: Optional[int] = None,
storage_options: Optional[dict] = None,
):
Expand All @@ -1054,9 +1055,13 @@ def save_to_disk(
dataset_dict_path (``PathLike``): Path (e.g. `path/to/dataset`) or remote URI
(e.g. `s3://my-bucket/dataset/train`) of the dataset dict directory where the dataset dict will be
saved to.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be saved to disk. If expressed as a string, needs to be digits followed by a unit
(like `"50MB"`).
num_shards (:obj:`Dict[str, int]`, optional): Number of shards to write.
You need to provide the number of shards for each dataset in the dataset dictionary.
Default to the same value as `num_proc` if specified.
Use a dictionary to define a different num_shards for each split.
<Added version="2.8.0"/>
num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally.
Expand All @@ -1067,6 +1072,13 @@ def save_to_disk(
<Added version="2.8.0"/>
Example:
```python
>>> dataset_dict.save_to_disk("path/to/dataset/directory")
>>> dataset_dict.save_to_disk("path/to/dataset/directory", max_shard_size="1GB")
>>> dataset_dict.save_to_disk("path/to/dataset/directory", num_shards={"train": 1024, "test": 8})
```
"""
if fs != "deprecated":
warnings.warn(
Expand Down Expand Up @@ -1099,6 +1111,7 @@ def save_to_disk(
dataset.save_to_disk(
path_join(dataset_dict_path, k),
num_shards=num_shards.get(k),
max_shard_size=max_shard_size,
num_proc=num_proc,
storage_options=storage_options,
)
Expand Down Expand Up @@ -1336,7 +1349,8 @@ def push_to_hub(
token: Optional[str] = None,
branch: Optional[None] = None,
max_shard_size: Optional[Union[int, str]] = None,
shard_size: Optional[int] = "deprecated",
num_shards: Optional[int] = None,
shard_size: Optional[Union[int, Dict[str, int]]] = "deprecated",
embed_external_files: bool = True,
):
"""Pushes the ``DatasetDict`` to the hub as a Parquet dataset.
Expand Down Expand Up @@ -1365,6 +1379,11 @@ def push_to_hub(
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
(like `"500MB"` or `"1GB"`).
num_shards (`Union[int, Dict[str, int]]`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Use a dictionary to define a different num_shards for each split.
<Added version="2.8.0"/>
shard_size (Optional :obj:`int`):
Deprecated: 'shard_size' was renamed to 'max_shard_size' in version 2.1.1 and will be removed in 2.4.0.
embed_external_files (:obj:`bool`, default ``True``):
Expand All @@ -1377,6 +1396,9 @@ def push_to_hub(
```python
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>")
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", private=True)
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", num_shards={"train": 1024, "test": 8})
```
"""
if shard_size != "deprecated":
Expand All @@ -1386,6 +1408,13 @@ def push_to_hub(
)
max_shard_size = shard_size

if num_shards is None:
num_shards = {k: None for k in self}
elif not isinstance(num_shards, dict):
raise ValueError(
"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {{'train': 128, 'test': 4}}"
)

self._check_values_type()
self._check_values_features()
total_uploaded_size = 0
Expand All @@ -1407,6 +1436,7 @@ def push_to_hub(
token=token,
branch=branch,
max_shard_size=max_shard_size,
num_shards=num_shards.get(split),
embed_external_files=embed_external_files,
)
total_uploaded_size += uploaded_size
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/utils/py_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def size_str(size_in_bytes):

def convert_file_size_to_int(size: Union[int, str]) -> int:
"""
Converts a size expressed as a string with digits and a unit (like `"5MB"`) to an integer (in bytes).
Converts a size expressed as a string with digits and a unit (like `"50MB"`) to an integer (in bytes).
Args:
size (`int` or `str`): The size to convert. Will be directly returned if an `int`.
Expand Down

0 comments on commit 24e24bf

Please sign in to comment.