Skip to content

Commit

Permalink
docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Dec 7, 2022
1 parent 598b9da commit 24e24bf
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 8 deletions.
15 changes: 10 additions & 5 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,7 +1204,7 @@ def save_to_disk(
of the dataset directory where the dataset will be saved to.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be saved to disk. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
(like `"50MB"`).
num_shards (:obj:`int`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Expand All @@ -1220,7 +1220,9 @@ def save_to_disk(
Example:
```py
>>> saved_ds = ds.save_to_disk("path/to/dataset/directory")
>>> ds.save_to_disk("path/to/dataset/directory")
>>> ds.save_to_disk("path/to/dataset/directory", max_shard_size="1GB")
>>> ds.save_to_disk("path/to/dataset/directory", num_shards=1024)
```
"""
if max_shard_size is not None and num_shards is not None:
Expand Down Expand Up @@ -4452,7 +4454,7 @@ def _push_parquet_shards_to_hub(
in your repository, which defaults to `"main"`.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
(like `"50MB"`).
num_shards (:obj:`int`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Expand Down Expand Up @@ -4659,7 +4661,7 @@ def push_to_hub(
in your repository, which defaults to `"main"`.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
(like `"50MB"`).
num_shards (:obj:`int`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Expand All @@ -4675,7 +4677,10 @@ def push_to_hub(
Example:
```python
>>> dataset.push_to_hub("<organization>/<dataset_id>", split="evaluation")
>>> dataset.push_to_hub("<organization>/<dataset_id>")
>>> dataset.push_to_hub("<organization>/<dataset_id>", split="validation")
>>> dataset.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
>>> dataset.push_to_hub("<organization>/<dataset_id>", num_shards=1024)
```
"""
if shard_size != "deprecated":
Expand Down
34 changes: 32 additions & 2 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -1037,7 +1037,8 @@ def save_to_disk(
self,
dataset_dict_path: PathLike,
fs="deprecated",
num_shards: Optional[Dict[str, int]] = None,
max_shard_size: Optional[Union[str, int]] = None,
num_shards: Optional[Union[int, Dict[str, int]]] = None,
num_proc: Optional[int] = None,
storage_options: Optional[dict] = None,
):
Expand All @@ -1054,9 +1055,13 @@ def save_to_disk(
dataset_dict_path (``PathLike``): Path (e.g. `path/to/dataset`) or remote URI
(e.g. `s3://my-bucket/dataset/train`) of the dataset dict directory where the dataset dict will be
saved to.
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be saved to disk. If expressed as a string, needs to be digits followed by a unit
(like `"50MB"`).
num_shards (:obj:`Dict[str, int]`, optional): Number of shards to write.
You need to provide the number of shards for each dataset in the dataset dictionary.
Default to the same value as `num_proc` if specified.
Use a dictionary to define a different num_shards for each split.
<Added version="2.8.0"/>
num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally.
Expand All @@ -1067,6 +1072,13 @@ def save_to_disk(
<Added version="2.8.0"/>
Example:
```python
>>> dataset_dict.save_to_disk("path/to/dataset/directory")
>>> dataset_dict.save_to_disk("path/to/dataset/directory", max_shard_size="1GB")
>>> dataset_dict.save_to_disk("path/to/dataset/directory", num_shards={"train": 1024, "test": 8})
```
"""
if fs != "deprecated":
warnings.warn(
Expand Down Expand Up @@ -1099,6 +1111,7 @@ def save_to_disk(
dataset.save_to_disk(
path_join(dataset_dict_path, k),
num_shards=num_shards.get(k),
max_shard_size=max_shard_size,
num_proc=num_proc,
storage_options=storage_options,
)
Expand Down Expand Up @@ -1336,7 +1349,8 @@ def push_to_hub(
token: Optional[str] = None,
branch: Optional[None] = None,
max_shard_size: Optional[Union[int, str]] = None,
shard_size: Optional[int] = "deprecated",
num_shards: Optional[int] = None,
shard_size: Optional[Union[int, Dict[str, int]]] = "deprecated",
embed_external_files: bool = True,
):
"""Pushes the ``DatasetDict`` to the hub as a Parquet dataset.
Expand Down Expand Up @@ -1365,6 +1379,11 @@ def push_to_hub(
max_shard_size (`int` or `str`, *optional*, defaults to `"500MB"`):
The maximum size of the dataset shards to be uploaded to the hub. If expressed as a string, needs to be digits followed by a unit
(like `"500MB"` or `"1GB"`).
num_shards (`Union[int, Dict[str, int]]`, optional): Number of shards to write.
Default to the same value as `num_proc` if specified.
Use a dictionary to define a different num_shards for each split.
<Added version="2.8.0"/>
shard_size (Optional :obj:`int`):
Deprecated: 'shard_size' was renamed to 'max_shard_size' in version 2.1.1 and will be removed in 2.4.0.
embed_external_files (:obj:`bool`, default ``True``):
Expand All @@ -1377,6 +1396,9 @@ def push_to_hub(
```python
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>")
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", private=True)
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", max_shard_size="1GB")
>>> dataset_dict.push_to_hub("<organization>/<dataset_id>", num_shards={"train": 1024, "test": 8})
```
"""
if shard_size != "deprecated":
Expand All @@ -1386,6 +1408,13 @@ def push_to_hub(
)
max_shard_size = shard_size

if num_shards is None:
num_shards = {k: None for k in self}
elif not isinstance(num_shards, dict):
raise ValueError(
"Please provide one `num_shards` per dataset in the dataset dictionary, e.g. {{'train': 128, 'test': 4}}"
)

self._check_values_type()
self._check_values_features()
total_uploaded_size = 0
Expand All @@ -1407,6 +1436,7 @@ def push_to_hub(
token=token,
branch=branch,
max_shard_size=max_shard_size,
num_shards=num_shards.get(split),
embed_external_files=embed_external_files,
)
total_uploaded_size += uploaded_size
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/utils/py_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def size_str(size_in_bytes):

def convert_file_size_to_int(size: Union[int, str]) -> int:
"""
Converts a size expressed as a string with digits and a unit (like `"5MB"`) to an integer (in bytes).
Converts a size expressed as a string with digits and a unit (like `"50MB"`) to an integer (in bytes).
Args:
size (`int` or `str`): The size to convert. Will be directly returned if an `int`.
Expand Down

0 comments on commit 24e24bf

Please sign in to comment.