Fix max_shard_size docs (#5267)
lhoestq authored Nov 18, 2022
1 parent 4eccb22 commit 7ef5f6d
Showing 2 changed files with 7 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/source/filesystems.mdx

````diff
@@ -140,7 +140,7 @@ Use your own data files (see [how to load local and remote files](./loading#loca
 It is highly recommended to save the files as compressed Parquet files to optimize I/O by specifying `file_format="parquet"`.
 Otherwise the dataset is saved as an uncompressed Arrow file.
 
-You can also specify the size of the Parquet shard using `max_shard_size` (default is 500MB):
+You can also specify the size of the shards using `max_shard_size` (default is 500MB):
 
 ```py
 >>> builder.download_and_prepare(output_dir, storage_options=storage_options, file_format="parquet", max_shard_size="1GB")
 ```
````
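For context beyond the diff: since `max_shard_size` is typed `Union[str, int]`, it accepts a raw byte count as well as a human-readable string. A minimal sketch, assuming a builder obtained with `load_dataset_builder` ("imdb" and the output path are placeholder choices, not part of the commit):

```py
>>> from datasets import load_dataset_builder
>>> builder = load_dataset_builder("imdb")  # placeholder dataset
>>> # an int works as well as a string like "500MB" or "1GB"
>>> builder.download_and_prepare("./imdb_parquet", file_format="parquet", max_shard_size=500 * 1024 * 1024)
```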
12 changes: 6 additions & 6 deletions src/datasets/builder.py

```diff
@@ -638,9 +638,9 @@ def download_and_prepare(
                 If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.
                 <Added version="2.5.0"/>
-            max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard.
-                Only available for the "parquet" format with a default of "500MB". The size is based on uncompressed data size,
-                so in practice your shard files may be smaller than `max_shard_size` thanks to Parquet compression.
+            max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard, default is "500MB".
+                The size is based on uncompressed data size, so in practice your shard files may be smaller than
+                `max_shard_size` thanks to Parquet compression for example.
                 <Added version="2.5.0"/>
             num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally.
```
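A side note, not part of the commit: size strings like "500MB" are converted to a byte count internally. The sketch below assumes the `convert_file_size_to_int` helper in `datasets.utils.py_utils` (which builder.py uses for this conversion) and the usual decimal-vs-binary unit convention:

```py
>>> from datasets.utils.py_utils import convert_file_size_to_int
>>> convert_file_size_to_int("500MB")   # decimal units: 500 * 10**6 bytes
500000000
>>> convert_file_size_to_int("500MiB")  # binary units: 500 * 2**20 bytes
524288000
```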
```diff
@@ -1262,9 +1262,9 @@ def _prepare_split(
             split_generator: `SplitGenerator`, Split generator to process
             file_format (:obj:`str`, optional): format of the data files in which the dataset will be written.
                 Supported formats: "arrow", "parquet". Default to "arrow" format.
-            max_shard_size (:obj:`Union[str, int]`, optional): Approximate maximum number of bytes written per shard.
-                Only available for the "parquet" format with a default of "500MB". The size is based on uncompressed data size,
-                so in practice your shard files may be smaller than `max_shard_size` thanks to Parquet compression.
+            max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard, default is "500MB".
+                The size is based on uncompressed data size, so in practice your shard files may be smaller than
+                `max_shard_size` thanks to Parquet compression for example.
             num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally.
                 Multiprocessing is disabled by default.
```
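Since `_prepare_split` writes each split as numbered shards, a quick way to see how `max_shard_size` played out is to list the output directory. For illustration only; the file names below are hypothetical and depend on the builder name, split, and data size:

```py
>>> import os
>>> sorted(f for f in os.listdir("./imdb_parquet") if f.endswith(".parquet"))  # hypothetical listing
['imdb-train-00000-of-00002.parquet', 'imdb-train-00001-of-00002.parquet']
```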

1 comment on commit 7ef5f6d

@github-actions

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new | old | diff |
|---|---|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.011073 | 0.011353 | -0.000280 |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005484 | 0.011008 | -0.005524 |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.115306 | 0.038508 | 0.076798 |
| read_batch_unformated after write_array2d | 0.042395 | 0.023109 | 0.019285 |
| read_batch_unformated after write_flattened_sequence | 0.343413 | 0.275898 | 0.067515 |
| read_batch_unformated after write_nested_sequence | 0.424444 | 0.323480 | 0.100964 |
| read_col_formatted_as_numpy after write_array2d | 0.009155 | 0.007986 | 0.001169 |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005489 | 0.004328 | 0.001160 |
| read_col_formatted_as_numpy after write_nested_sequence | 0.085350 | 0.004250 | 0.081099 |
| read_col_unformated after write_array2d | 0.050784 | 0.037052 | 0.013731 |
| read_col_unformated after write_flattened_sequence | 0.368642 | 0.258489 | 0.110153 |
| read_col_unformated after write_nested_sequence | 0.413206 | 0.293841 | 0.119365 |
| read_formatted_as_numpy after write_array2d | 0.049601 | 0.128546 | -0.078945 |
| read_formatted_as_numpy after write_flattened_sequence | 0.017367 | 0.075646 | -0.058279 |
| read_formatted_as_numpy after write_nested_sequence | 0.400343 | 0.419271 | -0.018929 |
| read_unformated after write_array2d | 0.064526 | 0.043533 | 0.020993 |
| read_unformated after write_flattened_sequence | 0.343120 | 0.255139 | 0.087981 |
| read_unformated after write_nested_sequence | 0.378312 | 0.283200 | 0.095112 |
| write_array2d | 0.123112 | 0.141683 | -0.018571 |
| write_flattened_sequence | 1.761038 | 1.452155 | 0.308883 |
| write_nested_sequence | 1.798257 | 1.492716 | 0.305540 |

Benchmark: benchmark_getitem_100B.json

| metric | new | old | diff |
|---|---|---|---|
| get_batch_of_1024_random_rows | 0.206371 | 0.018006 | 0.188365 |
| get_batch_of_1024_rows | 0.459825 | 0.000490 | 0.459336 |
| get_first_row | 0.005775 | 0.000200 | 0.005575 |
| get_last_row | 0.000095 | 0.000054 | 0.000041 |

Benchmark: benchmark_indices_mapping.json

| metric | new | old | diff |
|---|---|---|---|
| select | 0.031806 | 0.037411 | -0.005605 |
| shard | 0.132351 | 0.014526 | 0.117825 |
| shuffle | 0.141981 | 0.176557 | -0.034576 |
| sort | 0.193260 | 0.737135 | -0.543875 |
| train_test_split | 0.148360 | 0.296338 | -0.147979 |

Benchmark: benchmark_iterating.json

| metric | new | old | diff |
|---|---|---|---|
| read 5000 | 0.475418 | 0.215209 | 0.260209 |
| read 50000 | 4.712441 | 2.077655 | 2.634786 |
| read_batch 50000 10 | 2.179472 | 1.504120 | 0.675352 |
| read_batch 50000 100 | 1.958418 | 1.541195 | 0.417223 |
| read_batch 50000 1000 | 2.032576 | 1.468490 | 0.564086 |
| read_formatted numpy 5000 | 0.806982 | 4.584777 | -3.777795 |
| read_formatted pandas 5000 | 4.558561 | 3.745712 | 0.812848 |
| read_formatted tensorflow 5000 | 3.887119 | 5.269862 | -1.382743 |
| read_formatted torch 5000 | 2.045730 | 4.565676 | -2.519946 |
| read_formatted_batch numpy 5000 10 | 0.098682 | 0.424275 | -0.325593 |
| read_formatted_batch numpy 5000 1000 | 0.014341 | 0.007607 | 0.006734 |
| shuffled read 5000 | 0.603446 | 0.226044 | 0.377401 |
| shuffled read 50000 | 6.047170 | 2.268929 | 3.778241 |
| shuffled read_batch 50000 10 | 2.764427 | 55.444624 | -52.680197 |
| shuffled read_batch 50000 100 | 2.344062 | 6.876477 | -4.532415 |
| shuffled read_batch 50000 1000 | 2.501922 | 2.142072 | 0.359849 |
| shuffled read_formatted numpy 5000 | 0.982312 | 4.805227 | -3.822915 |
| shuffled read_formatted_batch numpy 5000 10 | 0.197039 | 6.500664 | -6.303626 |
| shuffled read_formatted_batch numpy 5000 1000 | 0.075762 | 0.075469 | 0.000293 |

Benchmark: benchmark_map_filter.json

| metric | new | old | diff |
|---|---|---|---|
| filter | 1.742089 | 1.841788 | -0.099699 |
| map fast-tokenizer batched | 16.197975 | 8.074308 | 8.123667 |
| map identity | 29.633409 | 10.191392 | 19.442017 |
| map identity batched | 1.005976 | 0.680424 | 0.325552 |
| map no-op batched | 0.651368 | 0.534201 | 0.117167 |
| map no-op batched numpy | 0.527047 | 0.579283 | -0.052236 |
| map no-op batched pandas | 0.500853 | 0.434364 | 0.066489 |
| map no-op batched pytorch | 0.316412 | 0.540337 | -0.223926 |
| map no-op batched tensorflow | 0.320935 | 1.386936 | -1.066001 |

PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new | old | diff |
|---|---|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.008472 | 0.011353 | -0.002881 |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005862 | 0.011008 | -0.005146 |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.113373 | 0.038508 | 0.074865 |
| read_batch_unformated after write_array2d | 0.038897 | 0.023109 | 0.015788 |
| read_batch_unformated after write_flattened_sequence | 0.399845 | 0.275898 | 0.123947 |
| read_batch_unformated after write_nested_sequence | 0.443008 | 0.323480 | 0.119528 |
| read_col_formatted_as_numpy after write_array2d | 0.006541 | 0.007986 | -0.001445 |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004351 | 0.004328 | 0.000022 |
| read_col_formatted_as_numpy after write_nested_sequence | 0.087023 | 0.004250 | 0.082772 |
| read_col_unformated after write_array2d | 0.045922 | 0.037052 | 0.008869 |
| read_col_unformated after write_flattened_sequence | 0.400247 | 0.258489 | 0.141758 |
| read_col_unformated after write_nested_sequence | 0.463007 | 0.293841 | 0.169166 |
| read_formatted_as_numpy after write_array2d | 0.042312 | 0.128546 | -0.086235 |
| read_formatted_as_numpy after write_flattened_sequence | 0.014082 | 0.075646 | -0.061564 |
| read_formatted_as_numpy after write_nested_sequence | 0.393038 | 0.419271 | -0.026233 |
| read_unformated after write_array2d | 0.056288 | 0.043533 | 0.012755 |
| read_unformated after write_flattened_sequence | 0.396150 | 0.255139 | 0.141011 |
| read_unformated after write_nested_sequence | 0.420499 | 0.283200 | 0.137299 |
| write_array2d | 0.122321 | 0.141683 | -0.019362 |
| write_flattened_sequence | 1.799000 | 1.452155 | 0.346845 |
| write_nested_sequence | 1.868746 | 1.492716 | 0.376030 |

Benchmark: benchmark_getitem_100B.json

| metric | new | old | diff |
|---|---|---|---|
| get_batch_of_1024_random_rows | 0.235673 | 0.018006 | 0.217667 |
| get_batch_of_1024_rows | 0.457638 | 0.000490 | 0.457148 |
| get_first_row | 0.007405 | 0.000200 | 0.007205 |
| get_last_row | 0.000125 | 0.000054 | 0.000070 |

Benchmark: benchmark_indices_mapping.json

| metric | new | old | diff |
|---|---|---|---|
| select | 0.034717 | 0.037411 | -0.002694 |
| shard | 0.133172 | 0.014526 | 0.118646 |
| shuffle | 0.145556 | 0.176557 | -0.031001 |
| sort | 0.193938 | 0.737135 | -0.543197 |
| train_test_split | 0.150274 | 0.296338 | -0.146065 |

Benchmark: benchmark_iterating.json

| metric | new | old | diff |
|---|---|---|---|
| read 5000 | 0.499109 | 0.215209 | 0.283900 |
| read 50000 | 4.965919 | 2.077655 | 2.888264 |
| read_batch 50000 10 | 2.395092 | 1.504120 | 0.890972 |
| read_batch 50000 100 | 2.170365 | 1.541195 | 0.629171 |
| read_batch 50000 1000 | 2.235573 | 1.468490 | 0.767083 |
| read_formatted numpy 5000 | 0.813743 | 4.584777 | -3.771034 |
| read_formatted pandas 5000 | 4.643213 | 3.745712 | 0.897501 |
| read_formatted tensorflow 5000 | 2.424351 | 5.269862 | -2.845510 |
| read_formatted torch 5000 | 1.553961 | 4.565676 | -3.011716 |
| read_formatted_batch numpy 5000 10 | 0.100707 | 0.424275 | -0.323568 |
| read_formatted_batch numpy 5000 1000 | 0.014105 | 0.007607 | 0.006498 |
| shuffled read 5000 | 0.618538 | 0.226044 | 0.392493 |
| shuffled read 50000 | 6.144881 | 2.268929 | 3.875953 |
| shuffled read_batch 50000 10 | 2.979170 | 55.444624 | -52.465454 |
| shuffled read_batch 50000 100 | 2.575294 | 6.876477 | -4.301182 |
| shuffled read_batch 50000 1000 | 2.703772 | 2.142072 | 0.561700 |
| shuffled read_formatted numpy 5000 | 0.983350 | 4.805227 | -3.821877 |
| shuffled read_formatted_batch numpy 5000 10 | 0.197577 | 6.500664 | -6.303087 |
| shuffled read_formatted_batch numpy 5000 1000 | 0.076393 | 0.075469 | 0.000924 |

Benchmark: benchmark_map_filter.json

| metric | new | old | diff |
|---|---|---|---|
| filter | 1.818415 | 1.841788 | -0.023373 |
| map fast-tokenizer batched | 16.335548 | 8.074308 | 8.261240 |
| map identity | 14.052463 | 10.191392 | 3.861071 |
| map identity batched | 1.035579 | 0.680424 | 0.355155 |
| map no-op batched | 0.692833 | 0.534201 | 0.158632 |
| map no-op batched numpy | 0.494219 | 0.579283 | -0.085064 |
| map no-op batched pandas | 0.482415 | 0.434364 | 0.048051 |
| map no-op batched pytorch | 0.295239 | 0.540337 | -0.245099 |
| map no-op batched tensorflow | 0.309190 | 1.386936 | -1.077746 |
