From 7ef5f6de02038dd200276a0ad63be7a6019d152f Mon Sep 17 00:00:00 2001
From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Date: Fri, 18 Nov 2022 18:25:26 +0100
Subject: [PATCH] Fix `max_shard_size` docs (#5267)
fix max_shard_size docs
---
docs/source/filesystems.mdx | 2 +-
src/datasets/builder.py | 12 ++++++------
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/docs/source/filesystems.mdx b/docs/source/filesystems.mdx
index 962966bd65f..b92b8e6f243 100644
--- a/docs/source/filesystems.mdx
+++ b/docs/source/filesystems.mdx
@@ -140,7 +140,7 @@ Use your own data files (see [how to load local and remote files](./loading#loca
It is highly recommended to save the files as compressed Parquet files to optimize I/O by specifying `file_format="parquet"`.
Otherwise the dataset is saved as an uncompressed Arrow file.
-You can also specify the size of the Parquet shard using `max_shard_size` (default is 500MB):
+You can also specify the maximum size of the shards using `max_shard_size` (default is 500MB):
```py
>>> builder.download_and_prepare(output_dir, storage_options=storage_options, file_format="parquet", max_shard_size="1GB")
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index 961a2b6e6ff..829acd91e59 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -638,9 +638,9 @@ def download_and_prepare(
If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files.
- max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard.
- Only available for the "parquet" format with a default of "500MB". The size is based on uncompressed data size,
- so in practice your shard files may be smaller than `max_shard_size` thanks to Parquet compression.
+ max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard. Defaults to "500MB".
+ The size is based on the uncompressed data size, so in practice your shard files may be smaller than
+ `max_shard_size`, for example thanks to Parquet compression.
num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally.
@@ -1262,9 +1262,9 @@ def _prepare_split(
split_generator: `SplitGenerator`, Split generator to process
file_format (:obj:`str`, optional): format of the data files in which the dataset will be written.
Supported formats: "arrow", "parquet". Default to "arrow" format.
- max_shard_size (:obj:`Union[str, int]`, optional): Approximate maximum number of bytes written per shard.
- Only available for the "parquet" format with a default of "500MB". The size is based on uncompressed data size,
- so in practice your shard files may be smaller than `max_shard_size` thanks to Parquet compression.
+ max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard. Defaults to "500MB".
+ The size is based on the uncompressed data size, so in practice your shard files may be smaller than
+ `max_shard_size`, for example thanks to Parquet compression.
num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally.
Multiprocessing is disabled by default.