From 7ef5f6de02038dd200276a0ad63be7a6019d152f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Fri, 18 Nov 2022 18:25:26 +0100 Subject: [PATCH] Fix `max_shard_size` docs (#5267) fix max_shard_size docs --- docs/source/filesystems.mdx | 2 +- src/datasets/builder.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/filesystems.mdx b/docs/source/filesystems.mdx index 962966bd65f..b92b8e6f243 100644 --- a/docs/source/filesystems.mdx +++ b/docs/source/filesystems.mdx @@ -140,7 +140,7 @@ Use your own data files (see [how to load local and remote files](./loading#loca It is highly recommended to save the files as compressed Parquet files to optimize I/O by specifying `file_format="parquet"`. Otherwise the dataset is saved as an uncompressed Arrow file. -You can also specify the size of the Parquet shard using `max_shard_size` (default is 500MB): +You can also specify the size of the shards using `max_shard_size` (default is 500MB): ```py >>> builder.download_and_prepare(output_dir, storage_options=storage_options, file_format="parquet", max_shard_size="1GB") diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 961a2b6e6ff..829acd91e59 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -638,9 +638,9 @@ def download_and_prepare( If the format is "parquet", then image and audio data are embedded into the Parquet files instead of pointing to local files. - max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard. - Only available for the "parquet" format with a default of "500MB". The size is based on uncompressed data size, - so in practice your shard files may be smaller than `max_shard_size` thanks to Parquet compression. + max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard, default is "500MB". + The size is based on uncompressed data size, so in practice your shard files may be smaller than + `max_shard_size` thanks to Parquet compression for example. num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally. @@ -1262,9 +1262,9 @@ def _prepare_split( split_generator: `SplitGenerator`, Split generator to process file_format (:obj:`str`, optional): format of the data files in which the dataset will be written. Supported formats: "arrow", "parquet". Default to "arrow" format. - max_shard_size (:obj:`Union[str, int]`, optional): Approximate maximum number of bytes written per shard. - Only available for the "parquet" format with a default of "500MB". The size is based on uncompressed data size, - so in practice your shard files may be smaller than `max_shard_size` thanks to Parquet compression. + max_shard_size (:obj:`Union[str, int]`, optional): Maximum number of bytes written per shard, default is "500MB". + The size is based on uncompressed data size, so in practice your shard files may be smaller than + `max_shard_size` thanks to Parquet compression for example. num_proc (:obj:`int`, optional, default `None`): Number of processes when downloading and generating the dataset locally. Multiprocessing is disabled by default.