From 986899ee489ffe9b32dd8dbec2674732d187544f Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Fri, 7 May 2021 15:49:02 +0200
Subject: [PATCH] save_to_disk note in docs

---
 src/datasets/arrow_dataset.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 7f563b8f9d6..c0e41fbbfb1 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -551,6 +551,18 @@ def save_to_disk(self, dataset_path: str, fs=None):
         Saves a dataset to a dataset directory, or in a filesystem using either
         :class:`~filesystems.S3FileSystem` or any implementation of
         ``fsspec.spec.AbstractFileSystem``.
+
+        Note regarding sliced datasets:
+
+        If the dataset was sliced in some way (for example using shard, train_test_split or select), then an indices mapping
+        is added to avoid rewriting a new arrow Table (saving time and disk/memory usage).
+        It maps the indices used by __getitem__ to the right rows of the arrow Table.
+        By default, save_to_disk saves the full dataset table plus the indices mapping.
+
+        If you want to save only the shard of the dataset instead of the original arrow file and the indices,
+        then you have to call :func:`datasets.Dataset.flatten_indices` before saving.
+        This will create a new arrow table using only the right rows of the original table.
+
         Args:
             dataset_path (:obj:`str`): Path (e.g. `dataset/train`) or remote URI (e.g. `s3://my-bucket/dataset/train`)
                 of the dataset directory where the dataset will be saved to.
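
Below is a minimal usage sketch (not part of the patch) illustrating the behavior described in the added note: a dataset sliced with select() keeps an indices mapping, and calling Dataset.flatten_indices() before save_to_disk() writes only the selected rows instead of the full table plus the mapping. The dataset contents and the output path are made up for illustration.

    from datasets import Dataset

    # toy dataset, for illustration only
    ds = Dataset.from_dict({"text": ["a", "b", "c", "d"]})

    # select() does not rewrite the arrow table; it only adds an indices mapping
    subset = ds.select([0, 2])

    # flatten_indices() materializes a new arrow table containing only the selected rows,
    # so save_to_disk() writes just that shard instead of the original table + the mapping
    subset = subset.flatten_indices()
    subset.save_to_disk("dataset/train")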