From d305e437e84e6a4777ad2f97ee6a7cac659b5710 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 8 Oct 2024 15:13:04 +0200 Subject: [PATCH] with_format docstring (#7203) --- src/datasets/arrow_dataset.py | 26 +++++++++++-- src/datasets/dataset_dict.py | 64 +++++++++++++++++++++++++------- src/datasets/iterable_dataset.py | 39 +++++++++++++++++-- 3 files changed, 109 insertions(+), 20 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index e9e074e0e97..b289fba4106 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -2649,12 +2649,32 @@ def with_format( 'format_kwargs': {}, 'output_all_columns': False, 'type': None} - >>> ds = ds.with_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']) + >>> ds = ds.with_format("torch") >>> ds.format - {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'], + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'format_kwargs': {}, 'output_all_columns': False, - 'type': 'tensorflow'} + 'type': 'torch'} + >>> ds[0] + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} ``` """ dataset = copy.deepcopy(self) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 40e14d346e1..f92a1a8afda 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -693,12 +693,32 @@ def with_format( 'format_kwargs': {}, 'output_all_columns': False, 'type': None} - >>> ds = ds.with_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label']) + >>> ds = ds.with_format("torch") >>> ds["train"].format - {'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'], + {'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'format_kwargs': {}, 'output_all_columns': False, - 'type': 'tensorflow'} + 'type': 'torch'} + >>> ds["train"][0] + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} ``` """ dataset = copy.deepcopy(self) @@ -1801,25 +1821,43 @@ def with_format( ) -> "IterableDatasetDict": """ Return a dataset with the specified format. - This method only supports the "torch" format for now. - The format is set to all the datasets of the dataset dictionary. + The 'pandas' format is currently not implemented. Args: - type (`str`, *optional*, defaults to `None`): - If set to "torch", the returned dataset - will be a subclass of `torch.utils.data.IterableDataset` to be used in a `DataLoader`. + + type (`str`, *optional*): + Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'arrow', 'jax']`. + `None` means it returns python objects (default). Example: ```py >>> from datasets import load_dataset - >>> ds = load_dataset("rotten_tomatoes", streaming=True) >>> from transformers import AutoTokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - >>> def encode(example): - ... return tokenizer(examples["text"], truncation=True, padding="max_length") - >>> ds = ds.map(encode, batched=True, remove_columns=["text"]) + >>> ds = load_dataset("rotten_tomatoes", split="validation", streaming=True) + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) >>> ds = ds.with_format("torch") + >>> next(iter(ds)) + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} ``` """ return IterableDatasetDict({k: dataset.with_format(type=type) for k, dataset in self.items()}) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 5f5c49f1556..7900b321ffe 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -2178,13 +2178,44 @@ def with_format( ) -> "IterableDataset": """ Return a dataset with the specified format. - Supported formats: "arrow", or None for regular python objects. - The other formats are currently not implemented. + The 'pandas' format is currently not implemented. Args: - type (`str`, optional, default None): if set to "torch", the returned dataset - will be a subclass of torch.utils.data.IterableDataset to be used in a DataLoader + type (`str`, *optional*): + Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'arrow', 'jax']`. + `None` means it returns python objects (default). + + Example: + + ```py + >>> from datasets import load_dataset + >>> from transformers import AutoTokenizer + >>> ds = load_dataset("rotten_tomatoes", split="validation", streaming=True) + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True) + >>> ds = ds.with_format("torch") + >>> next(iter(ds)) + {'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', + 'label': tensor(1), + 'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617, + 1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105, + 1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]), + 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])} + ``` """ type = get_format_type_from_alias(type) # TODO(QL): add format_kwargs