Skip to content

Commit

Permalink
with_format docstring (#7203)
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq authored Oct 8, 2024
1 parent cdb1d32 commit d305e43
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 20 deletions.
26 changes: 23 additions & 3 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2649,12 +2649,32 @@ def with_format(
'format_kwargs': {},
'output_all_columns': False,
'type': None}
>>> ds = ds.with_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
>>> ds = ds.with_format("torch")
>>> ds.format
{'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
{'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
'format_kwargs': {},
'output_all_columns': False,
'type': 'tensorflow'}
'type': 'torch'}
>>> ds[0]
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
'label': tensor(1),
'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617,
1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105,
1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]),
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
```
"""
dataset = copy.deepcopy(self)
Expand Down
64 changes: 51 additions & 13 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,12 +693,32 @@ def with_format(
'format_kwargs': {},
'output_all_columns': False,
'type': None}
>>> ds = ds.with_format(type='tensorflow', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
>>> ds = ds.with_format("torch")
>>> ds["train"].format
{'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
{'columns': ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
'format_kwargs': {},
'output_all_columns': False,
'type': 'tensorflow'}
'type': 'torch'}
>>> ds["train"][0]
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
'label': tensor(1),
'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617,
1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105,
1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]),
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
```
"""
dataset = copy.deepcopy(self)
Expand Down Expand Up @@ -1801,25 +1821,43 @@ def with_format(
) -> "IterableDatasetDict":
"""
Return a dataset with the specified format.
This method only supports the "torch" format for now.
The format is set to all the datasets of the dataset dictionary.
The 'pandas' format is currently not implemented.
Args:
type (`str`, *optional*, defaults to `None`):
If set to "torch", the returned dataset
will be a subclass of `torch.utils.data.IterableDataset` to be used in a `DataLoader`.
type (`str`, *optional*):
Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'arrow', 'jax']`.
`None` means it returns python objects (default).
Example:
```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", streaming=True)
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> def encode(example):
... return tokenizer(examples["text"], truncation=True, padding="max_length")
>>> ds = ds.map(encode, batched=True, remove_columns=["text"])
>>> ds = load_dataset("rotten_tomatoes", split="validation", streaming=True)
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
>>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)
>>> ds = ds.with_format("torch")
>>> next(iter(ds))
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
'label': tensor(1),
'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617,
1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105,
1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]),
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
```
"""
return IterableDatasetDict({k: dataset.with_format(type=type) for k, dataset in self.items()})
Expand Down
39 changes: 35 additions & 4 deletions src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2178,13 +2178,44 @@ def with_format(
) -> "IterableDataset":
"""
Return a dataset with the specified format.
Supported formats: "arrow", or None for regular python objects.
The other formats are currently not implemented.
The 'pandas' format is currently not implemented.
Args:
type (`str`, optional, default None): if set to "torch", the returned dataset
will be a subclass of torch.utils.data.IterableDataset to be used in a DataLoader
type (`str`, *optional*):
Either output type selected in `[None, 'numpy', 'torch', 'tensorflow', 'arrow', 'jax']`.
`None` means it returns python objects (default).
Example:
```py
>>> from datasets import load_dataset
>>> from transformers import AutoTokenizer
>>> ds = load_dataset("rotten_tomatoes", split="validation", streaming=True)
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
>>> ds = ds.map(lambda x: tokenizer(x['text'], truncation=True, padding=True), batched=True)
>>> ds = ds.with_format("torch")
>>> next(iter(ds))
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
'label': tensor(1),
'input_ids': tensor([ 101, 18027, 16310, 16001, 1103, 9321, 178, 11604, 7235, 6617,
1742, 2165, 2820, 1206, 6588, 22572, 12937, 1811, 2153, 1105,
1147, 12890, 19587, 6463, 1105, 15026, 1482, 119, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]),
'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
```
"""
type = get_format_type_from_alias(type)
# TODO(QL): add format_kwargs
Expand Down

0 comments on commit d305e43

Please sign in to comment.