Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cherry-pick] Add random_split and Subset dataset (#29291) #32090

Merged
merged 1 commit into from
Apr 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 129 additions & 1 deletion python/paddle/fluid/dataloader/dataset.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

__all__ = [
"Dataset", "IterableDataset", "TensorDataset", "ComposeDataset",
"ChainDataset"
"ChainDataset", "random_split", "Subset"
]


Expand Down Expand Up @@ -405,3 +405,131 @@ def __iter__(self):
for dataset in self.datasets:
for sample in dataset:
yield sample


class Subset(Dataset):
    """
    A view over ``dataset`` restricted to the given ``indices``.

    Indices may repeat; in that case the corresponding sample appears
    multiple times in the subset.

    Args:
        dataset (Dataset): The whole Dataset.
        indices (sequence): Indices in the whole set selected for subset.

    Returns:
        Dataset: A Dataset which is the subset of the original dataset.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import Subset

            # example 1:
            a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2])
            print(list(a))
            # [1, 3]

            # example 2:
            b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1])
            print(list(b))
            # [2, 2]
    """

    def __init__(self, dataset, indices):
        self.dataset = dataset
        self.indices = indices

    def __getitem__(self, idx):
        # Translate the subset-local index into an index of the
        # underlying dataset before delegating the lookup.
        mapped_idx = self.indices[idx]
        return self.dataset[mapped_idx]

    def __len__(self):
        # The subset is exactly as long as its index list (repeats count).
        return len(self.indices)


def random_split(dataset, lengths, generator=None):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths.

    Args:
        dataset (Dataset): Dataset to be split.
        lengths (sequence): lengths of splits to be produced. Each length
            must be non-negative and the lengths must sum to ``len(dataset)``.
        generator (Generator, optional): Generator used for the random
            permutation. Default is None then the DefaultGenerator is used
            in manual_seed().
            NOTE(review): this argument is currently accepted but not
            forwarded to ``paddle.randperm`` -- confirm intended behavior.

    Returns:
        Datasets: A list of subset Datasets, which are the non-overlapping
            subsets of the original Dataset.

    Raises:
        ValueError: If any length is negative or the lengths do not sum to
            ``len(dataset)``.

    Example code:

        .. code-block:: python

            import paddle
            from paddle.io import random_split

            a_list = paddle.io.random_split(range(10), [3, 7])
            print(len(a_list))
            # 2

            # The subsets hold a random permutation of the input, so the
            # values below are one possible outcome.
            for idx, v in enumerate(a_list[0]):
                print(idx, v)
            # output of the first subset
            # 0 1
            # 1 3
            # 2 9

            for idx, v in enumerate(a_list[1]):
                print(idx, v)
            # output of the second subset
            # 0 5
            # 1 7
            # 2 8
            # 3 6
            # 4 0
            # 5 2
            # 6 4
    """
    # Reject impossible splits up front: a negative length can make the
    # sum check pass (e.g. [6, -1] on a dataset of 5) while silently
    # producing wrong-sized subsets from the slicing below.
    if any(length < 0 for length in lengths):
        raise ValueError("lengths must be non-negative!")
    # Cannot verify that dataset is Sized
    if sum(lengths) != len(dataset):  # type: ignore
        raise ValueError(
            "Sum of input lengths does not equal the length of the input dataset!"
        )
    # TODO(@Joejiong): support Variable or Tensor type with .tolist class member function.
    # For example var.item() and var.tolist()
    indices = paddle.randperm(sum(lengths)).numpy().tolist()
    # _accumulate yields running end-offsets, so each subset takes the
    # half-open slice [offset - length, offset) of the permutation.
    return [
        Subset(dataset, indices[offset - length:offset])
        for offset, length in zip(_accumulate(lengths), lengths)
    ]


def _accumulate(iterable, fn=lambda x, y: x + y):
"""
Return running totals

Args:
iterable: any iterable object for example dataset.
y (x): one element in the iterable object.
fn (x, y): Defaults to lambdax.

Yields:
yields total from beginning iterator to current iterator.

Example code:

.. code-block:: python

_accumulate([1,2,3,4,5]) --> 1 3 6 10 15
_accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
"""

it = iter(iterable)
try:
total = next(it)
except StopIteration:
return
yield total
for element in it:
total = fn(total, element)
yield total
127 changes: 110 additions & 17 deletions python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
import paddle
import paddle.fluid as fluid
from paddle.io import Dataset, IterableDataset, TensorDataset, \
ComposeDataset, ChainDataset, DataLoader
from paddle.fluid.dygraph.base import to_variable
ComposeDataset, ChainDataset, DataLoader, random_split, Subset

IMAGE_SIZE = 32

Expand Down Expand Up @@ -54,14 +53,14 @@ def __iter__(self):

class TestTensorDataset(unittest.TestCase):
def run_main(self, num_workers, places):
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
place = fluid.CPUPlace()
paddle.static.default_startup_program().random_seed = 1
paddle.static.default_main_program().random_seed = 1
place = paddle.CPUPlace()
with fluid.dygraph.guard(place):
input_np = np.random.random([16, 3, 4]).astype('float32')
input = to_variable(input_np)
input = paddle.to_tensor(input_np)
label_np = np.random.random([16, 1]).astype('int32')
label = to_variable(label_np)
label = paddle.to_tensor(label_np)

dataset = TensorDataset([input, label])
assert len(dataset) == 16
Expand All @@ -83,17 +82,17 @@ def run_main(self, num_workers, places):
assert np.allclose(label.numpy(), label_np[i])

def test_main(self):
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
places = [paddle.CPUPlace()]
if paddle.is_compiled_with_cuda():
places.append(paddle.CUDAPlace(0))
for p in places:
self.run_main(num_workers=0, places=p)


class TestComposeDataset(unittest.TestCase):
def test_main(self):
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
paddle.static.default_startup_program().random_seed = 1
paddle.static.default_main_program().random_seed = 1

dataset1 = RandomDataset(10)
dataset2 = RandomDataset(10)
Expand All @@ -110,10 +109,104 @@ def test_main(self):
assert np.allclose(label2, label2_t)


class TestRandomSplitApi(unittest.TestCase):
    """random_split must produce disjoint subsets that exactly cover the dataset."""

    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        dataset1, dataset2 = paddle.io.random_split(range(5), [1, 4])

        # assertEqual (not assertTrue on a comparison) so a failure
        # reports the actual lengths.
        self.assertEqual(len(dataset1), 1)
        self.assertEqual(len(dataset2), 4)

        # Every element of range(5) must appear in exactly one subset:
        # removing each observed value must empty the list without raising.
        elements_list = list(range(5))

        for val in dataset1:
            elements_list.remove(val)

        for val in dataset2:
            elements_list.remove(val)

        self.assertEqual(len(elements_list), 0)


class TestRandomSplitError(unittest.TestCase):
    """random_split must reject lengths that do not sum to len(dataset)."""

    def test_errors(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        # Too large, too large as a single split, and empty lengths all
        # fail the sum-equals-length check with ValueError.
        for bad_lengths in ([3, 8], [8], []):
            self.assertRaises(ValueError, paddle.io.random_split,
                              range(5), bad_lengths)


class TestSubsetDataset(unittest.TestCase):
    """Subset views of a TensorDataset must partition it without altering samples."""

    def run_main(self, num_workers, places):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        input_np = np.random.random([5, 3, 4]).astype('float32')
        label_np = np.random.random([5, 1]).astype('int32')
        dataset = TensorDataset(
            [paddle.to_tensor(input_np), paddle.to_tensor(label_np)])

        # Split the 5 samples into the even- and odd-indexed views.
        even_subset = paddle.io.Subset(dataset, [0, 2, 4])
        odd_subset = paddle.io.Subset(dataset, [1, 3])

        assert len(dataset) == 5

        def prepare_dataloader(ds):
            # One sample per batch so each yielded label maps to one sample.
            return DataLoader(
                ds,
                places=places,
                num_workers=num_workers,
                batch_size=1,
                drop_last=True)

        dataloader = prepare_dataloader(dataset)
        dataloader_even = prepare_dataloader(even_subset)
        dataloader_odd = prepare_dataloader(odd_subset)

        def assert_basic(image, label):
            # Each batch carries exactly one sample with the original shapes.
            assert len(image) == 1
            assert len(label) == 1
            assert image.shape == [1, 3, 4]
            assert label.shape == [1, 1]
            assert isinstance(image, paddle.Tensor)
            assert isinstance(label, paddle.Tensor)

        # Collect every label from the full dataset...
        elements_list = list()
        for image, label in dataloader():
            assert_basic(image, label)
            elements_list.append(label)

        # ...remove the ones served by the even subset...
        for image, label in dataloader_even():
            assert_basic(image, label)
            elements_list.remove(label)

        # ...and what remains must be exactly the odd subset's labels.
        odd_list = list()
        for image, label in dataloader_odd():
            assert_basic(image, label)
            odd_list.append(label)

        self.assertEqual(odd_list, elements_list)

    def test_main(self):
        paddle.static.default_startup_program().random_seed = 1
        paddle.static.default_main_program().random_seed = 1

        places = [paddle.CPUPlace()]
        if paddle.is_compiled_with_cuda():
            places.append(paddle.CUDAPlace(0))
        for p in places:
            self.run_main(num_workers=0, places=p)


class TestChainDataset(unittest.TestCase):
def run_main(self, num_workers, places):
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
paddle.static.default_startup_program().random_seed = 1
paddle.static.default_main_program().random_seed = 1

dataset1 = RandomIterableDataset(10)
dataset2 = RandomIterableDataset(10)
Expand All @@ -135,9 +228,9 @@ def run_main(self, num_workers, places):
idx += 1

def test_main(self):
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
places = [paddle.CPUPlace()]
if paddle.is_compiled_with_cuda():
places.append(paddle.CUDAPlace(0))
for p in places:
self.run_main(num_workers=0, places=p)

Expand Down
4 changes: 3 additions & 1 deletion python/paddle/io/__init__.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@
'SequenceSampler',
'RandomSampler',
'WeightedRandomSampler',
'random_split',
'Subset'
]

from ..fluid.io import DataLoader
from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \
TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler, \
ComposeDataset, ChainDataset, WeightedRandomSampler
ComposeDataset, ChainDataset, WeightedRandomSampler, Subset, random_split