Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Commit

Permalink
Add support for (input, target) style datasets (e.g. torchvision) to from_datasets (#552)
Browse files Browse the repository at this point in the history

* Add support for torchvision data sets to from_datasets

* Update CHANGELOG.md
  • Loading branch information
ethanwharris committed Jul 7, 2021
1 parent 5d98cde commit bf5d679
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 21 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Added support for (input, target) style datasets (e.g. torchvision) to the from_datasets method ([#552](https://github.com/PyTorchLightning/lightning-flash/pull/552))

### Changed

Expand Down
24 changes: 11 additions & 13 deletions flash/core/data/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,20 +334,18 @@ def generate_dataset(
SEQUENCE_DATA_TYPE = TypeVar("SEQUENCE_DATA_TYPE")


class DatasetDataSource(DataSource):
    """Data source whose ``load_data`` input is already a dataset object."""

    def load_data(self, dataset: Dataset, auto_dataset: AutoDataset) -> Dataset:
        """Return ``dataset`` unchanged, recording its first loaded sample while training.

        Args:
            dataset: The dataset to serve; it is handed back as-is.
            auto_dataset: Object whose ``sample`` attribute receives the loaded first item.

        Returns:
            The ``dataset`` argument itself.
        """
        if not self.training:
            return dataset

        # Keep one loaded sample around so that the shape can be inferred from it.
        load_sample_params = signature(self.load_sample).parameters
        accepts_dataset = len(load_sample_params) > 1 and AutoDataset.DATASET_KEY in load_sample_params
        # NOTE(review): the data source itself is forwarded as the second argument - confirm this is intended.
        extra_args = (self, ) if accepts_dataset else ()
        auto_dataset.sample = self.load_sample(dataset[0], *extra_args)
        return dataset
class DatasetDataSource(DataSource[Dataset]):
    """The ``DatasetDataSource`` implements default behaviours for data sources which expect the input to
    :meth:`~flash.core.data.data_source.DataSource.load_data` to be a :class:`torch.utils.data.dataset.Dataset`.

    Args:
        labels: Optionally pass the labels as a mapping from class index to label string. These will then be set as the
            :class:`~flash.core.data.data_source.LabelsState`.
    """

    # NOTE(review): no constructor accepting ``labels`` is visible in this class - confirm that the base class
    # takes it, otherwise the ``Args`` entry above should be dropped.

    def load_sample(self, sample: Any, dataset: Optional[Any] = None) -> Mapping[str, Any]:
        """Wrap a raw dataset item in a mapping keyed by ``DefaultDataKeys``.

        Args:
            sample: The item to wrap. A ``tuple`` of exactly two elements is read as ``(input, target)``;
                anything else is treated as the input on its own.
            dataset: Not used by this implementation; optional so the method can be called with the sample only.

        Returns:
            ``{INPUT: sample[0], TARGET: sample[1]}`` for a 2-tuple, otherwise ``{INPUT: sample}``.
        """
        # Only 2-tuples are split: lists and tuples of any other length are kept whole as the input.
        if isinstance(sample, tuple) and len(sample) == 2:
            return {DefaultDataKeys.INPUT: sample[0], DefaultDataKeys.TARGET: sample[1]}
        return {DefaultDataKeys.INPUT: sample}


Expand Down
9 changes: 1 addition & 8 deletions tests/core/data/test_auto_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@

from flash.core.data.auto_dataset import AutoDataset, BaseAutoDataset, IterableAutoDataset
from flash.core.data.callback import FlashCallback
from flash.core.data.data_module import DataModule
from flash.core.data.data_source import DataSource, DefaultDataKeys
from flash.core.data.data_source import DataSource


class _AutoDatasetTestDataSource(DataSource):
Expand Down Expand Up @@ -189,9 +188,3 @@ def test_preprocessing_data_source_with_running_stage(with_dataset):
else:
assert data_source.train_load_sample_count == len(dataset)
assert data_source.train_load_data_count == 1


def test_dataset_data_source():
    """The train dataset built by ``from_datasets`` from ``range(10)`` inputs exposes ``{INPUT: 0}`` as its sample."""
    data_module = DataModule.from_datasets(range(10), range(10))
    stored_sample = data_module.train_dataset.sample
    assert stored_sample == {DefaultDataKeys.INPUT: 0}
23 changes: 23 additions & 0 deletions tests/core/data/test_data_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from flash.core.data.data_source import DatasetDataSource, DefaultDataKeys


def test_dataset_data_source():
    """Check ``DatasetDataSource.load_sample`` for a 2-tuple sample and for a bare sample."""
    source = DatasetDataSource()
    features, label = 'test', 3

    # A 2-tuple is split into input and target.
    paired = source.load_sample((features, label))
    assert paired == {DefaultDataKeys.INPUT: features, DefaultDataKeys.TARGET: label}

    # Anything else is wrapped whole as the input.
    bare = source.load_sample(features)
    assert bare == {DefaultDataKeys.INPUT: features}
30 changes: 30 additions & 0 deletions tests/image/classification/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

if _TORCHVISION_AVAILABLE:
import torchvision
from torchvision.datasets import FakeData

if _PIL_AVAILABLE:
from PIL import Image
Expand Down Expand Up @@ -443,3 +444,32 @@ def test_from_fiftyone(tmpdir):
assert imgs.shape == (2, 3, 196, 196)
assert labels.shape == (2, )
assert sorted(list(labels.numpy())) == [0, 1]


@pytest.mark.skipif(not _IMAGE_TESTING, reason="image libraries aren't installed.")
def test_from_datasets():
    """Build ``ImageClassificationData`` from three ``FakeData`` datasets and check one batch per dataloader."""

    def fake_split():
        # Every split uses an identically configured, independently constructed dataset.
        return FakeData(size=3, num_classes=2)

    img_data = ImageClassificationData.from_datasets(
        train_dataset=fake_split(),
        val_dataset=fake_split(),
        test_dataset=fake_split(),
        batch_size=2,
        num_workers=0,
    )

    # Training, validation and test data are checked in that order; each loader is only
    # looked up and created once the previous one has been verified.
    for stage in ("train", "val", "test"):
        loader = getattr(img_data, f"{stage}_dataloader")()
        batch = next(iter(loader))
        imgs, labels = batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]
        assert imgs.shape == (2, 3, 196, 196)
        assert labels.shape == (2, )

0 comments on commit bf5d679

Please sign in to comment.