From 6592bac6a2df72906886dafaaf455c788c5ef3ea Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Wed, 9 Nov 2022 10:38:02 +0100 Subject: [PATCH 01/11] add custom data iter docs --- .../data/custom_data_iterables.rst | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 docs/source-pytorch/data/custom_data_iterables.rst diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst new file mode 100644 index 0000000000000..bb0d076174b60 --- /dev/null +++ b/docs/source-pytorch/data/custom_data_iterables.rst @@ -0,0 +1,109 @@ +.. _dataiters: + +################################## +Injecting 3rd Party Data Iterables +################################## + +When training a model with on a specific task, dataloading and preprocessing might become a bottleneck. +Lightning does not enforce a specific data loading approach nor does it try to control it. +The only assumption Lightning makes is that the data is returned as an iterable of batches. + +For PyTorch-based programs these iterables are typically instances of :class:`~torch.utils.data.DataLoader`. + +However, Lightning also supports other data types such as plain list of batches, generators or other custom iterables. + +.. code-block:: python + + # random list of batches + data = [(torch.rand(32, 3, 32, 32), torch.randint(0, 10, (32,))) for _ in range(100)] + model = LitClassifier() + trainer = Trainer() + trainer.fit(model, data) + +Examples for custom iterables include `NVIDIA DALI `__ or `FFCV `__ for computer vision. +Both libraries offer support for custom data loading and preprocessing (also hardware accelerated) and can be used with Lightning. + + +For example taking the example from FFCV's readme, we can use it with Lightning by just replacing the hardcoded ``ToDevice(0)`` +which would always take the first GPU, no matter the actual process, with ``ToDevice(self.trainer.local_rank)`` to correctly map to the desired GPU. + +.. code-block:: python + + from ffcv.loader import Loader, OrderOption + from ffcv.transforms import ToTensor, ToDevice, ToTorchImage, Cutout + from ffcv.fields.decoders import IntDecoder, RandomResizedCropRGBImageDecoder + + + class CustomClf(LitClassifier): + def train_dataloader(self): + + # Random resized crop + decoder = RandomResizedCropRGBImageDecoder((224, 224)) + + # Data decoding and augmentation + image_pipeline = [decoder, Cutout(), ToTensor(), ToTorchImage(), ToDevice(self.trainer.local_rank)] + label_pipeline = [IntDecoder(), ToTensor(), ToDevice(self.trainer.local_rank)] + + # Pipeline for each data field + pipelines = {"image": image_pipeline, "label": label_pipeline} + + # Replaces PyTorch data loader (`torch.utils.data.Dataloader`) + loader = Loader( + write_path, batch_size=bs, num_workers=num_workers, order=OrderOption.RANDOM, pipelines=pipelines + ) + + return loader + +When moving data to a specific device, you can always refer to ``self.trainer.local_rank`` to get the accelerator +used by the current process. + +By just changing ``device_id=0`` to ``device_id=self.trainer.local_rank`` we can also leverage DALI's GPU decoding: + +.. 
code-block:: python + + from nvidia.dali.pipeline import pipeline_def + import nvidia.dali.types as types + import nvidia.dali.fn as fn + from nvidia.dali.plugin.pytorch import DALIGenericIterator + import os + + + class CustomLitClassifier(LitClassifier): + def train_dataloader(self): + + # To run with different data, see documentation of nvidia.dali.fn.readers.file + # points to https://github.com/NVIDIA/DALI_extra + data_root_dir = os.environ["DALI_EXTRA_PATH"] + images_dir = os.path.join(data_root_dir, "db", "single", "jpeg") + + @pipeline_def(num_threads=4, device_id=self.trainer.local_rank) + def get_dali_pipeline(): + images, labels = fn.readers.file(file_root=images_dir, random_shuffle=True, name="Reader") + # decode data on the GPU + images = fn.decoders.image_random_crop(images, device="mixed", output_type=types.RGB) + # the rest of processing happens on the GPU as well + images = fn.resize(images, resize_x=256, resize_y=256) + images = fn.crop_mirror_normalize( + images, + crop_h=224, + crop_w=224, + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255], + mirror=fn.random.coin_flip(), + ) + return images, labels + + train_data = DALIGenericIterator( + [get_dali_pipeline(batch_size=16)], + ["data", "label"], + reader_name="Reader", + ) + + return train_data + + +Lightning works seamlessly with all kinds of custom data iterables, +but unfortunately it cannot support the entire featureset with arbitrary iterables as some are specific to dataloaders. + +These features are mainly automatic replacement of the sampler and fully fault-tolerant training as these dataloaders +typically don't expose sampling APIs to fast-forward or save and load states. From ea35db1198816f9ca88fab4351e86fad4e46f794 Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Wed, 9 Nov 2022 10:39:57 +0100 Subject: [PATCH 02/11] add custom data iter docs --- docs/source-pytorch/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst index 1c867e1e345e9..e8803ba147e83 100644 --- a/docs/source-pytorch/index.rst +++ b/docs/source-pytorch/index.rst @@ -207,6 +207,7 @@ Current Lightning Users Train on single or multiple TPUs Train on MPS Use a pretrained model + Inject Custom Data Iterables model/own_your_loop .. toctree:: From 5bfa795d5349bc6a0a33bda0e396dfe7cecb6ebf Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Wed, 9 Nov 2022 18:17:02 +0100 Subject: [PATCH 03/11] Update docs/source-pytorch/data/custom_data_iterables.rst --- docs/source-pytorch/data/custom_data_iterables.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst index bb0d076174b60..03e9293739c92 100644 --- a/docs/source-pytorch/data/custom_data_iterables.rst +++ b/docs/source-pytorch/data/custom_data_iterables.rst @@ -104,6 +104,5 @@ By just changing ``device_id=0`` to ``device_id=self.trainer.local_rank`` we can Lightning works seamlessly with all kinds of custom data iterables, but unfortunately it cannot support the entire featureset with arbitrary iterables as some are specific to dataloaders. - These features are mainly automatic replacement of the sampler and fully fault-tolerant training as these dataloaders typically don't expose sampling APIs to fast-forward or save and load states. 
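To make the iterable-of-batches contract described above concrete, here is a minimal, self-contained sketch of a
custom iterable used as a training dataloader. It is illustrative only: ``RandomBatchIterable`` and the tiny
``LitClassifier`` stand-in are defined just for this example and are not APIs from the patches above.

.. code-block:: python

    import torch
    from torch import nn
    from torch.nn import functional as F
    import pytorch_lightning as pl


    class RandomBatchIterable:
        """Minimal custom iterable: every item it yields is one (input, target) batch."""

        def __init__(self, num_batches=100, batch_size=32):
            self.num_batches = num_batches
            self.batch_size = batch_size

        def __len__(self):
            return self.num_batches

        def __iter__(self):
            for _ in range(self.num_batches):
                yield torch.rand(self.batch_size, 32), torch.randint(0, 10, (self.batch_size,))


    class LitClassifier(pl.LightningModule):
        """Tiny stand-in model so the sketch runs end to end."""

        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(32, 10)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return F.cross_entropy(self.layer(x), y)

        def configure_optimizers(self):
            return torch.optim.SGD(self.parameters(), lr=0.1)

        def train_dataloader(self):
            # Any iterable of batches works here, not only a torch.utils.data.DataLoader.
            return RandomBatchIterable()


    if __name__ == "__main__":
        trainer = pl.Trainer(max_epochs=1, accelerator="cpu", devices=1)
        trainer.fit(LitClassifier())

Anything that can be iterated to produce batches (a list, a generator, or a small class like this one) can be
returned from ``train_dataloader`` or passed directly to ``trainer.fit``.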
From b7a690eac2f6453fd589bb30c96ee6845e5c58aa Mon Sep 17 00:00:00 2001 From: Justus Schock Date: Thu, 10 Nov 2022 11:52:20 +0100 Subject: [PATCH 04/11] remove ToDevice --- docs/source-pytorch/data/custom_data_iterables.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst index 03e9293739c92..c59ae54b8a84c 100644 --- a/docs/source-pytorch/data/custom_data_iterables.rst +++ b/docs/source-pytorch/data/custom_data_iterables.rst @@ -24,8 +24,9 @@ Examples for custom iterables include `NVIDIA DALI Date: Fri, 11 Nov 2022 10:57:16 +0900 Subject: [PATCH 05/11] nit --- docs/source-pytorch/data/custom_data_iterables.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst index c59ae54b8a84c..2b793b6b79f8f 100644 --- a/docs/source-pytorch/data/custom_data_iterables.rst +++ b/docs/source-pytorch/data/custom_data_iterables.rst @@ -4,11 +4,11 @@ Injecting 3rd Party Data Iterables ################################## -When training a model with on a specific task, dataloading and preprocessing might become a bottleneck. +When training a model on a specific task, data loading and preprocessing might become a bottleneck. Lightning does not enforce a specific data loading approach nor does it try to control it. The only assumption Lightning makes is that the data is returned as an iterable of batches. -For PyTorch-based programs these iterables are typically instances of :class:`~torch.utils.data.DataLoader`. +For PyTorch-based programs, these iterables are typically instances of :class:`~torch.utils.data.DataLoader`. However, Lightning also supports other data types such as plain list of batches, generators or other custom iterables. @@ -24,7 +24,7 @@ Examples for custom iterables include `NVIDIA DALI Date: Fri, 11 Nov 2022 11:28:21 +0100 Subject: [PATCH 06/11] Update docs/source-pytorch/data/custom_data_iterables.rst Co-authored-by: Luca Antiga --- docs/source-pytorch/data/custom_data_iterables.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst index 2b793b6b79f8f..bc5e41a429edc 100644 --- a/docs/source-pytorch/data/custom_data_iterables.rst +++ b/docs/source-pytorch/data/custom_data_iterables.rst @@ -35,7 +35,7 @@ as Lightning takes care of GPU placement. 
In case you want to use some data tran
     from ffcv.loader import Loader, OrderOption
     from ffcv.transforms import ToTensor, ToDevice, ToTorchImage, Cutout
     from ffcv.fields.decoders import IntDecoder, RandomResizedCropRGBImageDecoder


-    class CustomClf(LitClassifier):
+    class CustomClassifier(LitClassifier):
         def train_dataloader(self):

             # Random resized crop

From 8cd9345d5cf5a6541f8919bb31be8b1f2da5bb8a Mon Sep 17 00:00:00 2001
From: Justus Schock
Date: Fri, 11 Nov 2022 11:36:40 +0100
Subject: [PATCH 07/11] clarification for @lantiga

---
 .../data/custom_data_iterables.rst            | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst
index bc5e41a429edc..8dcb7869759aa 100644
--- a/docs/source-pytorch/data/custom_data_iterables.rst
+++ b/docs/source-pytorch/data/custom_data_iterables.rst
@@ -103,7 +103,20 @@ By just changing ``device_id=0`` to ``device_id=self.trainer.local_rank`` we can

         return train_data

-Lightning works seamlessly with all kinds of custom data iterables,
-but unfortunately it cannot support the entire featureset with arbitrary iterables as some are specific to dataloaders.
-These features are mainly automatic replacement of the sampler and fully fault-tolerant training as these dataloaders
-typically don't expose sampling APIs to fast-forward or save and load states.
+Limitiations
+------------
+Lightning works with all kinds of custom data iterables as shown above. There are, however, a few features that cannot
+be supported this way. These restrictions come from the fact that for their support,
+Lightning needs to know a lot about the internals of these iterables.
+
+- In a distributed multi-GPU setting (ddp),
+  Lightning automatically replaces the DataLoader's sampler with its distributed counterpart.
+  This makes sure that each GPU sees a different part of the dataset.
+  As sampling can be implemented in arbitrary ways with custom iterables,
+  there is no way for Lightning to know how to replace the sampler.
+
+- When training fails for some reason, Lightning is able to extract all of the relevant data from the model,
+  optimizers, trainer and dataloader to resume training at the exact same batch where it crashed.
+  This feature is called fault-tolerance and is limited to PyTorch DataLoaders as well as
+  Lightning also needs to know a lot about sampling, fast forwarding and random number handling to enable this,
+  meaning that this cannot be supported for arbitrary iterables either.

From e8d3c295ff8028d4b5ec12d0d12c587c0f92831c Mon Sep 17 00:00:00 2001
From: Justus Schock <12886177+justusschock@users.noreply.github.com>
Date: Fri, 11 Nov 2022 14:32:27 +0100
Subject: [PATCH 08/11] typo

---
 docs/source-pytorch/data/custom_data_iterables.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst
index 8dcb7869759aa..cc244cdb1073d 100644
--- a/docs/source-pytorch/data/custom_data_iterables.rst
+++ b/docs/source-pytorch/data/custom_data_iterables.rst
@@ -103,7 +103,7 @@ By just changing ``device_id=0`` to ``device_id=self.trainer.local_rank`` we can

         return train_data

-Limitiations
+Limitations
 ------------
 Lightning works with all kinds of custom data iterables as shown above. There are, however, a few features that cannot
 be supported this way. These restrictions come from the fact that for their support,

From 9ae78d3ed97b2da811618155b79e493ada76188e Mon Sep 17 00:00:00 2001
From: Luca Antiga
Date: Sat, 12 Nov 2022 16:15:56 +0100
Subject: [PATCH 09/11] Update docs/source-pytorch/data/custom_data_iterables.rst

---
 docs/source-pytorch/data/custom_data_iterables.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst
index cc244cdb1073d..b93f6c3161520 100644
--- a/docs/source-pytorch/data/custom_data_iterables.rst
+++ b/docs/source-pytorch/data/custom_data_iterables.rst
@@ -117,6 +117,6 @@ Lightning needs to know a lot about the internals of these iterables.

 - When training fails for some reason, Lightning is able to extract all of the relevant data from the model,
   optimizers, trainer and dataloader to resume training at the exact same batch where it crashed.
-  This feature is called fault-tolerance and is limited to PyTorch DataLoaders as well as
+  This feature is called fault-tolerance and is limited to PyTorch DataLoaders.
   Lightning also needs to know a lot about sampling, fast forwarding and random number handling to enable this,
   meaning that this cannot be supported for arbitrary iterables either.

From 2507f8bcf948e9c37004956e096c41e360a5b62e Mon Sep 17 00:00:00 2001
From: Luca Antiga
Date: Sat, 12 Nov 2022 16:16:05 +0100
Subject: [PATCH 10/11] Update docs/source-pytorch/data/custom_data_iterables.rst

---
 docs/source-pytorch/data/custom_data_iterables.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst
index b93f6c3161520..eeed3ebb07dd4 100644
--- a/docs/source-pytorch/data/custom_data_iterables.rst
+++ b/docs/source-pytorch/data/custom_data_iterables.rst
@@ -118,5 +118,5 @@ Lightning needs to know a lot about the internals of these iterables.
   optimizers, trainer and dataloader to resume training at the exact same batch where it crashed.
   This feature is called fault-tolerance and is limited to PyTorch DataLoaders.
-  Lightning also needs to know a lot about sampling, fast forwarding and random number handling to enable this,
+  Lightning needs to know a lot about sampling, fast forwarding and random number handling to enable fault tolerance,
   meaning that this cannot be supported for arbitrary iterables either.

From 16b89fcc2bff644ad667d5e1bd1576bd2de5bb4e Mon Sep 17 00:00:00 2001
From: Luca Antiga
Date: Sat, 12 Nov 2022 16:16:13 +0100
Subject: [PATCH 11/11] Update docs/source-pytorch/data/custom_data_iterables.rst

---
 docs/source-pytorch/data/custom_data_iterables.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/data/custom_data_iterables.rst b/docs/source-pytorch/data/custom_data_iterables.rst
index eeed3ebb07dd4..3b124c1356aee 100644
--- a/docs/source-pytorch/data/custom_data_iterables.rst
+++ b/docs/source-pytorch/data/custom_data_iterables.rst
@@ -119,4 +119,4 @@ Lightning needs to know a lot about the internals of these iterables.
   optimizers, trainer and dataloader to resume training at the exact same batch where it crashed.
   This feature is called fault-tolerance and is limited to PyTorch DataLoaders.
   Lightning needs to know a lot about sampling, fast forwarding and random number handling to enable fault tolerance,
-  meaning that this cannot be supported for arbitrary iterables either.
+ meaning that it cannot be supported for arbitrary iterables.
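For readers who hit the first limitation above (no automatic distributed sampler injection for custom iterables),
one possible workaround is to shard the iterable manually with the rank information the trainer already exposes.
The sketch below is illustrative only: ``ShardedIterable`` and ``ShardedClassifier`` are not Lightning APIs, and the
``LitClassifier`` base class is assumed to be the one from the examples above.

.. code-block:: python

    import torch


    class ShardedIterable:
        """Yield every ``world_size``-th batch, offset by this process's global rank."""

        def __init__(self, batches, rank, world_size):
            self.batches = batches
            self.rank = rank
            self.world_size = world_size

        def __iter__(self):
            for idx, batch in enumerate(self.batches):
                # Roughly mimic what a DistributedSampler does index-wise for map-style
                # datasets: each process only sees its own disjoint slice of the batches.
                if idx % self.world_size == self.rank:
                    yield batch


    class ShardedClassifier(LitClassifier):
        def train_dataloader(self):
            data = [(torch.rand(32, 3, 32, 32), torch.randint(0, 10, (32,))) for _ in range(100)]
            return ShardedIterable(data, self.trainer.global_rank, self.trainer.world_size)

The second limitation has no equally simple workaround: fault-tolerant resumption relies on sampler and random-number
state that only a :class:`~torch.utils.data.DataLoader` exposes, so with an arbitrary iterable any such state would
have to be saved and restored manually.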