Fix Video DDP (#1189)

Lightning-Universe · Feb 23, 2022 · de4e856
1 parent 0cd7bb6
commit de4e856
Show file tree

Hide file tree

Showing 6 changed files with 9 additions and 10 deletions.
diff --git a/.azure-pipelines/gpu-example-tests.yml b/.azure-pipelines/gpu-example-tests.yml
@@ -14,6 +14,7 @@ jobs:
     - "image"
     - "text"
     - "tabular"
+    - "video"
     gpu_inds:
     - "0"
     - "0,1"
diff --git a/.azure-pipelines/testing-template.yml b/.azure-pipelines/testing-template.yml
@@ -43,7 +43,7 @@ jobs:
 
       - bash: |
           python -c "import torch; print(f'found GPUs: {torch.cuda.device_count()}')"
-          python -m coverage run --source flash -m pytest flash tests/examples/test_scripts.py -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=30
+          python -m coverage run --source flash -m pytest tests/examples/test_scripts.py -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=30
         env:
           CUDA_VISIBLE_DEVICES: ${{gids}}
         displayName: 'Testing'

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed a bug where DDP would not work with Flash tasks ([#1182](https://github.com/PyTorchLightning/lightning-flash/pull/1182))
 
+- Fixed DDP support for `VideoClassifier` ([#1189](https://github.com/PyTorchLightning/lightning-flash/pull/1189))
+
 ## [0.7.0] - 2022-02-15
 
 ### Added

diff --git a/flash/core/trainer.py b/flash/core/trainer.py
@@ -79,7 +79,6 @@ def __init__(self, *args, **kwargs):
         if flash._IS_TESTING:
             if torch.cuda.is_available():
                 kwargs["gpus"] = -1
-                kwargs["max_epochs"] = 3
                 kwargs["limit_train_batches"] = 1.0
                 kwargs["limit_val_batches"] = 1.0
                 kwargs["limit_test_batches"] = 1.0

diff --git a/flash/video/classification/model.py b/flash/video/classification/model.py
@@ -120,13 +120,13 @@ def __init__(
 
     def on_train_start(self) -> None:
         if accelerator_connector(self.trainer).is_distributed:
-            encoded_dataset = self.trainer.train_dataloader.loaders.dataset.dataset
+            encoded_dataset = self.trainer.train_dataloader.loaders.dataset.data
             encoded_dataset._video_sampler = DistributedSampler(encoded_dataset._labeled_videos)
         super().on_train_start()
 
     def on_train_epoch_start(self) -> None:
         if accelerator_connector(self.trainer).is_distributed:
-            encoded_dataset = self.trainer.train_dataloader.loaders.dataset.dataset
+            encoded_dataset = self.trainer.train_dataloader.loaders.dataset.data
             encoded_dataset._video_sampler.set_epoch(self.trainer.current_epoch)
         super().on_train_epoch_start()
 
@@ -147,8 +147,3 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> A
     def modules_to_freeze(self) -> Union[nn.Module, Iterable[Union[nn.Module, Iterable]]]:
         """Return the module attributes of the model to be frozen."""
         return list(self.backbone.children())
-
-    @staticmethod
-    def _ci_benchmark_fn(history: List[Dict[str, Any]]):
-        """This function is used only for debugging usage with CI."""
-        assert history[-1]["val_accuracy"] > 0.70
diff --git a/flash_examples/video_classification.py b/flash_examples/video_classification.py
@@ -34,7 +34,9 @@
 model = VideoClassifier(backbone="x3d_xs", labels=datamodule.labels, pretrained=False)
 
 # 3. Create the trainer and finetune the model
-trainer = flash.Trainer(max_epochs=3, gpus=torch.cuda.device_count(), fast_dev_run=True)
+trainer = flash.Trainer(
+    max_epochs=1, gpus=torch.cuda.device_count(), strategy="ddp" if torch.cuda.device_count() > 1 else None
+)
 trainer.finetune(model, datamodule=datamodule, strategy="freeze")
 
 # 4. Make a prediction