Add branch condition for calling move to device in prefetch (FSDP 3/n) #6342

Status: Closed · wants to merge 5 commits
Showing changes from 4 commits
14 changes: 11 additions & 3 deletions pytorch_lightning/plugins/training_type/ddp.py
@@ -43,7 +43,6 @@
from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd, to_absolute_path


log = logging.getLogger(__name__)


@@ -253,8 +252,9 @@ def pre_dispatch(self):
         if self.sync_batchnorm:
             self.model = self.configure_sync_batchnorm(self.model)
 
-        # move the model to the correct device
-        self.model_to_device()
+        if self.call_move_to_device_hook_in_pre_dispatch:
+            # move the model to the correct device
+            self.model_to_device()
 
         self.configure_ddp()
@@ -313,3 +313,11 @@ def predict(self, *args, **kwargs):
     def post_training_step(self):
         if not self.lightning_module.automatic_optimization:
             self.model.require_backward_grad_sync = True
+
+    @property
+    def call_move_to_device_hook_in_pre_dispatch(self) -> bool:
+        """
+        Call the ``model_to_device`` function within pre_dispatch if this is set to True.
+        Useful when a plugin would like to call ``model_to_device`` at another time, or skip the call.
+        """
+        return True
Comment on lines +317 to +323 (Contributor):

Should this be in ParallelPlugin?
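
For illustration, a sharded plugin (such as the FSDP plugin this series builds towards) could override the new property to defer the device move. A minimal sketch follows; the subclass name and the point at which the move happens are hypothetical and not part of this PR:

```python
# Minimal sketch (hypothetical subclass, not part of this PR): a plugin that
# wraps or shards parameters can opt out of the eager move in ``pre_dispatch``
# and decide placement itself once wrapping has happened.
from pytorch_lightning.plugins import DDPPlugin


class ShardedExamplePlugin(DDPPlugin):

    @property
    def call_move_to_device_hook_in_pre_dispatch(self) -> bool:
        # Skip the move in pre_dispatch; placement happens after wrapping.
        return False

    def configure_ddp(self):
        # Wrap or shard ``self.model`` here first (details omitted), then move
        # whatever this rank owns to its device.
        ...
        self.model_to_device()
```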

15 changes: 12 additions & 3 deletions pytorch_lightning/plugins/training_type/ddp_spawn.py
@@ -59,7 +59,7 @@ def __init__(
         self.sync_batchnorm = sync_batchnorm
         self._ddp_kwargs = kwargs
         self.dist = LightningDistributed()
-        self.num_processes = len(parallel_devices)
+        self.num_processes = len(parallel_devices) if parallel_devices is not None else parallel_devices
         self.node_rank = 0
         self.mp_queue = None
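
A hypothetical usage note (not part of the diff), assuming ``parallel_devices`` defaults to ``None`` as the guard suggests: the plugin can now be constructed before devices are known, instead of raising ``TypeError: object of type 'NoneType' has no len()``.

```python
# Hypothetical illustration, assuming ``parallel_devices`` defaults to ``None``.
from pytorch_lightning.plugins import DDPSpawnPlugin

plugin = DDPSpawnPlugin()            # no devices attached yet
assert plugin.num_processes is None  # resolved once parallel_devices is set
```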

@@ -151,8 +151,9 @@ def new_process(self, process_idx, trainer, mp_queue):
         if self.sync_batchnorm:
             self.model = self.configure_sync_batchnorm(self.model)
 
-        # move the model to the correct device
-        self.model_to_device()
+        if self.call_move_to_device_hook_in_pre_dispatch:
+            # move the model to the correct device
+            self.model_to_device()
 
         self.configure_ddp()

@@ -290,3 +291,11 @@ def predict(self, *args, **kwargs):
     def post_training_step(self):
         if not self.lightning_module.automatic_optimization:
             self.model.require_backward_grad_sync = True
+
+    @property
+    def call_move_to_device_hook_in_pre_dispatch(self) -> bool:
+        """
+        Call the ``model_to_device`` function within pre_dispatch if this is set to True.
+        Useful when a plugin would like to call ``model_to_device`` at another time, or skip the call.
+        """
+        return True
28 changes: 27 additions & 1 deletion tests/accelerators/test_ddp.py
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from unittest import mock
 from unittest.mock import patch
 
 import pytest
 import torch
 
 from pytorch_lightning import Trainer
+from pytorch_lightning.plugins import DDPPlugin, DDPSpawnPlugin
 from tests.accelerators import ddp_model, DDPLauncher
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -91,7 +93,6 @@ def test_torch_distributed_backend_env_variables(tmpdir):
     _environ = {"PL_TORCH_DISTRIBUTED_BACKEND": "undefined", "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2"}
     with patch.dict(os.environ, _environ), \
             patch('torch.cuda.device_count', return_value=2):
-
         with pytest.raises(ValueError, match="Invalid backend: 'undefined'"):
             model = BoringModel()
             trainer = Trainer(
@@ -102,3 +103,28 @@
                 logger=False,
             )
             trainer.fit(model)
+
+
+@pytest.mark.parametrize('move_to_device_pre_dispatch_enabled', [True, False])
+@mock.patch('pytorch_lightning.plugins.DDPPlugin.model_to_device')
+def test_move_to_device_in_pre_dispatch(mock_model_to_device, tmpdir, move_to_device_pre_dispatch_enabled):
Comment on this line (Contributor, Author):

Two things:

  • Would've liked to add a spawn test, but mocks don't seem to carry across to new processes?
  • Can I combine patch and parametrize?

Reply (Contributor):

  1. Nope, you'd have to create the mock in each process.
  2. What do you mean? Having different mocks for each parametrize? I don't think so.

Reply (Contributor):

You might be able to do so with a callback, applying the patch context manager.

Reply (Contributor):

You can make patch conditional on parametrize, with the return object (see the sketch after this diff).
"""
Test if ``call_move_to_device_hook_in_pre_dispatch`` is disabled we do not move to device till later
in training.
"""

with mock.patch(
f'pytorch_lightning.plugins.DDPPlugin.call_move_to_device_hook_in_pre_dispatch',
move_to_device_pre_dispatch_enabled
):
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir, fast_dev_run=True, accelerator='ddp', plugins=DDPPlugin(), num_processes=1
Comment on this line (Contributor):

Why do you need to pass DDPPlugin?

+        )
+        trainer.fit(model)
+
+    # Check that the mocked model_to_device was called. Since we're on CPU, model_to_device does nothing anyway.
+    if move_to_device_pre_dispatch_enabled:
+        mock_model_to_device.assert_called()
+    else:
+        mock_model_to_device.assert_not_called()
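
Regarding the review question about combining ``patch`` with ``parametrize``: below is a minimal sketch of the suggestion to make the patch conditional on the parametrized value via the patcher object returned by ``mock.patch``. The test name and body are hypothetical, not part of this PR. Note that a mock created in the parent process still does not carry over into processes started by a spawn plugin; there it would have to be created inside each child process.

```python
# Minimal sketch (hypothetical, not part of this PR): apply ``mock.patch``
# conditionally on the parametrized value via the returned patcher object,
# instead of the decorator form.
from unittest import mock

import pytest


@pytest.mark.parametrize('apply_patch', [True, False])
def test_conditional_patch(tmpdir, apply_patch):
    patcher = mock.patch('pytorch_lightning.plugins.DDPPlugin.model_to_device')
    mocked = patcher.start() if apply_patch else None
    try:
        ...  # build the Trainer and run fit() here; ``mocked`` records calls only while the patch is active
    finally:
        if apply_patch:
            patcher.stop()
```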