
[Model Parallel] Add configure sharded model hook #6679

Merged on Mar 29, 2021 (30 commits)

Changes from 23 commits
9f8864f
Add base hook for model parallel
Mar 23, 2021
eac5344
fix callback signature
kaushikb11 Mar 25, 2021
32df0cb
Simplify hook
Mar 25, 2021
282a133
Add hook logic
Mar 25, 2021
7a94e72
add tests
kaushikb11 Mar 25, 2021
8091481
add property setter
kaushikb11 Mar 25, 2021
633fc77
add logic for being called once
kaushikb11 Mar 25, 2021
c99a36f
Update changelog
kaushikb11 Mar 25, 2021
a68c8d7
Merge branch 'master' into feat/model_parallel_hook
kaushikb11 Mar 25, 2021
9529a22
Fix
kaushikb11 Mar 25, 2021
3c1c782
fix return type
kaushikb11 Mar 25, 2021
a49ec3b
fix lambda callback test
kaushikb11 Mar 25, 2021
4dd55d7
Fix tests
kaushikb11 Mar 25, 2021
caad43c
Apply code suggestions
kaushikb11 Mar 25, 2021
a2574be
add logic for setup_optimizers_predispatch
kaushikb11 Mar 25, 2021
8c2bd6a
add common dummy model
kaushikb11 Mar 25, 2021
3240569
Swap call order
Mar 25, 2021
897bdbb
Remove test that isn't needed anymore
Mar 25, 2021
626fc7b
Update tests
kaushikb11 Mar 26, 2021
e94a7ae
Add a bit more doc
Mar 26, 2021
6a38417
Merge branch 'master' into feat/model_parallel_hook
Mar 26, 2021
202ef1a
Few code review fixes
Mar 29, 2021
0709baa
Update pytorch_lightning/accelerators/accelerator.py
SeanNaren Mar 29, 2021
9152d08
Change hook name
Mar 29, 2021
fbfe65f
Fix test
Mar 29, 2021
bae858f
Test setup hook, refactor names
Mar 29, 2021
41e9c22
Swap call order of callbacks and model initialization
Mar 29, 2021
76c7376
Change name of context manager
Mar 29, 2021
2dcafd0
Merge branch 'master' into feat/model_parallel_hook
Mar 29, 2021
aa35583
add docstring
tchaton Mar 29, 2021
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -64,6 +64,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120))


- Added `on_model_parallel_setup` hook ([#6679](https://github.com/PyTorchLightning/pytorch-lightning/pull/6679))


- Added support for `precision=64`, enabling training with double precision ([#6595](https://github.com/PyTorchLightning/pytorch-lightning/pull/6595))


38 changes: 37 additions & 1 deletion pytorch_lightning/accelerators/accelerator.py
@@ -11,7 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union
import contextlib
from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Sequence, TYPE_CHECKING, Union

import torch
from torch.optim import Optimizer
@@ -439,6 +440,18 @@ def results(self) -> Any:
"""
return self.training_type_plugin.results

@contextlib.contextmanager
def model_parallel_context(self) -> Generator:
"""
Provide hook to create modules in a distributed aware context. This is useful for when we'd like to
shard the model instantly - useful for extremely large models. Can save memory and
initialization time.

Returns: Model parallel context.
"""
with self.training_type_plugin.model_parallel_context():
yield

# todo: remove in v1.5
def connect_training_type_plugin(self, plugin: TrainingTypePlugin, model: LightningModule) -> None:
"""
@@ -466,3 +479,26 @@ def connect_precision_plugin(self, plugin: PrecisionPlugin) -> None:
' It will be removed in v1.5.'
)
self.setup_precision_plugin(plugin)

@property
def call_model_parallel_setup_hook(self) -> bool:
"""
Allow model parallel hook to be called in suitable environments determined by the training type plugin.
This is useful when we want to shard the model once within fit.
Returns: True if we want to call the model parallel setup hook.
"""
return self.training_type_plugin.call_model_parallel_setup_hook

@call_model_parallel_setup_hook.setter
def call_model_parallel_setup_hook(self, mode: bool) -> None:
self.training_type_plugin.call_model_parallel_setup_hook = mode

@property
def setup_optimizers_in_pre_dispatch(self) -> bool:
"""
Override to delay setting up optimizers and schedulers until after dispatch.
This is useful when the `TrainingTypePlugin` requires operating on the wrapped accelerator model.
However, this may break certain precision plugins such as APEX, which require optimizers to be set.
Returns: If True, delay optimizer setup until pre_dispatch, else call it within setup.
"""
return self.training_type_plugin.setup_optimizers_in_pre_dispatch
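
For illustration, a minimal sketch of a plugin that opts into the deferred optimizer setup exposed by the property above. The plugin name is hypothetical and SingleDevicePlugin is used purely as a convenient base; a real sharded plugin would pair this with its own model wrapping.

import torch

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import SingleDevicePlugin


class DeferredOptimizerPlugin(SingleDevicePlugin):
    """Hypothetical plugin that delays optimizer setup until pre_dispatch."""

    @property
    def setup_optimizers_in_pre_dispatch(self) -> bool:
        # The accelerator consults this flag: returning True means optimizers and
        # schedulers are created in pre_dispatch, after the model has been wrapped,
        # instead of during setup.
        return True


trainer = Trainer(plugins=DeferredOptimizerPlugin(device=torch.device("cpu")), fast_dev_run=True)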
3 changes: 3 additions & 0 deletions pytorch_lightning/callbacks/base.py
@@ -29,6 +29,9 @@ class Callback(abc.ABC):
Subclass this class and override any of the relevant hooks
"""

def on_model_parallel_setup(self, trainer, pl_module: LightningModule) -> None:
"""Called before model parallel accelerator setup"""

Contributor:
@SeanNaren n00b q: why is this needed as a callback hook too?

Contributor:
Allows us to do this:

For FSDP:
https://github.com/PyTorchLightning/pytorch-lightning/blob/feat/fsdp/pytorch_lightning/plugins/training_type/fully_sharded.py#L128-L147

For DeepSpeed:

https://github.com/PyTorchLightning/pytorch-lightning/blob/feat/ds_update/pytorch_lightning/plugins/training_type/deepspeed.py#L284

So for those really large models that we'd like to shard instantly, this provides a context manager in which to instantiate your layers. In FSDP, you'll have to call auto_wrap/wrap yourself; in DeepSpeed this is done for you (each module is automatically sharded, whereas in FSDP we have a policy in place).

The reason it's done like this is that we do not have a distributed environment set up until the Trainer is called. Then, upon fit, we can set up the environment! Let me know if you do have ideas though!

Contributor:
But those two snippets you shared don't use on_model_parallel_setup, they use model_parallel_context.

I think the question is more along the lines of "when would you override on_model_parallel_setup?"

Contributor:
The short answer is when you'd like to shard your model instantly (to save initialization time or load a very large model).

Here is a hard case for DeepSpeed: https://github.com/SeanNaren/minGPT/blob/stage3/mingpt/model.py#L155-L161

We'll eventually write docs once the integrations are merged to explain the use cases in greater detail.
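
To make the use case described in this thread concrete, here is a hedged sketch (not code from this PR) of a LightningModule that builds its layers inside the new hook; the class name, layer sizes, and random dataset are made up for illustration.

import torch
from torch.utils.data import DataLoader, TensorDataset

from pytorch_lightning import LightningModule


class LargeSharded(LightningModule):
    """Hypothetical model whose layers are created inside ``on_model_parallel_setup``."""

    def __init__(self):
        super().__init__()
        self.net = None  # deliberately not materialized here

    def on_model_parallel_setup(self):
        # Runs inside ``training_type_plugin.model_parallel_context()``, so a sharded
        # plugin can partition these weights as they are created instead of first
        # allocating the full model in every process.
        self.net = torch.nn.Sequential(
            torch.nn.Linear(512, 4096),
            torch.nn.ReLU(),
            torch.nn.Linear(4096, 2),
        )

    def forward(self, x):
        return self.net(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.cross_entropy(self(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        dataset = TensorDataset(torch.randn(64, 512), torch.randint(0, 2, (64,)))
        return DataLoader(dataset, batch_size=8)

With a non-sharded plugin the hook simply runs at the start of each fit/validate/test/predict stage, so the same module still works on a single device.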

Contributor:
Wouldn't a name like configure_sharded_model or configure_parallel_model be more appropriate?

def on_before_accelerator_backend_setup(self, trainer, pl_module: LightningModule) -> None:
"""Called before accelerator is being setup"""
pass
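
To illustrate the callback-side hook discussed in the thread above, a small sketch of a custom Callback that merely records when the hook fired; the class name and log message are hypothetical.

import logging

from pytorch_lightning.callbacks import Callback

log = logging.getLogger(__name__)


class ShardedSetupLogger(Callback):
    """Hypothetical callback observing the new hook."""

    def on_model_parallel_setup(self, trainer, pl_module):
        # Invoked by the trainer inside the model parallel context, alongside the
        # LightningModule's own ``on_model_parallel_setup``.
        log.info("on_model_parallel_setup ran for %s", type(pl_module).__name__)


# Registered like any other callback: Trainer(callbacks=[ShardedSetupLogger()])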
3 changes: 3 additions & 0 deletions pytorch_lightning/callbacks/lambda_function.py
@@ -42,6 +42,7 @@ def __init__(
self,
on_before_accelerator_backend_setup: Optional[Callable] = None,
setup: Optional[Callable] = None,
on_model_parallel_setup: Optional[Callable] = None,
teardown: Optional[Callable] = None,
on_init_start: Optional[Callable] = None,
on_init_end: Optional[Callable] = None,
@@ -83,6 +84,8 @@ def __init__(
self.on_before_accelerator_backend_setup = on_before_accelerator_backend_setup
if setup is not None:
self.setup = setup
if on_model_parallel_setup is not None:
self.on_model_parallel_setup = on_model_parallel_setup
if teardown is not None:
self.teardown = teardown
if on_init_start is not None:
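
A quick usage sketch for the new ``on_model_parallel_setup`` argument; ``BoringModel`` is the test helper imported elsewhere in this PR, and the print is purely illustrative.

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LambdaCallback
from tests.helpers.boring_model import BoringModel

callback = LambdaCallback(
    on_model_parallel_setup=lambda trainer, pl_module: print("model parallel setup called")
)
trainer = Trainer(
    callbacks=[callback],
    max_epochs=1,
    limit_train_batches=2,
    limit_val_batches=2,
)
trainer.fit(BoringModel())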
14 changes: 14 additions & 0 deletions pytorch_lightning/core/hooks.py
@@ -310,6 +310,20 @@ def on_post_move_to_device(self):

"""

def on_model_parallel_setup(self) -> None:
"""
Hook to create modules in a distributed aware context. This is useful for when using sharded plugins,
where we'd like to shard the model instantly, which is useful for extremely large models
which can save memory and initialization time.

The accelerator manages whether to call this hook at every given stage.
For sharded plugins where model parallelism is required, the hook is usually on called once
to initialize the sharded parameters, and not called again in the same process.

By default for accelerators/plugins that do not use model sharding techniques,
this hook is called during each fit/val/test/predict stages.
"""


class DataHooks:
"""Hooks to be used for data related stuff."""
@@ -11,8 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Union
from typing import Any, Callable, Dict, Generator, Iterable, Optional, TYPE_CHECKING, Union

import torch
from torch.nn import Module
@@ -33,6 +34,7 @@ class TrainingTypePlugin(Plugin, ABC):
def __init__(self) -> None:
self._model = None
self._results = None
self._call_model_parallel_setup_hook = True

def connect(self, model: 'Module') -> None:
"""Called by the accelerator to connect the accelerator and the model with this plugin"""
@@ -192,3 +194,27 @@ def setup_optimizers_in_pre_dispatch(self) -> bool:
Returns: If True, delay setup optimizers till pre_dispatch, else call within setup.
"""
return False

@contextlib.contextmanager
def model_parallel_context(self) -> Generator:
"""
Provide a hook to create modules in a distributed-aware context. This is useful when we'd like to
shard the model instantly, e.g. for extremely large models, where it can save memory and
initialization time.

Returns: Model parallel context.
"""
yield

@property
def call_model_parallel_setup_hook(self) -> bool:
"""
Allow model parallel hook to be called in suitable environments determined by the training type plugin.
This is useful when we want to shard the model once within fit.
Returns: True if we want to call the model parallel setup hook.
"""
return self._call_model_parallel_setup_hook

@call_model_parallel_setup_hook.setter
def call_model_parallel_setup_hook(self, mode: bool) -> None:
self._call_model_parallel_setup_hook = mode
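
To show how a training type plugin is expected to hook in, a minimal sketch of a plugin that supplies its own ``model_parallel_context``; the plugin name is hypothetical, SingleDevicePlugin is only a convenient base, and a real integration would yield from its framework's partitioning context (for example DeepSpeed's ``deepspeed.zero.Init``) instead of the no-op shown here.

import contextlib
from typing import Generator

from pytorch_lightning.plugins import SingleDevicePlugin


class ToyShardedPlugin(SingleDevicePlugin):
    """Hypothetical plugin sketch for the new model parallel hooks."""

    @contextlib.contextmanager
    def model_parallel_context(self) -> Generator:
        # A real sharded plugin would enter its sharding/initialization context here so
        # that modules created in ``on_model_parallel_setup`` are partitioned on creation.
        # This stand-in leaves module creation unchanged.
        yield

The inherited ``call_model_parallel_setup_hook`` flag is flipped to ``False`` by the trainer after the first call (see ``call_model_parallel_hook`` in ``trainer.py`` below), which is what gives the call-once behaviour exercised by the tests.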
5 changes: 5 additions & 0 deletions pytorch_lightning/trainer/callback_hook.py
@@ -38,6 +38,11 @@ def on_before_accelerator_backend_setup(self, model: LightningModule) -> None:
for callback in self.callbacks:
callback.on_before_accelerator_backend_setup(self, model)

def on_model_parallel_setup(self, model: LightningModule) -> None:
"""Called at the beginning of fit (train + validate), validate, test, or predict, or tune."""
for callback in self.callbacks:
callback.on_model_parallel_setup(self, model)

def setup(self, model: LightningModule, stage: Optional[str]) -> None:
"""Called at the beginning of fit (train + validate), validate, test, or predict, or tune."""
for callback in self.callbacks:
@@ -55,6 +55,11 @@ def _setup_log():
"""Called when fit or test begins"""
return None

@staticmethod
def _on_model_parallel_setup_log():
"""Called when fit or test begins"""
return None

@staticmethod
def _teardown_log():
"""Called at the end of fit and test"""
10 changes: 10 additions & 0 deletions pytorch_lightning/trainer/trainer.py
@@ -436,6 +436,7 @@ def fit(
self.accelerator.connect(model)
self.accelerator.setup_environment()
self.call_setup_hook(model) # allow user to setup lightning_module in accelerator environment
self.call_model_parallel_hook(model) # allow user to setup in model parallel environment
self.accelerator.setup(self, model) # note: this sets up self.lightning_module

# ----------------------------
@@ -1075,6 +1076,15 @@ def call_setup_hook(self, model: LightningModule) -> None:
self.setup(model, stage=state)
model.setup(stage=state)

def call_model_parallel_hook(self, model: LightningModule) -> None:
# Call the model parallel hook if the accelerator requests it. In some cases
# we will not call the hook; for example, when the hook has already initialized the sharded model.
if self.accelerator.call_model_parallel_setup_hook:
with self.accelerator.model_parallel_context():
self.on_model_parallel_setup(model)
model.on_model_parallel_setup()
self.accelerator.call_model_parallel_setup_hook = False

def call_teardown_hook(self, model: LightningModule) -> None:
state = self._teardown_state

90 changes: 90 additions & 0 deletions tests/accelerators/test_common.py
@@ -1,9 +1,24 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch

import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import SingleDevicePlugin
from tests.accelerators.test_dp import CustomClassificationModelDP
from tests.helpers.boring_model import BoringModel
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.runif import RunIf

@@ -44,3 +59,78 @@ def test_evaluate(tmpdir, trainer_kwargs):
# make sure weights didn't change
new_weights = model.layer_0.weight.clone().detach().cpu()
torch.testing.assert_allclose(old_weights, new_weights)


def test_model_parallel_setup_called(tmpdir):

class TestModel(BoringModel):

def __init__(self):
super().__init__()
self.on_model_parallel_setup_called = False
self.layer = None

def on_model_parallel_setup(self):
Contributor (tchaton, Mar 25, 2021):
Add a check that the context manager is actually being yielded.

Contributor Author:
Thanks, it made me see an issue in the implementation. Fixing!

Contributor:
Might want to check the hook is not called within the callbacks.

Contributor Author:
I think we can assume users would be aware of the implications.

self.on_model_parallel_setup_called = True
self.layer = torch.nn.Linear(32, 2)

model = TestModel()
trainer = Trainer(
default_root_dir=tmpdir,
limit_train_batches=2,
limit_val_batches=2,
max_epochs=1,
)
trainer.fit(model)

assert model.on_model_parallel_setup_called


class DummyModel(BoringModel):

def __init__(self):
super().__init__()
self.on_model_parallel_setup_called = False

def on_model_parallel_setup(self):
self.on_model_parallel_setup_called = True


def test_model_parallel_setup_false(tmpdir):
"""Ensure ``on_model_parallel_setup`` is not called, when turned off"""

class CustomPlugin(SingleDevicePlugin):

@property
def call_model_parallel_setup_hook(self) -> bool:
return False

model = DummyModel()
trainer = Trainer(
default_root_dir=tmpdir,
limit_train_batches=2,
limit_val_batches=2,
max_epochs=1,
plugins=CustomPlugin(device=torch.device("cpu"))
)
trainer.fit(model)

assert not model.on_model_parallel_setup_called


def test_model_parallel_setup_called_once(tmpdir):
"""Ensure ``on_model_parallel_setup`` is only called once"""

model = DummyModel()
trainer = Trainer(
default_root_dir=tmpdir,
limit_train_batches=2,
limit_val_batches=2,
max_epochs=1,
)
trainer.fit(model)

assert model.on_model_parallel_setup_called
model.on_model_parallel_setup_called = False

# the plugin reports the hook as already called, so a second fit should not call it again
trainer.fit(model)

assert not model.on_model_parallel_setup_called
3 changes: 3 additions & 0 deletions tests/callbacks/test_callbacks.py
@@ -48,6 +48,7 @@ def test_trainer_callback_hook_system_fit(_, tmpdir):
call.on_init_end(trainer),
call.on_before_accelerator_backend_setup(trainer, model),
call.setup(trainer, model, 'fit'),
call.on_model_parallel_setup(trainer, model),
call.on_fit_start(trainer, model),
call.on_pretrain_routine_start(trainer, model),
call.on_pretrain_routine_end(trainer, model),
@@ -119,6 +120,7 @@ def test_trainer_callback_hook_system_test(tmpdir):
call.on_init_end(trainer),
call.on_before_accelerator_backend_setup(trainer, model),
call.setup(trainer, model, 'test'),
call.on_model_parallel_setup(trainer, model),
call.on_test_start(trainer, model),
call.on_epoch_start(trainer, model),
call.on_test_epoch_start(trainer, model),
@@ -153,6 +155,7 @@ def test_trainer_callback_hook_system_validate(tmpdir):
call.on_init_end(trainer),
call.on_before_accelerator_backend_setup(trainer, model),
call.setup(trainer, model, 'validate'),
call.on_model_parallel_setup(trainer, model),
call.on_validation_start(trainer, model),
call.on_epoch_start(trainer, model),
call.on_validation_epoch_start(trainer, model),
2 changes: 2 additions & 0 deletions tests/trainer/logging_/test_logger_connector.py
@@ -280,6 +280,7 @@ def test_call_back_validator(tmpdir):
'on_epoch_end',
'on_epoch_start',
'on_fit_end',
'on_model_parallel_setup',
'on_fit_start',
'on_init_end',
'on_init_start',
@@ -316,6 +317,7 @@ def test_call_back_validator(tmpdir):
"on_before_accelerator_backend_setup",
"on_fit_end",
"on_fit_start",
"on_model_parallel_setup",
"on_init_end",
"on_init_start",
"on_keyboard_interrupt",