Add the on_before_backward hook #7865

Merged on Jul 9, 2021 (148 commits)
Changes shown from 23 commits

Commits
7500fd0
Add callback to hook tests and add predict test
carmocca May 27, 2021
27e2dcf
Fix lambda callback test
carmocca May 27, 2021
174be4c
Simplify lambda call test
carmocca May 28, 2021
e99e711
Use LambdaCallback
carmocca May 28, 2021
c52ab79
Dynamically append to called for the model
carmocca May 28, 2021
fcfe381
Remove print
carmocca May 28, 2021
2ca39c4
Consistency
carmocca May 28, 2021
aa8eea0
Consistency
carmocca May 28, 2021
1de5fbd
Prepare args/kwargs testing
carmocca May 28, 2021
736f1c2
yapf doesn't like dict literals
carmocca May 28, 2021
020d98d
Add arguments for fit no val test
carmocca May 28, 2021
c069e2d
Add arguments for fit no val test
carmocca May 28, 2021
0be25f0
add before_backward_hook
tchaton Jun 7, 2021
2ced937
add test
tchaton Jun 7, 2021
6db056e
resolve flake8
tchaton Jun 7, 2021
23302bd
resolve tests
tchaton Jun 7, 2021
4a69d86
update changelog
tchaton Jun 7, 2021
d36a61c
add on_before_backward to LightningModule
tchaton Jun 7, 2021
50a4f6f
update on comments
tchaton Jun 8, 2021
6245149
Merge branch 'master' into tests/improve-hook-tests
carmocca Jun 10, 2021
deb67fb
Test arguments
carmocca Jun 11, 2021
4554003
Datamodule refactor
carmocca Jun 11, 2021
0163920
Merge branch 'master' into before_backward_hook
tchaton Jun 14, 2021
8c2fc2e
Merge branch 'master' into before_backward_hook
tchaton Jun 15, 2021
a3c74cb
Merge branch 'master' into before_backward_hook
tchaton Jun 15, 2021
fca960c
Merge branch 'master' into before_backward_hook
tchaton Jun 16, 2021
e331d24
Merge branch 'master' into before_backward_hook
tchaton Jun 16, 2021
7fc612f
Merge branch 'before_backward_hook' of https://github.com/PyTorchLigh…
tchaton Jun 16, 2021
6c92649
Merge branch 'master' into tests/improve-hook-tests
carmocca Jun 17, 2021
8c8e059
Fix eval test
carmocca Jun 17, 2021
f481ee1
Merge branch 'master' into before_backward_hook
tchaton Jun 17, 2021
7940318
remove extra file
tchaton Jun 17, 2021
5774d4d
resolve bug
tchaton Jun 18, 2021
a5b2fc7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2021
f46dea5
Merge branch 'master' into before_backward_hook
tchaton Jun 18, 2021
3782911
Merge branch 'before_backward_hook' of https://github.com/PyTorchLigh…
tchaton Jun 18, 2021
003d21c
move to hooks
tchaton Jun 18, 2021
6716de5
update
tchaton Jun 18, 2021
923ea72
resolve flake8
tchaton Jun 18, 2021
410f915
update on comments
tchaton Jun 18, 2021
9af5290
Merge branch 'master' into before_backward_hook
tchaton Jun 21, 2021
af39b28
Merge branch 'master' into tests/improve-hook-tests
carmocca Jun 21, 2021
6e9bcf8
Update full fit + val test
carmocca Jun 21, 2021
037100e
Update test
carmocca Jun 21, 2021
a5511f1
Remove FIXME
carmocca Jun 21, 2021
78b4062
Remove FIXME
carmocca Jun 21, 2021
fd65bb8
Undo change
carmocca Jun 21, 2021
c32e5e0
Fix
carmocca Jun 21, 2021
1c4c370
Merge branch 'tests/improve-hook-test-full-fit' into before_backward_…
carmocca Jun 21, 2021
6ec32df
Parametrize fit hook test
carmocca Jun 21, 2021
08ac721
Merge branch 'master' into before_backward_hook
carmocca Jun 21, 2021
a59000a
Comment
carmocca Jun 21, 2021
a5f2e6b
Parametrize fit hook test with different precision plugins
carmocca Jun 21, 2021
0ce2295
Fix tests
carmocca Jun 22, 2021
c22fc74
Parametrize fit hook test with manual optimization
carmocca Jun 22, 2021
4b1534b
Unnecessary parenthesis
carmocca Jun 22, 2021
f5828a8
WIP
carmocca Jun 22, 2021
c7f3865
Merge branch 'tests/parametrize-hooks-precision-plugins' into before_…
carmocca Jun 22, 2021
72d5ee3
Comments
carmocca Jun 22, 2021
f34ee7e
Fix message
carmocca Jun 22, 2021
39c4a85
Test CI error
carmocca Jun 22, 2021
c3b458d
Revert "Test CI error"
carmocca Jun 22, 2021
c700cab
Add ddp training type teardown
carmocca Jun 22, 2021
e5602c9
Update CHANGELOG
carmocca Jun 22, 2021
52b2256
Adrian's fix
carmocca Jun 22, 2021
0b94b6c
Use destructor
carmocca Jun 23, 2021
aaf32ab
Update CHANGELOG.md
carmocca Jun 23, 2021
0444d54
RPC destructor
carmocca Jun 23, 2021
5d4f811
Update pytorch_lightning/plugins/training_type/ddp.py
carmocca Jun 23, 2021
bf8766d
Why do you not work :(
carmocca Jun 23, 2021
48bcb7e
Missing condition
carmocca Jun 23, 2021
5d6fa39
Merge branch 'master' into bug/teardown-ddp-process-group
carmocca Jun 23, 2021
21ad2d8
Fix deepspeed test
carmocca Jun 24, 2021
bbc489e
GC collect in conftest
carmocca Jun 24, 2021
5b06fd2
Do not show warnings for special tests
carmocca Jun 24, 2021
5e69ed8
Needs to run on 1.8
carmocca Jun 24, 2021
1e0cf40
Merge branch 'master' into tests/parametrize-hooks-precision-plugins
awaelchli Jun 24, 2021
aed51a2
Run torch 1.8
carmocca Jun 24, 2021
e0a3e87
Skip test due to 'Python bus error'
carmocca Jun 24, 2021
9ee2d19
Debug NCCL
carmocca Jun 24, 2021
3588aaa
shm size
carmocca Jun 24, 2021
067bf1a
Disable warnings for special tests
carmocca Jun 24, 2021
6060b05
Remove NCCL_DEBUG statement
carmocca Jun 24, 2021
f0fa1b7
Try smaller shm size
carmocca Jun 24, 2021
6dd7038
Revert "Skip test due to 'Python bus error'"
carmocca Jun 24, 2021
53082bf
Merge branch 'ci/gpu-tests-torch-1.8' into bug/teardown-ddp-process-g…
carmocca Jun 24, 2021
73e62f8
README and adjust versions
carmocca Jun 24, 2021
902ef02
Avoid self.on_gpu call
carmocca Jun 24, 2021
4ce0f9a
empty cache cleanup
carmocca Jun 24, 2021
990b2e9
Merge branch 'master' into bug/teardown-ddp-process-group
carmocca Jun 24, 2021
738daa5
More garbage collection
carmocca Jun 24, 2021
236aa97
Unroll parametrizations
awaelchli Jun 24, 2021
ffa532d
Do not reuse mock
carmocca Jun 24, 2021
5aa3790
Merge branch 'master' into tests/parametrize-hooks-precision-plugins
carmocca Jun 24, 2021
78baa5f
Merge branch 'bug/teardown-ddp-process-group' into tests/parametrize-…
carmocca Jun 24, 2021
e190089
Undo changes
carmocca Jun 24, 2021
261a166
Undo notebooks modification
carmocca Jun 24, 2021
91edac0
Merge branch 'master' into before_backward_hook
tchaton Jun 25, 2021
3cec91c
Merge branch 'before_backward_hook' of https://github.com/PyTorchLigh…
tchaton Jun 25, 2021
031917e
resolve test
tchaton Jun 25, 2021
cf1aa34
Merge branch 'master' into before_backward_hook
tchaton Jun 28, 2021
30c57e3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 28, 2021
9f89e57
update
tchaton Jun 28, 2021
3eccc98
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 28, 2021
eb381f8
delete file
tchaton Jun 28, 2021
acec7b0
Merge branch 'master' into tests/parametrize-hooks-precision-plugins
carmocca Jul 3, 2021
33a68d4
Undo
carmocca Jul 3, 2021
ac006c7
Fix test
carmocca Jul 3, 2021
9efe252
Revert "WIP"
carmocca Jul 3, 2021
beecfb9
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 3, 2021
01dfc7c
Merge branch 'tests/parametrize-hooks-precision-plugins' into tests/p…
carmocca Jul 3, 2021
dc7b17b
Rename
carmocca Jul 3, 2021
8e1f60e
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 5, 2021
9c3fbd4
Remove optimizers
carmocca Jul 5, 2021
f90348c
Fix bug with LightningOptimizer
carmocca Jul 5, 2021
cbf7b36
Add optimizers
carmocca Jul 5, 2021
bdc3819
Merge branch 'master' into before_backward_hook
tchaton Jul 6, 2021
7e68389
update
tchaton Jul 6, 2021
ec198c5
update
tchaton Jul 6, 2021
d1a48a6
Update CHANGELOG
carmocca Jul 6, 2021
1869128
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 6, 2021
fe06ec0
On after backward refactor
carmocca Jul 6, 2021
938de4d
Do not call super
carmocca Jul 6, 2021
20da3b1
Fixes
carmocca Jul 6, 2021
abfbdd6
Remove should_accumulate
carmocca Jul 7, 2021
9c8993c
pre/post backward refactor
carmocca Jul 7, 2021
d7d2a71
Call the LM backward hook
carmocca Jul 7, 2021
f3c3726
Update tests
carmocca Jul 7, 2021
7cfed58
Remove dev debug patch
carmocca Jul 7, 2021
7838eae
Fix test
carmocca Jul 7, 2021
c070e84
Remove optimizer arguments and typing
carmocca Jul 7, 2021
5fabca8
Docs fixes
carmocca Jul 7, 2021
cf89192
Fix comment
carmocca Jul 7, 2021
6d77d72
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 7, 2021
f88cc51
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 7, 2021
d749a85
Undo changes
carmocca Jul 7, 2021
d1c342b
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 8, 2021
816cb4c
Merge branch 'master' into tests/parametrize-hooks-manual-opt
carmocca Jul 8, 2021
e2ea758
Split manual and auto
carmocca Jul 8, 2021
160c2b4
Undo change
carmocca Jul 8, 2021
cbc78db
Deepsource
carmocca Jul 8, 2021
6aa229c
Remove optimizers
carmocca Jul 8, 2021
b273fdd
Merge branch 'master' into before_backward_hook
carmocca Jul 8, 2021
b0f25b5
Merge branch 'tests/parametrize-hooks-manual-opt' into before_backwar…
carmocca Jul 8, 2021
b531ef1
Undo changes
carmocca Jul 8, 2021
3ce386f
Call the hook
carmocca Jul 8, 2021
44fcdb2
Docs
carmocca Jul 8, 2021
b47470b
Docs
carmocca Jul 8, 2021

Files changed

3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -195,6 +195,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `on_load_checkpoint` and `on_save_checkpoint` hooks to the `PrecisionPlugin` base class ([#7831](https://github.com/PyTorchLightning/pytorch-lightning/pull/7831))


- Added `on_before_backward` hook ([#7865](https://github.com/PyTorchLightning/pytorch-lightning/pull/7865))


- `LightningCLI` now aborts with a clearer message if config already exists and disables save config during `fast_dev_run`([#7963](https://github.com/PyTorchLightning/pytorch-lightning/pull/7963))


7 changes: 7 additions & 0 deletions docs/source/common/lightning_module.rst
@@ -1160,6 +1160,7 @@ for more information.
on_before_zero_grad()
optimizer_zero_grad()

on_before_backward()
backward()
on_after_backward()

@@ -1215,6 +1216,12 @@ get_progress_bar_dict
.. automethod:: pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict
:noindex:

on_before_backward
~~~~~~~~~~~~~~~~~~

.. automethod:: pytorch_lightning.core.hooks.ModelHooks.on_before_backward
:noindex:

on_after_backward
~~~~~~~~~~~~~~~~~

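To make the hook ordering shown in the pseudo-code above concrete, here is a minimal, self-contained sketch (the model and its names are illustrative, not part of this PR) that prints the backward-related hooks in the order the training loop calls them:

import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningModule, Trainer


class HookOrderModel(LightningModule):
    """Toy module that prints the backward-related hooks in call order."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        (x,) = batch
        return self.layer(x).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        return DataLoader(TensorDataset(torch.randn(8, 32)), batch_size=4)

    def on_before_zero_grad(self, optimizer):
        print("on_before_zero_grad")

    def on_before_backward(self, loss):
        print(f"on_before_backward, loss={loss.item():.4f}")

    def on_after_backward(self):
        print("on_after_backward")


if __name__ == "__main__":
    Trainer(fast_dev_run=True).fit(HookOrderModel())

Running this with fast_dev_run=True prints one on_before_backward/on_after_backward pair per training batch.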
6 changes: 6 additions & 0 deletions docs/source/extensions/callbacks.rst
@@ -351,6 +351,12 @@ on_load_checkpoint
.. automethod:: pytorch_lightning.callbacks.Callback.on_load_checkpoint
:noindex:

on_before_backward
^^^^^^^^^^^^^^^^^^

.. automethod:: pytorch_lightning.callbacks.Callback.on_before_backward
:noindex:

on_after_backward
^^^^^^^^^^^^^^^^^

5 changes: 5 additions & 0 deletions pytorch_lightning/callbacks/base.py
@@ -19,6 +19,7 @@
import abc
from typing import Any, Dict, List, Optional

import torch
from torch.optim import Optimizer

import pytorch_lightning as pl
@@ -296,6 +297,10 @@ def on_load_checkpoint(
"""
pass

def on_before_backward(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule', loss: torch.Tensor) -> None:
"""Called before ``loss.backward()``."""
pass

def on_after_backward(self, trainer: 'pl.Trainer', pl_module: 'pl.LightningModule') -> None:
"""Called after ``loss.backward()`` and before optimizers do anything."""
pass
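As a usage sketch for the new callback hook (not part of the diff; the class name and checks are illustrative), a callback can validate the loss it receives while it is still attached to the autograd graph:

import torch
from pytorch_lightning import Callback, Trainer


class LossInspector(Callback):
    """Sanity-check the loss right before ``loss.backward()`` runs."""

    def on_before_backward(self, trainer, pl_module, loss):
        # the loss is still attached to the graph at this point
        assert isinstance(loss, torch.Tensor)
        assert loss.grad_fn is not None
        if not torch.isfinite(loss):
            raise RuntimeError(f"Non-finite loss at global step {trainer.global_step}: {loss}")


# attach it like any other callback
trainer = Trainer(fast_dev_run=True, callbacks=[LossInspector()])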
1 change: 1 addition & 0 deletions pytorch_lightning/callbacks/lambda_function.py
@@ -77,6 +77,7 @@ def __init__(
on_keyboard_interrupt: Optional[Callable] = None,
on_save_checkpoint: Optional[Callable] = None,
on_load_checkpoint: Optional[Callable] = None,
on_before_backward: Optional[Callable] = None,
on_after_backward: Optional[Callable] = None,
on_before_zero_grad: Optional[Callable] = None,
on_predict_start: Optional[Callable] = None,
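With the new argument, the hook can also be attached ad hoc through LambdaCallback; a small sketch (assuming the rest of the Trainer setup is in place):

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LambdaCallback

# print the (possibly scaled) loss right before every backward pass
print_loss = LambdaCallback(
    on_before_backward=lambda trainer, pl_module, loss: print(f"loss before backward: {loss.item():.4f}")
)
trainer = Trainer(fast_dev_run=True, callbacks=[print_loss])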
17 changes: 17 additions & 0 deletions pytorch_lightning/core/hooks.py
@@ -295,6 +295,23 @@ def on_before_zero_grad(self, optimizer: Optimizer) -> None:
optimizer: The optimizer for which grads should be zeroed.
"""

def on_before_backward(self, loss: torch.Tensor) -> None:
"""
Called before ``loss.backward()``. Override this hook if you need to inspect or react to the loss
right before the backward pass runs.

Args:
loss: The loss about to be back-propagated, already divided by the number of batches
accumulated for gradient accumulation and possibly scaled by the precision plugin (e.g. mixed precision).

Example::

def on_before_backward(self, loss):
print(f"Current loss: {loss}")
"""
pass

def on_after_backward(self) -> None:
"""
Called in the training loop after loss.backward() and before optimizers do anything.
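Because the loss handed to this hook is already divided by ``accumulate_grad_batches`` (and possibly scaled under mixed precision), a typical override inspects or rescales the value before the backward pass. A sketch with an illustrative model name, ignoring any precision scaling:

import pytorch_lightning as pl


class MyModel(pl.LightningModule):
    # ... training_step / configure_optimizers as usual ...

    def on_before_backward(self, loss):
        # undo only the gradient-accumulation division; precision scaling is left untouched
        undivided = loss * self.trainer.accumulate_grad_batches
        self.print(f"undivided loss: {undivided.item():.4f}")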
3 changes: 3 additions & 0 deletions pytorch_lightning/plugins/precision/apex_amp.py
@@ -74,6 +74,9 @@ def backward(
# enter apex context
closure_loss = scaled_loss.__enter__()

# call the ``on_before_backward`` hook with the Apex-scaled loss
model.trainer.call_hook("on_before_backward", closure_loss)

# do backward pass
# TODO: not entirely sure, why we need this
if model is not None and isinstance(model, LightningModule):
4 changes: 4 additions & 0 deletions pytorch_lightning/plugins/precision/deepspeed_precision.py
@@ -69,6 +69,10 @@ def backward(
)
# todo: hack around for deepspeed engine to call backward
deepspeed_engine = model.trainer.model

# call the ``on_before_backward`` hook before handing the loss to the DeepSpeed engine
model.trainer.call_hook("on_before_backward", closure_loss)

deepspeed_engine.backward(closure_loss, *args, **kwargs)
# once backward has been applied, release graph
closure_loss = closure_loss.detach()
2 changes: 2 additions & 0 deletions pytorch_lightning/plugins/precision/ipu_precision.py
@@ -40,6 +40,8 @@
**kwargs: Any,
) -> Tensor:
# IPU internally manages bwd step.
# still call the ``on_before_backward`` hook, even though the backward step itself runs inside the IPU
model.trainer.call_hook("on_before_backward", closure_loss)
return closure_loss

def clip_gradients(
4 changes: 4 additions & 0 deletions pytorch_lightning/plugins/precision/native_amp.py
@@ -59,6 +59,10 @@ def backward(
"""
closure_loss = self.scaler.scale(closure_loss)

# call the ``on_before_backward`` hook with the loss already scaled by the GradScaler
model.trainer.call_hook("on_before_backward", closure_loss)

# the base class performs the actual backward pass
closure_loss = super().backward(model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs)

# unscale gradient to allow analyze within `on_after_backward`
2 changes: 2 additions & 0 deletions pytorch_lightning/plugins/precision/precision_plugin.py
@@ -73,6 +73,8 @@
"""
automatic_optimization = model.automatic_optimization

model.trainer.call_hook("on_before_backward", closure_loss)

# do backward pass
if automatic_optimization:
model.backward(closure_loss, optimizer, opt_idx)
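Conceptually, every precision plugin's ``backward`` now follows the same pattern; the simplified sketch below (not the actual plugin code) shows the loss being surfaced to the new hook before the backward pass runs:

def backward(self, model, closure_loss, optimizer, opt_idx, *args, **kwargs):
    # let the LightningModule and all Callbacks see the (scaled) loss first
    model.trainer.call_hook("on_before_backward", closure_loss)

    # then perform the actual backward pass
    if model.automatic_optimization:
        model.backward(closure_loss, optimizer, opt_idx)
    else:
        closure_loss.backward(*args, **kwargs)

    return closure_loss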
9 changes: 9 additions & 0 deletions pytorch_lightning/trainer/callback_hook.py
@@ -17,6 +17,8 @@
from inspect import signature
from typing import Any, Callable, Dict, List, Optional, Type

import torch

from pytorch_lightning.callbacks import Callback
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.utilities import rank_zero_deprecation, rank_zero_warn
@@ -313,6 +315,13 @@ def on_load_checkpoint(self, checkpoint):
else:
callback.on_load_checkpoint(self, self.lightning_module, state)

def on_before_backward(self, loss: torch.Tensor) -> None:
"""
Called before ``loss.backward()``.
"""
for callback in self.callbacks:
callback.on_before_backward(self, self.lightning_module, loss)

def on_after_backward(self):
"""
Called after loss.backward() and before optimizers do anything.
@@ -21,6 +21,7 @@ class FxValidator:
functions: Dict[str, Optional[Dict[str, Tuple[bool]]]] = dict(
on_before_accelerator_backend_setup=None,
on_configure_sharded_model=None,
on_before_backward=dict(on_step=(False, True), on_epoch=(False, True)),
on_after_backward=dict(on_step=(False, True), on_epoch=(False, True)),
on_before_zero_grad=dict(on_step=(False, True), on_epoch=(False, True)),
on_init_start=None,
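The validator entry above means ``self.log`` is allowed inside ``on_before_backward`` with either flag value; a sketch (model name illustrative):

import pytorch_lightning as pl


class LitModel(pl.LightningModule):
    # ... usual training_step / configure_optimizers ...

    def on_before_backward(self, loss):
        # both flag values are permitted for this hook, per the entry above
        self.log("loss_before_backward", loss, on_step=True, on_epoch=True)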
3 changes: 3 additions & 0 deletions tests/callbacks/test_callbacks.py
@@ -68,18 +68,21 @@ def test_trainer_callback_hook_system_fit(_, tmpdir):
call.on_batch_start(trainer, model),
call.on_train_batch_start(trainer, model, ANY, 0, 0),
call.on_before_zero_grad(trainer, model, trainer.optimizers[0]),
call.on_before_backward(trainer, model, ANY),
call.on_after_backward(trainer, model),
call.on_train_batch_end(trainer, model, ANY, ANY, 0, 0),
call.on_batch_end(trainer, model),
call.on_batch_start(trainer, model),
call.on_train_batch_start(trainer, model, ANY, 1, 0),
call.on_before_zero_grad(trainer, model, trainer.optimizers[0]),
call.on_before_backward(trainer, model, ANY),
call.on_after_backward(trainer, model),
call.on_train_batch_end(trainer, model, ANY, ANY, 1, 0),
call.on_batch_end(trainer, model),
call.on_batch_start(trainer, model),
call.on_train_batch_start(trainer, model, ANY, 2, 0),
call.on_before_zero_grad(trainer, model, trainer.optimizers[0]),
call.on_before_backward(trainer, model, ANY),
call.on_after_backward(trainer, model),
call.on_train_batch_end(trainer, model, ANY, ANY, 2, 0),
call.on_batch_end(trainer, model),
4 changes: 4 additions & 0 deletions tests/helpers/datamodules.py
@@ -24,6 +24,10 @@
if _SKLEARN_AVAILABLE:
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
else:
make_classification = None
make_regression = None
train_test_split = None


class MNISTDataModule(LightningDataModule):
1 change: 1 addition & 0 deletions tests/models/test_hooks.py
@@ -294,6 +294,7 @@ def _train_batch():
'training_step_end',
'on_before_zero_grad',
'optimizer_zero_grad',
'on_before_backward',
'backward',
'on_after_backward',
'optimizer_step',
25 changes: 25 additions & 0 deletions tests/plugins/test_amp_plugins.py
@@ -19,6 +19,7 @@
import torch

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin
from pytorch_lightning.plugins.precision import MixedPrecisionPlugin
from tests.helpers import BoringModel
@@ -69,18 +70,34 @@ def test_amp_apex_ddp(
assert isinstance(trainer.precision_plugin, plugin_cls)


class CheckOnBeforeBackward(Callback):

def __init__(self):
self.on_before_backward_called = False

def on_before_backward(self, trainer, pl_module, loss):
assert isinstance(loss, torch.Tensor)
assert loss.grad_fn is not None
self.on_before_backward_called = True


class GradientUnscaleBoringModel(BoringModel):

def on_after_backward(self):
norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2)
if not (torch.isinf(norm) or torch.isnan(norm)):
assert norm.item() < 15.

cb = [cb for cb in self.trainer.callbacks if isinstance(cb, CheckOnBeforeBackward)]
assert len(cb) == 1
assert cb[0].on_before_backward_called


@RunIf(min_gpus=2, amp_native=True)
@pytest.mark.parametrize('accum', [1, 2])
def test_amp_gradient_unscale(tmpdir, accum: int):
model = GradientUnscaleBoringModel()
cb = CheckOnBeforeBackward()

trainer = Trainer(
max_epochs=2,
@@ -95,6 +112,7 @@ def test_amp_gradient_unscale(tmpdir, accum: int):
track_grad_norm=2,
log_every_n_steps=1,
accumulate_grad_batches=accum,
callbacks=[cb]
)
trainer.fit(model)

@@ -143,6 +161,7 @@ def configure_optimizers(self):
@RunIf(min_gpus=2, amp_apex=True, special=True)
@pytest.mark.parametrize("amp_level", ['O2'])
def test_amp_apex_ddp_fit(amp_level, tmpdir):
cb = CheckOnBeforeBackward()

class CustomBoringModel(BoringModel):

@@ -151,6 +170,11 @@ def training_step(self, batch, batch_idx):
assert self.trainer.precision_plugin._connected
return super().training_step(batch, batch_idx)

def on_after_backward(self):
cb = [cb for cb in self.trainer.callbacks if isinstance(cb, CheckOnBeforeBackward)]
assert len(cb) == 1
assert cb[0].on_before_backward_called

trainer = Trainer(
default_root_dir=tmpdir,
fast_dev_run=True,
@@ -159,6 +183,7 @@ def training_step(self, batch, batch_idx):
gpus=2,
accelerator='ddp',
plugins=ApexMixedPrecisionPlugin(amp_level=amp_level),
callbacks=[cb]
)
assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)
model = CustomBoringModel()
16 changes: 15 additions & 1 deletion tests/plugins/test_deepspeed_plugin.py
@@ -649,26 +649,40 @@ def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_opt

class VerificationCallback(Callback):

def __init__(self):
self.on_train_batch_start_called = False
self.on_before_backward_called = False

def on_train_batch_start(
self, trainer, pl_module: LightningModule, batch: Any, batch_idx: int, dataloader_idx: int
) -> None:
deepspeed_engine = trainer.training_type_plugin.model
assert trainer.global_step == deepspeed_engine.global_steps
self.on_train_batch_start_called = True

def on_before_backward(self, trainer, pl_module, loss):
assert isinstance(loss, torch.Tensor)
assert loss.grad_fn is not None
self.on_before_backward_called = True

model = ModelParallelClassificationModel()
dm = ClassifDataModule()
verification_callback = VerificationCallback()
trainer = Trainer(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=5,
plugins=[DeepSpeedPlugin(stage=2, offload_optimizer=offload_optimizer)],
gpus=2,
accelerator="ddp",
limit_val_batches=2,
precision=16,
accumulate_grad_batches=2,
callbacks=[VerificationCallback()]
callbacks=[verification_callback]
)
trainer.fit(model, datamodule=dm)
assert verification_callback.on_train_batch_start_called
assert verification_callback.on_before_backward_called


@RunIf(min_gpus=2, deepspeed=True, special=True)
1 change: 1 addition & 0 deletions tests/trainer/logging_/test_logger_connector.py
@@ -33,6 +33,7 @@ def test_fx_validator(tmpdir):
funcs_name = sorted([f for f in dir(Callback) if not f.startswith('_')])

callbacks_func = [
'on_before_backward',
'on_after_backward',
'on_batch_end',
'on_batch_start',