From 7e08b0d710208e980f27896aa62a59f26f0cb3b3 Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Tue, 10 Nov 2020 19:44:51 +0000
Subject: [PATCH 01/11] [bug-fix] DDP and automatic_optimization=False (#4485)

* resolve bug

* add self._running_manual_optim

* update

* update tests

* update lightning module

* resolve bug

* update tests

* update

* resolve pep8

* update

* replace by `ddp_spawn`

* temporary fix

* update

* update

* move update to training_loop

* make both ddp_spawn

* introduce `manual_optimizer_step`

* update changelog

* added changelog wrong place

* add force_optimizer_step

* update docstring for tests

* update optimizer_step

* update zero_grad

* resolve flake8

* move update into manual_optimizer_step

* add zero_grad

* remove zero_grad tests

* remove manual_backward in AMP, it doesn't help

* update

* loosen tests

* update

* update doc

* add TODO

* Removed unnecessary get model from native amp

* Remove try except with pytest raise

* Add seed, clean up imports, remove try catch to reproduce error

* update code

* update test

* revert back

* formatting

* Update pytorch_lightning/core/lightning.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

Co-authored-by: SeanNaren <sean@grid.ai>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 .gitignore                                    |   1 +
 CHANGELOG.md                                  |   3 +
 docs/source/lightning_module.rst              |   6 +
 docs/source/optimizers.rst                    |   7 +-
 pytorch_lightning/accelerators/accelerator.py |   9 +-
 pytorch_lightning/core/lightning.py           |  50 +++-
 pytorch_lightning/trainer/training_loop.py    |  28 +-
 .../optimization/test_manual_optimization.py  | 274 +++++++++++++++++-
 .../warnings_tests/test_flow_warnings.py      |  11 +-
 9 files changed, 366 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index fff549a7187945..946d5f0f4c2ca3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,7 @@ timit_data/
 .Python
 ide_layouts/
 build/
+_build/
 develop-eggs/
 dist/
 downloads/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bb286e82759c79..16daa24aa2ed9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775))
 
 
+- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
+
+
 - Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
 
 
diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst
index 11641fc35e8a05..c26e0fc0351d16 100644
--- a/docs/source/lightning_module.rst
+++ b/docs/source/lightning_module.rst
@@ -1009,6 +1009,12 @@ manual_backward
 .. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward
     :noindex:
 
+manual_optimizer_step
+~~~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_optimizer_step
+    :noindex:
+
 on_after_backward
 ~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst
index 1e7baadb64480c..7f1bcc97662b48 100644
--- a/docs/source/optimizers.rst
+++ b/docs/source/optimizers.rst
@@ -36,8 +36,8 @@ to manually manage the optimization process. To do so, do the following:
 
         # use self.backward which will also handle scaling the loss when using amp
         self.manual_backward(loss_a, opt_g)
-        opt_g.step()
-        opt_g.zero_grad()
+        self.manual_optimizer_step(opt_g)
+
 
         # do anything you want
         loss_b = ...
@@ -45,8 +45,7 @@ to manually manage the optimization process. To do so, do the following:
         # pass in any args that loss.backward() normally takes
         self.manual_backward(loss_b, opt_d, retain_graph=True)
         self.manual_backward(loss_b, opt_d)
-        opt_d.step()
-        opt_d.zero_grad()
+        self.manual_optimizer_step(opt_d)
 
         # log losses
         self.log('loss_a', loss_a)
diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index dc0b0bf63a98d6..3b762e08ed5e6d 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -109,10 +109,11 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
     def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):
         model_ref = self.trainer.get_model()
         is_lbfgs = isinstance(optimizer, torch.optim.LBFGS)
-        native_amp = self.trainer.amp_backend == AMPType.NATIVE
+        using_native_amp = self.trainer.amp_backend == AMPType.NATIVE
+        automatic_optimization = self.trainer.train_loop.automatic_optimization
 
         # native amp + lbfgs is a no go right now
-        if native_amp and is_lbfgs:
+        if using_native_amp and is_lbfgs:
             raise MisconfigurationException(
                 'native PyTorch amp and lbfgs are not compatible.'
                 ' To request, please file a Github issue in PyTorch and tag @mcarilli')
@@ -125,12 +126,12 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):
             optimizer_idx=opt_idx,
             optimizer_closure=lambda_closure,
             on_tpu=False,  # TPUAccelerator class sets this as True
-            using_native_amp=native_amp,
+            using_native_amp=using_native_amp,
             using_lbfgs=is_lbfgs
         )
 
         # scale when native amp
-        if native_amp:
+        if automatic_optimization and using_native_amp:
             self.trainer.scaler.update()
 
     def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 3d38f65892983d..a332c0dcaa99a1 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -111,6 +111,7 @@ def __init__(self, *args, **kwargs):
         self._datamodule = None
         self._results: Optional[Result] = None
         self._current_fx_name = ''
+        self._running_manual_backward = False
         self._current_hook_fx_name = None
         self._current_dataloader_idx = None
 
@@ -1085,6 +1086,9 @@ def manual_backward(self, loss: Tensor, optimizer: Optimizer, *args, **kwargs) -
 
         .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set
 
+        .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set
+            and you use `model.manual_optimizer_step(optimizer)`
+
         Example::
 
             def training_step(...):
@@ -1092,12 +1096,55 @@ def training_step(...):
                 loss = ...
                 # automatically applies scaling, etc...
                 self.manual_backward(loss, opt_a)
+                self.manual_optimizer_step(opt_a)
         """
         # make sure we're using manual opt
         self._verify_is_manual_optimization('manual_backward')
 
         # backward
+        self._running_manual_backward = True
         self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs)
+        self._running_manual_backward = False
+
+    def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None:
+        """
+        Call this directly from your training_step when doing optimizations manually.
+        By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you
+
+        .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set.
+
+        Args:
+            optimizer: Optimizer used to perform `.step()` call
+
+            force_optimizer_step: Whether to force an optimizer step. Could be useful when having 2 optimizers
+                and one should use accumulated gradients but not the other one.
+                One could put its own logic to force an optimizer step.
+
+        Example::
+
+            def training_step(...):
+                (opt_a, opt_b) = self.optimizers()
+                loss = ...
+                # automatically applies scaling, etc...
+                self.manual_backward(loss, opt_a)
+                # This will force an opt.step() even if accumulate_grad_batches is set.
+                self.manual_optimizer_step(opt_a, force_optimizer_step=True)
+
+        """
+        # make sure we're using manual opt
+        self._verify_is_manual_optimization('manual_optimizer_step')
+
+        if not self.trainer.train_loop.should_accumulate() or force_optimizer_step:
+
+            # mock closure function as the user is responsible to call `manual_backward`
+            def mock_optimizer_closure():
+                return
+
+            self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure)
+
+            # update will be called after every optimizer_step call
+            if self.trainer.amp_backend == AMPType.NATIVE:
+                self.trainer.scaler.update()
 
     def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None:
         """
@@ -1118,7 +1165,8 @@ def backward(self, loss, optimizer, optimizer_idx):
                 loss.backward()
 
         """
-        loss.backward(*args, **kwargs)
+        if self.trainer.train_loop.automatic_optimization or self._running_manual_backward:
+            loss.backward(*args, **kwargs)
 
     def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int):
         """
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 2f66f5b1a600ec..1cf06c3709e7ec 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -306,6 +306,12 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss):
         # when in dev debugging track the losses
         self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach())
 
+    def _check_training_step_output(self, training_step_output):
+        if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization:
+            if training_step_output.grad_fn is None:
+                # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ...
+                raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor")
+
     def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
         # give the PL module a result for logging
         model_ref = self.trainer.get_model()
@@ -318,6 +324,8 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
             training_step_output = self.trainer.accelerator_backend.training_step(args)
             self.trainer.logger_connector.cache_logged_metrics()
 
+            self._check_training_step_output(training_step_output)
+
             training_step_output = self.trainer.call_hook("training_step_end", training_step_output)
 
             training_step_output_for_epoch_end, training_step_output = self._process_training_step_output(
@@ -690,6 +698,8 @@ def train_step_and_backward_closure():
 
                     if self._curr_step_result is None:
                         # user decided to skip optimization
+                        # make sure to zero grad.
+                        self.zero_grad_handler(batch_idx, optimizer, opt_idx)
                         continue
 
                     batch_outputs = self._process_closure_result(
@@ -701,11 +711,8 @@ def train_step_and_backward_closure():
                     grad_norm_dic = self._cur_grad_norm_dict
                     self._cur_grad_norm_dict = None
 
-                    # hook
-                    self.on_before_zero_grad(optimizer)
-
-                    # clear gradients
-                    self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)
+                    # hook + clear gradients
+                    self.zero_grad_handler(batch_idx, optimizer, opt_idx)
 
                     # update running loss + reset accumulated loss
                     self.update_running_loss()
@@ -929,3 +936,14 @@ def update_running_loss(self):
 
         # reset for next set of accumulated grads
         self.accumulated_loss.reset()
+
+    def zero_grad_handler(self, batch_idx, optimizer, opt_idx):
+        if self.automatic_optimization:
+            # hook
+            self.on_before_zero_grad(optimizer)
+            optimizers = enumerate([optimizer])
+        else:
+            optimizers = self.get_optimizers_iterable()
+
+        for idx, optimizer in optimizers:
+            self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)
diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py
index 5f279c0b0a4db9..d816c1e9bc5b15 100644
--- a/tests/trainer/optimization/test_manual_optimization.py
+++ b/tests/trainer/optimization/test_manual_optimization.py
@@ -11,13 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import collections
 import os
-import torch
+
 import pytest
-from tests.base.boring_model import BoringModel, RandomDataset
-from pytorch_lightning import Trainer
+import torch
+
+from pytorch_lightning import Trainer, seed_everything
 from pytorch_lightning.utilities import APEX_AVAILABLE
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from tests.base.boring_model import BoringModel
 
 
 def test_multiple_optimizers_manual(tmpdir):
@@ -355,3 +357,267 @@ def configure_optimizers(self):
 
     num_manual_backward_calls = 3
     assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls
+
+
+class ManualOptimizationExtendedModel(BoringModel):
+
+    count = 0
+    called = collections.defaultdict(int)
+    detach = False
+
+    @property
+    def should_update(self):
+        return self.count % 2 == 0
+
+    def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
+        self.called["on_train_batch_start"] += 1
+        self.weight_before = self.layer.weight.clone()
+
+    def training_step(self, batch, batch_idx):
+        self.called["training_step"] += 1
+        opt = self.optimizers()
+        output = self.layer(batch)
+
+        loss = self.loss(batch, output)
+        loss /= loss.clone().detach()
+        loss *= 0.1
+
+        if self.should_update:
+
+            self.manual_backward(loss, opt)
+            self.manual_optimizer_step(opt)
+
+        return loss.detach() if self.detach else loss
+
+    def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
+        self.called["on_train_batch_end"] += 1
+        after_before = self.layer.weight.clone()
+        if self.should_update:
+            try:
+                assert not torch.equal(self.weight_before, after_before), self.count
+            except Exception:
+                # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4"
+                pass
+        else:
+            try:
+                assert torch.equal(self.weight_before, after_before)
+            except Exception:
+                # almost no diff between before and after
+                assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6
+        assert torch.all(self.layer.weight.grad == 0)
+        self.count += 1
+
+    def on_train_end(self):
+        assert self.called["training_step"] == 10
+        assert self.called["on_train_batch_start"] == 10
+        assert self.called["on_train_batch_end"] == 10
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_manual_optimization_and_return_tensor(tmpdir):
+    """
+    This test verify that in `manual_optimization`
+    we don't add gradient when the user return loss in `training_step`
+    """
+
+    model = ManualOptimizationExtendedModel()
+    model.training_step_end = None
+    model.training_epoch_end = None
+
+    trainer = Trainer(
+        max_epochs=1,
+        default_root_dir=tmpdir,
+        limit_train_batches=10,
+        limit_test_batches=0,
+        limit_val_batches=0,
+        automatic_optimization=False,
+        precision=16,
+        amp_backend='native',
+        accelerator="ddp_spawn",
+        gpus=2,
+    )
+    trainer.fit(model)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_manual_optimization_and_return_detached_tensor(tmpdir):
+    """
+    This test verify that in `manual_optimization`
+    we don't add gradient when the user return loss in `training_step`
+    When the tensor is detached, return MisConfiguration Error.
+    """
+
+    model = ManualOptimizationExtendedModel()
+    model.detach = True
+    model.training_step_end = None
+    model.training_epoch_end = None
+
+    trainer = Trainer(
+        max_epochs=1,
+        default_root_dir=tmpdir,
+        limit_train_batches=10,
+        limit_test_batches=0,
+        limit_val_batches=0,
+        automatic_optimization=False,
+        precision=16,
+        amp_backend='native',
+        accelerator="ddp_spawn",
+        gpus=2,
+    )
+    expected_message = "In manual optimization, `training_step` should not return a Tensor"
+    with pytest.raises(Exception, match=expected_message):
+        trainer.fit(model)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+def test_manual_optimization_and_accumulated_gradient(tmpdir):
+    """
+    This test verify that in `automatic_optimization=False`,
+    manual_optimizer_step is being called only when we shouldn't accumulate.
+    """
+    seed_everything(234)
+
+    class ExtendedModel(BoringModel):
+
+        count = 1
+        called = collections.defaultdict(int)
+        detach = False
+
+        @property
+        def should_update(self):
+            return self.count % 2 == 0
+
+        @property
+        def should_have_updated(self):
+            return self.count % 4 == 0
+
+        @property
+        def has_gradient(self):
+            return self.layer.weight.grad is not None
+
+        def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
+            self.called["on_train_batch_start"] += 1
+            self.weight_before = self.layer.weight.clone()
+
+        def training_step(self, batch, batch_idx):
+            self.called["training_step"] += 1
+            opt = self.optimizers()
+            output = self.layer(batch)
+
+            loss = self.loss(batch, output)
+            loss /= loss.clone().detach()
+            loss *= 0.1
+
+            if self.should_update:
+
+                self.manual_backward(loss, opt)
+                self.manual_optimizer_step(opt)
+
+            return loss.detach() if self.detach else loss
+
+        def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx):
+            self.called["on_train_batch_end"] += 1
+            after_before = self.layer.weight.clone()
+            if self.should_update and self.should_have_updated:
+                assert not torch.equal(self.weight_before, after_before), self.count
+                assert torch.all(self.layer.weight.grad == 0)
+            else:
+                assert torch.equal(self.weight_before, after_before)
+                if self.count > 1:
+                    if self.count % 4 == 1:
+                        assert torch.all(self.layer.weight.grad == 0)
+                    else:
+                        assert torch.sum(self.layer.weight.grad) != 0
+            self.count += 1
+
+        def on_train_end(self):
+            assert self.called["training_step"] == 20
+            assert self.called["on_train_batch_start"] == 20
+            assert self.called["on_train_batch_end"] == 20
+
+    model = ExtendedModel()
+    model.training_step_end = None
+    model.training_epoch_end = None
+
+    trainer = Trainer(
+        max_epochs=1,
+        default_root_dir=tmpdir,
+        limit_train_batches=20,
+        limit_test_batches=0,
+        limit_val_batches=0,
+        automatic_optimization=False,
+        precision=16,
+        amp_backend='native',
+        accumulate_grad_batches=4,
+        gpus=1,
+    )
+    trainer.fit(model)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+def test_multiple_optimizers_manual_optimizer_step(tmpdir):
+    os.environ['PL_DEV_DEBUG'] = '1'
+
+    """
+    Tests that `manual_optimizer_step` works with several optimizers
+    """
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx, optimizer_idx):
+            # manual
+            (opt_a, opt_b) = self.optimizers()
+            x = batch[0]
+
+            loss_1 = self(x)
+            loss_1 = self.loss(loss_1, loss_1)
+
+            # make sure there are no grads
+            if self.layer.weight.grad is not None:
+                assert torch.all(self.layer.weight.grad == 0)
+
+            self.manual_backward(loss_1, opt_a)
+            self.manual_optimizer_step(opt_a)
+
+            # fake discriminator
+            loss_2 = self(x)
+            loss_2 = self.loss(loss_2, loss_2)
+
+            # ensure we forward the correct params to the optimizer
+            # without retain_graph we can't do multiple backward passes
+            self.manual_backward(loss_2, opt_b, retain_graph=True)
+            self.manual_backward(loss_2, opt_a, retain_graph=True)
+
+            assert self.layer.weight.grad is not None
+            self.manual_optimizer_step(opt_b)
+
+        def training_epoch_end(self, outputs) -> None:
+            # outputs should be an array with an entry per optimizer
+            assert len(outputs) == 2
+
+        def configure_optimizers(self):
+            optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
+            optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1)
+            return optimizer, optimizer_2
+
+    model = TestModel()
+    model.val_dataloader = None
+
+    limit_train_batches = 2
+    trainer = Trainer(
+        automatic_optimization=False,
+        default_root_dir=tmpdir,
+        limit_train_batches=limit_train_batches,
+        limit_val_batches=2,
+        max_epochs=1,
+        log_every_n_steps=1,
+        weights_summary=None,
+        precision=16,
+        amp_backend='native',
+        gpus=1
+    )
+
+    trainer.fit(model)
+
+    num_manual_backward_calls = 3
+    assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls
diff --git a/tests/trainer/warnings_tests/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py
index 298237ad930dc4..9893a765228515 100644
--- a/tests/trainer/warnings_tests/test_flow_warnings.py
+++ b/tests/trainer/warnings_tests/test_flow_warnings.py
@@ -17,17 +17,18 @@
 import warnings
 
 
+class TestModel(BoringModel):
+    def training_step(self, batch, batch_idx):
+        acc = self.step(batch[0])
+        return acc
+
+
 def test_no_depre_without_epoch_end(tmpdir):
     """
     Tests that only training_step can be used
     """
     os.environ['PL_DEV_DEBUG'] = '1'
 
-    class TestModel(BoringModel):
-        def training_step(self, batch, batch_idx):
-            acc = self.step(batch[0])
-            return acc
-
     model = TestModel()
     model.validation_epoch_end = None
 

From 514cb22bd719e6ca056cacce730c8de875c9dbf6 Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Tue, 10 Nov 2020 21:13:41 +0000
Subject: [PATCH 02/11] [Fix] Move log value to cpu. (#4592)

* move value to cpu to save memory

* update

* move to cpu

* try something

* update

* update

* add back out_dict.update({k: v})

* add move_metrics_to_cpu

* update

* Update pytorch_lightning/utilities/memory.py

Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com>

* resolve comments

* Update pytorch_lightning/core/step_result.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

* Update pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>

Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
---
 pytorch_lightning/core/step_result.py         |  6 ++++
 .../logger_connector/epoch_result_store.py    |  4 +++
 .../logger_connector/logger_connector.py      |  4 ++-
 pytorch_lightning/trainer/trainer.py          | 32 ++++++++++++++++---
 pytorch_lightning/trainer/training_loop.py    |  2 ++
 pytorch_lightning/utilities/memory.py         |  9 ++++--
 6 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index 0eca72095e0e06..8f8a517d544f01 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -395,6 +395,12 @@ def detach(self):
             if isinstance(v, torch.Tensor):
                 self.__setitem__(k, v.detach())
 
+    def cpu(self):
+        """Move all self attributes to CPU."""
+        for k, v in self.items():
+            if isinstance(v, torch.Tensor):
+                self.__setitem__(k, v.cpu())
+
     def __repr__(self):
         self_copy = self.copy()
 
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
index 2980b037c95f71..9f8d029d9bef4c 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
@@ -392,6 +392,10 @@ def cache_result(self) -> None:
             # attach capture batch_size
             Result.attach_batch_size(self._batch_size, hook_result)
 
+            hook_result.detach()
+            if self.trainer.move_metrics_to_cpu:
+                hook_result.cpu()
+
             self._internals[fx_name].append(
                 hook_result,
                 dataloader_idx=dataloader_idx,
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 946064660f818d..6a6a3229b8061b 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -93,7 +93,7 @@ def cache_logged_metrics(self) -> Union[EpochResultStore, None]:
         if self._current_stage is not None:
             self._cached_results[self._current_stage].cache_result()
 
-    def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
+    def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
         # logging
         self.configure_logger(logger)
         # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders
@@ -101,6 +101,8 @@ def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
         self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
         self.trainer.log_every_n_steps = log_every_n_steps
 
+        self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
+
     @property
     def should_flush_logs(self):
         should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 2d4e2c0d9e4bdc..4ef83dc7de544c 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -60,6 +60,7 @@
 from pytorch_lightning.plugins.plugin_connector import PluginConnector
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
+from pytorch_lightning.utilities.memory import recursive_detach
 
 # warnings to ignore in trainer
 warnings.filterwarnings(
@@ -135,6 +136,7 @@ def __init__(
         amp_level: str = 'O2',
         distributed_backend: Optional[str] = None,
         automatic_optimization: bool = True,
+        move_metrics_to_cpu: bool = False,
     ):
         r"""
         Customize every aspect of training via flags
@@ -272,6 +274,9 @@ def __init__(
                     stored in a different place than the logs written in `default_root_dir`.
                     Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/'
                     Defaults to `default_root_dir`.
+
+            move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu.
+                This can save some gpu memory, but can make training slower. Use with attention.
         """
         super().__init__()
 
@@ -363,7 +368,12 @@ def __init__(
         self.profile_connector.on_trainer_init(profiler)
 
         # init logger flags
-        self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps)
+        self.logger_connector.on_trainer_init(
+            logger,
+            flush_logs_every_n_steps,
+            log_every_n_steps,
+            move_metrics_to_cpu
+        )
 
         # init debugging flags
         self.debugging_connector.on_init_start(
@@ -603,12 +613,11 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None):
                 # log step metrics
                 step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx)
 
-                if step_metrics is not None:
-                    dl_step_metrics.append(step_metrics)
+                # track epoch level outputs
+                dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics)
 
                 # track epoch level outputs
-                if output is not None:
-                    dl_outputs.append(output)
+                dl_outputs = self.track_output_for_epoch_end(dl_outputs, output)
 
             self.evaluation_loop.outputs.append(dl_outputs)
             self.evaluation_loop.step_metrics.append(dl_step_metrics)
@@ -634,6 +643,19 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None):
 
         return eval_loop_results, deprecated_eval_results
 
+    def track_output_for_epoch_end(self, outputs, output):
+        if output is not None:
+            if isinstance(output, Result):
+                output.detach()
+                if self.move_metrics_to_cpu:
+                    output.cpu()
+            elif isinstance(output, dict):
+                output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu)
+            elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu:
+                output = output.cpu()
+            outputs.append(output)
+        return outputs
+
     def run_test(self):
         # only load test dataloader for testing
         # self.reset_test_dataloader(ref_model)
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 1cf06c3709e7ec..f705d82868da78 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -434,6 +434,8 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch):
         # track metrics without grads for epoch reduction
         training_step_output_for_epoch_end = copy(result)
         training_step_output_for_epoch_end.detach()
+        if self.trainer.move_metrics_to_cpu:
+            training_step_output_for_epoch_end.cpu()
 
         # what flows back into the system
         training_step_output = result
diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py
index 1d3b8d27807f05..16c0ede1e5413e 100644
--- a/pytorch_lightning/utilities/memory.py
+++ b/pytorch_lightning/utilities/memory.py
@@ -17,7 +17,7 @@
 import torch
 
 
-def recursive_detach(in_dict: dict) -> dict:
+def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict:
     """Detach all tensors in `in_dict`.
 
     May operate recursively if some of the values in `in_dict` are dictionaries
@@ -26,6 +26,7 @@ def recursive_detach(in_dict: dict) -> dict:
 
     Args:
         in_dict:
+        to_cpu: Wheter to move tensor to cpu
 
     Return:
         out_dict:
@@ -35,7 +36,11 @@ def recursive_detach(in_dict: dict) -> dict:
         if isinstance(v, dict):
             out_dict.update({k: recursive_detach(v)})
         elif callable(getattr(v, 'detach', None)):
-            out_dict.update({k: v.detach()})
+            # detach
+            v = v.detach()
+            if to_cpu:
+                v = v.cpu()
+            out_dict.update({k: v})
         else:
             out_dict.update({k: v})
     return out_dict

From 3d202f9ecc4137b08cb5b1ac15af276456fcfaaf Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Wed, 11 Nov 2020 17:05:24 +0000
Subject: [PATCH 03/11] [FEAT] Refactor logging 3/3 [v1] (#4552)

* wip

* wip check how many tests break

* wip

* resolve some bugs

* resolve more bugs

* resolve 2 bugs

* resolve

* temp fix

* update

* remove useless code

* remove result

* try to resolve bug

* update changelog

* formatting

* remove pl

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Sean Naren <sean.narenthiran@gmail.com>
---
 CHANGELOG.md                                  |   7 +-
 .../logger_connector/epoch_result_store.py    |  10 +-
 .../logger_connector/logger_connector.py      | 151 ++-----
 pytorch_lightning/trainer/evaluation_loop.py  |  91 ++---
 pytorch_lightning/trainer/trainer.py          |  53 +--
 tests/models/test_hooks.py                    |   2 +-
 .../test_eval_loop_dict_return.py             |   6 +-
 .../trainer/logging/test_logger_connector.py  |  12 +-
 .../test_eval_loop_logging_1_0.py             | 383 +++++++++++++++++-
 9 files changed, 501 insertions(+), 214 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 16daa24aa2ed9b..7da839b74e3432 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,8 +33,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775))
 
 
-- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
+- Added logging using `self.log` in train and evaluation for most callbacks and model hooks (
+    [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552),
+    [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495),
+    [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)
+)
 
+- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
 
 - Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
 
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
index 9f8d029d9bef4c..7ea2e8208a8748 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from copy import deepcopy
 from collections import defaultdict, ChainMap
 from enum import Enum
 from typing import Union, Tuple, Any, Dict, Optional, List
@@ -419,13 +420,14 @@ def update_logger_connector(self, fx_name: str = None) -> None:
         logger_connector = self.trainer.logger_connector
 
         callback_metrics = {}
+        is_train = self._stage in LoggerStages.TRAIN.value
 
         if not self._has_batch_loop_finished:
             # get pbar
             batch_pbar_metrics = self.get_latest_batch_pbar_metrics()
             logger_connector.add_progress_bar_metrics(batch_pbar_metrics)
 
-            if self._stage in LoggerStages.TRAIN.value:
+            if is_train:
                 # Only log and add to callback epoch step during evaluation, test.
                 batch_log_metrics = self.get_latest_batch_log_metrics()
                 logger_connector.logged_metrics.update(batch_log_metrics)
@@ -443,6 +445,9 @@ def update_logger_connector(self, fx_name: str = None) -> None:
             epoch_log_metrics = self.get_epoch_log_metrics()
             logger_connector.logged_metrics.update(epoch_log_metrics)
             logger_connector.logged_metrics.update(epoch_dict)
+            if not self.trainer.running_sanity_check and not is_train:
+                if len(epoch_log_metrics) > 0:
+                    self.trainer.dev_debugger.track_logged_metrics_history(deepcopy(epoch_log_metrics))
 
             # get forked_metrics
             forked_metrics = self.get_forked_metrics()
@@ -451,6 +456,9 @@ def update_logger_connector(self, fx_name: str = None) -> None:
             callback_metrics.update(epoch_log_metrics)
             callback_metrics.update(forked_metrics)
 
+        if not is_train:
+            logger_connector.evaluation_callback_metrics.update(callback_metrics)
+
         # update callback_metrics
         logger_connector.callback_metrics.update(callback_metrics)
         logger_connector.callback_metrics.pop("epoch", None)
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 6a6a3229b8061b..7116739c8aa1f8 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -36,6 +36,7 @@ class LoggerConnector:
     def __init__(self, trainer):
         self.trainer = trainer
         self.callback_metrics = {}
+        self.evaluation_callback_metrics = {}
         self.logged_metrics = {}
         self.progress_bar_metrics = {}
         self.eval_loop_results = []
@@ -59,10 +60,9 @@ def check_logging_in_callbacks(self, hook_fx_name, on_step: bool = None, on_epoc
                                                                  on_epoch=on_epoch)
 
     def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders):
-        # reset the result of the PL module
         model = self.trainer.get_model()
+        # set dataloader_idx only if multiple ones
         model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None
-
         # track batch_size
         self.cached_results._batch_size = Result.extract_batch_size(batch)
 
@@ -226,19 +226,41 @@ def add_progress_bar_metrics(self, metrics):
 
         self.trainer.dev_debugger.track_pbar_metrics_history(metrics)
 
-    def on_evaluation_epoch_end(self, deprecated_eval_results, epoch_logs, using_eval_result, test_mode):
+    def track_metrics_deprecated(self, deprecated_eval_results, using_eval_result, test_mode):
         self._track_callback_metrics(deprecated_eval_results, using_eval_result)
-
-        # TODO: deprecate parts of this for 1.0 (when removing results)
         self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results, test_mode)
 
-        self._log_on_evaluation_epoch_end_metrics(epoch_logs)
+    def evaluation_epoch_end(self, testing):
+        # reset dataloader idx
+        model_ref = self.trainer.get_model()
+        model_ref._current_dataloader_idx = None
+
+        # setting `has_batch_loop_finished` to True
+        # will perform Results reduction accross entire epoch.
+        self.cached_results.has_batch_loop_finished = True
+
+    def add_to_eval_loop_results(self, dl_idx, has_been_initialized):
+        callback_metrics = deepcopy(self.evaluation_callback_metrics)
+        for key in list(callback_metrics.keys()):
+            if "dataloader_idx" in key:
+                if f"dataloader_idx_{dl_idx}" not in key:
+                    # remove dl_idx from self.callback_metrics not belonging to this dataset.
+                    del callback_metrics[key]
+        if has_been_initialized:
+            self.eval_loop_results[dl_idx].update(callback_metrics)
+        else:
+            self.eval_loop_results.append(callback_metrics)
 
-        # get the final loop results
-        eval_loop_results = self._get_evaluate_epoch_results(test_mode)
-        return eval_loop_results
+    def prepare_eval_loop_results(self):
+        num_dataloaders = self.trainer.evaluation_loop.num_dataloaders
+        has_been_initialized = len(self.eval_loop_results) == num_dataloaders
+        for dl_idx in range(self.trainer.evaluation_loop.num_dataloaders):
+            self.add_to_eval_loop_results(dl_idx, has_been_initialized)
+
+    def get_evaluate_epoch_results(self, test_mode):
+
+        self.prepare_eval_loop_results()
 
-    def _get_evaluate_epoch_results(self, test_mode):
         # log results of test
         if test_mode and self.trainer.is_global_zero and self.trainer.verbose_test:
             print('-' * 80)
@@ -253,106 +275,6 @@ def _get_evaluate_epoch_results(self, test_mode):
         self.eval_loop_results = []
         return results
 
-    def _log_on_evaluation_epoch_end_metrics(self, epoch_logs):
-        step_metrics = self.trainer.evaluation_loop.step_metrics
-
-        num_loaders = len(step_metrics)
-
-        # clear mem
-        self.trainer.evaluation_loop.step_metrics = []
-
-        if self.trainer.running_sanity_check:
-            return
-
-        # track all metrics we want to log
-        metrics_to_log = []
-
-        # ---------------------------
-        # UPDATE EPOCH LOGGED METRICS
-        # ---------------------------
-        # (ie: in methods at the val_epoch_end level)
-        # union the epoch logs with whatever was returned from loaders and reduced
-        epoch_logger_metrics = epoch_logs.get_epoch_log_metrics()
-        epoch_pbar_metrics = epoch_logs.get_epoch_pbar_metrics()
-
-        self.logged_metrics.update(epoch_logger_metrics)
-        self.add_progress_bar_metrics(epoch_pbar_metrics)
-
-        # enable the metrics to be monitored
-        self.callback_metrics.update(epoch_logger_metrics)
-        self.callback_metrics.update(epoch_pbar_metrics)
-
-        if len(epoch_logger_metrics) > 0:
-            metrics_to_log.append(epoch_logger_metrics)
-
-        # --------------------------------
-        # UPDATE  METRICS PER DATALOADER
-        # --------------------------------
-        # each dataloader aggregated metrics
-        # now we log all of them
-        for dl_idx, dl_metrics in enumerate(step_metrics):
-            if len(dl_metrics) == 0:
-                # Ensure custom logged metrics are included if not included with step metrics
-                if len(epoch_logger_metrics) > 0:
-                    self.eval_loop_results.append(epoch_logger_metrics)
-                continue
-
-            reduced_epoch_metrics = dl_metrics[0].__class__.reduce_on_epoch_end(dl_metrics)
-            # track the metrics
-            logger_metrics = reduced_epoch_metrics.get_epoch_log_metrics()
-            pbar_metrics = reduced_epoch_metrics.get_epoch_pbar_metrics()
-            forked_metrics = reduced_epoch_metrics.get_forked_metrics()
-
-            # make the keys 'k/dl'
-            logger_metrics = self.__rename_keys_by_dataloader_idx(logger_metrics, dl_idx, num_loaders)
-            pbar_metrics = self.__rename_keys_by_dataloader_idx(pbar_metrics, dl_idx, num_loaders)
-            forked_metrics = self.__rename_keys_by_dataloader_idx(forked_metrics, dl_idx, num_loaders)
-
-            self.logged_metrics.update(logger_metrics)
-            self.add_progress_bar_metrics(pbar_metrics)
-
-            # enable the metrics to be monitored
-            self.callback_metrics.update(logger_metrics)
-            self.callback_metrics.update(pbar_metrics)
-
-            # forked metrics were dropped, enable them for callbacks
-            self.callback_metrics.update(forked_metrics)
-
-            # track the final results for the dataloader
-            self.add_to_eval_loop_results(dl_idx, num_loaders)
-
-            # actually log
-            if len(logger_metrics) > 0:
-                metrics_to_log.append(logger_metrics)
-
-        # log all the metrics as a s single dict
-        metrics_to_log = dict(ChainMap(*metrics_to_log))
-        if len(metrics_to_log) > 0:
-            self.log_metrics(metrics_to_log, {})
-
-    def add_to_eval_loop_results(self, dl_idx, num_loaders):
-        callback_metrics = deepcopy(self.callback_metrics)
-        if num_loaders == 1:
-            if len(self.eval_loop_results) > 0:
-                self.eval_loop_results[0].update(callback_metrics)
-            else:
-                self.eval_loop_results.append(callback_metrics)
-            return
-
-        for key in list(callback_metrics.keys()):
-            if "dataloader_idx" in key:
-                if f"dataloader_idx_{dl_idx}" not in key:
-                    # remove dl_idx from self.callback_metrics not belonging to this dataset.
-                    del callback_metrics[key]
-        self.eval_loop_results.append(callback_metrics)
-
-    def __rename_keys_by_dataloader_idx(self, metrics, dataloader_idx, num_loaders):
-        if num_loaders == 1:
-            return metrics
-
-        result = {f'{k}/dataloader_idx_{dataloader_idx}': v for k, v in metrics.items()}
-        return result
-
     def _track_callback_metrics(self, eval_results, using_eval_result):
         if (
                 len(eval_results) > 0 and
@@ -364,8 +286,10 @@ def _track_callback_metrics(self, eval_results, using_eval_result):
             if isinstance(eval_results, list):
                 for eval_result in eval_results:
                     self.trainer.logger_connector.callback_metrics.update(eval_result.callback_metrics)
+                    self.trainer.logger_connector.evaluation_callback_metrics.update(eval_result.callback_metrics)
             else:
                 self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics)
+                self.trainer.logger_connector.evaluation_callback_metrics.update(eval_results.callback_metrics)
         else:
             flat = {}
             if isinstance(eval_results, list):
@@ -381,6 +305,7 @@ def _track_callback_metrics(self, eval_results, using_eval_result):
                         flat['checkpoint_on'] = flat['val_loss']
                         flat['early_stop_on'] = flat['val_loss']
                     self.trainer.logger_connector.callback_metrics.update(flat)
+                    self.trainer.logger_connector.evaluation_callback_metrics.update(flat)
             else:
                 # with a scalar return, auto set it to "val_loss" for callbacks
                 if isinstance(eval_results, torch.Tensor):
@@ -393,6 +318,7 @@ def _track_callback_metrics(self, eval_results, using_eval_result):
                     flat['checkpoint_on'] = flat['val_loss']
                     flat['early_stop_on'] = flat['val_loss']
                 self.trainer.logger_connector.callback_metrics.update(flat)
+                self.trainer.logger_connector.evaluation_callback_metrics.update(flat)
 
     def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics):
         # eval loop returns all metrics
@@ -406,9 +332,10 @@ def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metric
             self.trainer.logger_connector.log_metrics(log_metrics, {})
 
         # track metrics for callbacks (all prog bar, logged and callback metrics)
+        callback_metrics.update(log_metrics)
+        callback_metrics.update(prog_bar_metrics)
         self.trainer.logger_connector.callback_metrics.update(callback_metrics)
-        self.trainer.logger_connector.callback_metrics.update(log_metrics)
-        self.trainer.logger_connector.callback_metrics.update(prog_bar_metrics)
+        self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics)
 
         if len(dataloader_result_metrics) > 0:
             self.eval_loop_results.append(dataloader_result_metrics)
diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py
index 6ebab1ade0f1d3..e3a0f1108f1f96 100644
--- a/pytorch_lightning/trainer/evaluation_loop.py
+++ b/pytorch_lightning/trainer/evaluation_loop.py
@@ -109,9 +109,6 @@ def on_evaluation_end(self, *args, **kwargs):
         else:
             self.trainer.call_hook('on_validation_end', *args, **kwargs)
 
-        # reset stage to train
-        self.trainer.logger_connector.set_stage("train")
-
     def reload_evaluation_dataloaders(self):
         model = self.trainer.get_model()
         if self.testing:
@@ -169,12 +166,17 @@ def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx):
         # configure args
         args = self.build_args(test_mode, batch, batch_idx, dataloader_idx)
 
+        model_ref = self.trainer.get_model()
         # run actual test step
         if self.testing:
+            model_ref._current_fx_name = "test_step"
             output = self.trainer.accelerator_backend.test_step(args)
         else:
+            model_ref._current_fx_name = "validation_step"
             output = self.trainer.accelerator_backend.validation_step(args)
 
+        # capture any logged information
+        self.trainer.logger_connector.cache_logged_metrics()
         # track batch size for weighted average
         is_result_obj = isinstance(output, Result)
         if is_result_obj:
@@ -194,38 +196,30 @@ def evaluation_step_end(self, *args, **kwargs):
             output = self.trainer.call_hook('validation_step_end', *args, **kwargs)
         return output
 
-    def evaluation_epoch_end(self, num_dataloaders):
+    def evaluation_epoch_end(self):
+        # unset dataloder_idx in model
+        self.trainer.logger_connector.evaluation_epoch_end(self.testing)
+
         using_eval_result = self.is_using_eval_results()
 
         # call the model epoch end
-        deprecated_results = self.__run_eval_epoch_end(num_dataloaders, using_eval_result)
-
-        # 1.0
-        epoch_logs = self.trainer.get_model()._results
+        deprecated_results = self.__run_eval_epoch_end(self.num_dataloaders, using_eval_result)
 
         # enable returning anything
         for i, r in enumerate(deprecated_results):
             if not isinstance(r, (dict, Result, torch.Tensor)):
                 deprecated_results[i] = []
 
-        return deprecated_results, epoch_logs
+        return deprecated_results
 
-    def log_epoch_metrics(self, deprecated_eval_results, epoch_logs, test_mode):
-        using_eval_result = self.is_using_eval_results()
-        eval_loop_results = self.trainer.logger_connector.on_evaluation_epoch_end(
-            deprecated_eval_results,
-            epoch_logs,
-            using_eval_result,
-            test_mode
-        )
+    def log_epoch_metrics_on_evaluation_end(self):
+        # get the final loop results
+        eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results(self.testing)
         return eval_loop_results
 
     def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
         model = self.trainer.get_model()
 
-        # reset results
-        model._results = Result()
-
         # with a single dataloader don't pass an array
         outputs = self.outputs
         eval_results = outputs
@@ -236,22 +230,22 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
 
         if self.testing:
             if is_overridden('test_epoch_end', model=model):
-                model._current_fx_name = 'test_epoch_end'
                 if using_eval_result:
                     eval_results = self.__gather_epoch_end_eval_results(outputs)
-
+                model._current_fx_name = 'test_epoch_end'
                 eval_results = model.test_epoch_end(eval_results)
                 user_reduced = True
 
         else:
             if is_overridden('validation_epoch_end', model=model):
-                model._current_fx_name = 'validation_epoch_end'
                 if using_eval_result:
                     eval_results = self.__gather_epoch_end_eval_results(outputs)
-
+                model._current_fx_name = 'validation_epoch_end'
                 eval_results = model.validation_epoch_end(eval_results)
                 user_reduced = True
 
+        # capture logging
+        self.trainer.logger_connector.cache_logged_metrics()
         # depre warning
         if eval_results is not None and user_reduced:
             step = 'testing_epoch_end' if self.testing else 'validation_epoch_end'
@@ -266,6 +260,9 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result):
         if not isinstance(eval_results, list):
             eval_results = [eval_results]
 
+        # track depreceated metrics
+        self.trainer.logger_connector.track_metrics_deprecated(eval_results, using_eval_result, self.testing)
+
         return eval_results
 
     def __gather_epoch_end_eval_results(self, outputs):
@@ -299,12 +296,7 @@ def __auto_reduce_result_objs(self, outputs):
         return eval_results
 
     def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx):
-        # reset the result of the PL module
-        model = self.trainer.get_model()
-        model._results = Result()
-        model._current_fx_name = 'evaluation_step'
-
-        # set dataloader_idx and track batch_size
+        # set dataloader_idx to model and track batch_size
         self.trainer.logger_connector.on_evaluation_batch_start(
             self.testing, batch, dataloader_idx, self.num_dataloaders)
 
@@ -313,13 +305,16 @@ def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx):
         else:
             self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx)
 
-    def on_evaluation_batch_end(self, *args, **kwargs):
+    def on_evaluation_batch_end(self, output, batch, batch_idx, dataloader_idx):
         if self.testing:
-            self.trainer.call_hook('on_test_batch_end', *args, **kwargs)
+            self.trainer.call_hook('on_test_batch_end', output, batch, batch_idx, dataloader_idx)
         else:
-            self.trainer.call_hook('on_validation_batch_end', *args, **kwargs)
+            self.trainer.call_hook('on_validation_batch_end', output, batch, batch_idx, dataloader_idx)
 
-    def evaluation_batch_end_cleanup(self, output, batch_idx, dataloader_idx):
+        # store predicitons if do_write_predictions and track eval loss history
+        self.store_predictions(output, batch_idx, dataloader_idx)
+
+    def store_predictions(self, output, batch_idx, dataloader_idx):
         # Add step predictions to prediction collection to write later
         if output is not None:
             do_write_predictions = isinstance(output, Result) and self.testing
@@ -336,30 +331,26 @@ def on_evaluation_epoch_end(self, *args, **kwargs):
         else:
             self.trainer.call_hook('on_validation_epoch_end', *args, **kwargs)
 
-    def log_evaluation_step_metrics(self, batch, batch_idx):
-        results = self.trainer.get_model()._results
-        if len(results) == 1:
-            return None
-
-        results.track_batch_size(batch)
-        self.__log_result_step_metrics(results, batch_idx)
-
-        return results
-
-    # TODO: deprecate at 1.0
-    def log_evaluation_step_metrics_legacy(self, output, batch_idx):
+    def log_evaluation_step_metrics(self, output, batch_idx):
         if self.trainer.running_sanity_check:
             return
 
+        step_log_metrics = {}
+        step_pbar_metrics = {}
         if isinstance(output, EvalResult):
-            self.__log_result_step_metrics(output, batch_idx)
+            step_log_metrics = output.get_batch_log_metrics(include_forked_originals=False)
+            step_pbar_metrics = output.get_batch_pbar_metrics(include_forked_originals=False)
 
-    def __log_result_step_metrics(self, output, batch_idx):
-        step_log_metrics = output.get_batch_log_metrics(include_forked_originals=False)
-        step_pbar_metrics = output.get_batch_pbar_metrics(include_forked_originals=False)
+        self.__log_result_step_metrics(step_log_metrics, step_pbar_metrics, batch_idx)
 
+    def __log_result_step_metrics(self, step_log_metrics, step_pbar_metrics, batch_idx):
         cached_batch_log_metrics = \
             self.trainer.logger_connector.cached_results.get_latest_batch_log_metrics()
+        cached_batch_pbar_metrics = \
+            self.trainer.logger_connector.cached_results.get_latest_batch_pbar_metrics()
+
+        step_log_metrics.update(cached_batch_log_metrics)
+        step_pbar_metrics.update(cached_batch_pbar_metrics)
 
         if len(step_log_metrics) > 0:
             # make the metrics appear as a different line in the same graph
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 4ef83dc7de544c..ad7a20e500687e 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -239,6 +239,8 @@ def __init__(
 
             num_nodes: number of GPU nodes for distributed training.
 
+            num_processes: number of processes for distributed training with distributed_backend="ddp_cpu"
+
             num_sanity_val_steps: Sanity check runs n validation batches before starting the training routine.
                 Set it to `-1` to run all batches in all validation dataloaders. Default: 2
 
@@ -575,14 +577,12 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None):
         self.evaluation_loop.setup(model, max_batches, dataloaders)
 
         # hook
-        # TODO: should this be insider the dataloader loop?
         self.evaluation_loop.on_evaluation_epoch_start()
 
         # run validation/testing
         for dataloader_idx, dataloader in enumerate(dataloaders):
             # bookkeeping
             dl_outputs = []
-            dl_step_metrics = []
             dataloader = self.accelerator_backend.process_dataloader(dataloader)
             dl_max_batches = self.evaluation_loop.max_batches[dataloader_idx]
 
@@ -601,46 +601,37 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None):
                 output = self.evaluation_loop.evaluation_step(test_mode, batch, batch_idx, dataloader_idx)
                 output = self.evaluation_loop.evaluation_step_end(output)
 
-                # hook
+                # hook + store predictions
                 self.evaluation_loop.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx)
 
-                # clean up
-                self.evaluation_loop.evaluation_batch_end_cleanup(output, batch_idx, dataloader_idx)
-
-                # TODO: deprecate 1.0
-                self.evaluation_loop.log_evaluation_step_metrics_legacy(output, batch_idx)
-
-                # log step metrics
-                step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx)
-
-                # track epoch level outputs
-                dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics)
+                # log batch metrics
+                self.evaluation_loop.log_evaluation_step_metrics(output, batch_idx)
 
                 # track epoch level outputs
                 dl_outputs = self.track_output_for_epoch_end(dl_outputs, output)
 
+            # store batch level output per dataloader
             self.evaluation_loop.outputs.append(dl_outputs)
-            self.evaluation_loop.step_metrics.append(dl_step_metrics)
 
         # lightning module method
-        deprecated_eval_results, epoch_logs = self.evaluation_loop.evaluation_epoch_end(
-            num_dataloaders=len(dataloaders)
-        )
-
-        # bookkeeping
-        eval_loop_results = self.evaluation_loop.log_epoch_metrics(deprecated_eval_results, epoch_logs, test_mode)
-        self.evaluation_loop.predictions.to_disk()
+        deprecated_eval_results = self.evaluation_loop.evaluation_epoch_end()
 
         # hook
         self.evaluation_loop.on_evaluation_epoch_end()
 
+        # hook
+        self.evaluation_loop.on_evaluation_end()
+
+        # log epoch metrics
+        eval_loop_results = self.evaluation_loop.log_epoch_metrics_on_evaluation_end()
+
+        # save predictions to disk
+        self.evaluation_loop.predictions.to_disk()
+
         # enable train mode again
         self.evaluation_loop.on_evaluation_model_train()
         torch.set_grad_enabled(True)
 
-        # hook
-        self.evaluation_loop.on_evaluation_end()
-
         return eval_loop_results, deprecated_eval_results
 
     def track_output_for_epoch_end(self, outputs, output):
@@ -874,10 +865,8 @@ def _cache_logged_metrics(self):
             self.logger_connector.cache_logged_metrics()
 
     def call_hook(self, hook_name, *args, **kwargs):
-        # temporary. Don't modify evaluation behaviour
-        if self.logger_connector._current_stage == "train":
-            # set hook_name to model + reset Result obj
-            self._reset_result_and_set_hook_fx_name(hook_name)
+        # set hook_name to model + reset Result obj
+        self._reset_result_and_set_hook_fx_name(hook_name)
 
         # always profile hooks
         with self.profiler.profile(hook_name):
@@ -900,8 +889,6 @@ def call_hook(self, hook_name, *args, **kwargs):
                 accelerator_hook = getattr(self.accelerator_backend, hook_name)
                 output = accelerator_hook(*args, **kwargs)
 
-        # temporary. Don't modify evaluation behaviour
-        if self.logger_connector._current_stage == "train":
-            # capture logging
-            self._cache_logged_metrics()
+        # capture logging
+        self._cache_logged_metrics()
         return output
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index bccc5262a5bda9..f3af5b745a3804 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -333,8 +333,8 @@ def on_test_model_train(self):
         'on_validation_batch_start',
         'on_validation_batch_end',
         'on_validation_epoch_end',
-        'on_validation_model_train',
         'on_save_checkpoint',
+        'on_validation_model_train',
         'on_epoch_end',
         'on_train_epoch_end',
         'on_train_end',
diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py
index 6329480e10a113..8168f09c68e003 100644
--- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py
+++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py
@@ -44,7 +44,7 @@ def backward(self, loss, optimizer, optimizer_idx):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     out, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(out) == 0
+    assert len(out) == 1
     assert len(eval_results) == 0
 
     # make sure correct steps were called
@@ -75,7 +75,7 @@ def test_validation_step_scalar_return(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     out, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(out) == 0
+    assert len(out) == 1
     assert len(eval_results) == 2
     assert eval_results[0] == 171 and eval_results[1] == 171
 
@@ -190,7 +190,7 @@ def test_val_step_step_end_no_return(tmpdir):
     # out are the results of the full loop
     # eval_results are output of _evaluate
     callback_metrics, eval_results = trainer.run_evaluation(test_mode=False)
-    assert len(callback_metrics) == 0
+    assert len(callback_metrics) == 1
     assert len(eval_results) == 0
 
     # make sure correct steps were called
diff --git a/tests/trainer/logging/test_logger_connector.py b/tests/trainer/logging/test_logger_connector.py
index 08936f89eb9f85..38a7dee896a8cb 100644
--- a/tests/trainer/logging/test_logger_connector.py
+++ b/tests/trainer/logging/test_logger_connector.py
@@ -240,6 +240,14 @@ def test_step(self, batch, batch_idx, dl_idx=0):
             self.log("test_loss", loss, on_step=True, on_epoch=True)
             return {"test_loss": loss}
 
+        def on_test_batch_end(self, *args, **kwargs):
+            # save objects as it will be reset at the end of epoch.
+            self.batch_results = deepcopy(self.trainer.logger_connector.cached_results)
+
+        def on_test_epoch_end(self):
+            # save objects as it will be reset at the end of epoch.
+            self.reduce_results = deepcopy(self.trainer.logger_connector.cached_results)
+
         def test_dataloader(self):
             return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)]
 
@@ -260,7 +268,7 @@ def test_dataloader(self):
     )
     trainer.test(model)
 
-    test_results = trainer.logger_connector._cached_results["test"]
+    test_results = model.batch_results
 
     generated = test_results(fx_name="test_step")
     assert len(generated) == num_dataloaders
@@ -269,7 +277,7 @@ def test_dataloader(self):
         generated = len(test_results(fx_name="test_step", dl_idx=str(dl_idx)))
         assert generated == limit_test_batches
 
-    test_results.has_batch_loop_finished = True
+    test_results = model.reduce_results
 
     for dl_idx in range(num_dataloaders):
         expected = torch.stack(model.test_losses[str(dl_idx)]).mean()
diff --git a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py
index 0f3217b3f004cb..12f53328ec98ad 100644
--- a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py
+++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py
@@ -20,6 +20,9 @@
 from tests.base.deterministic_model import DeterministicModel
 from tests.base import SimpleModule, BoringModel, RandomDataset
 import os
+import numpy as np
+import itertools
+import collections
 import torch
 import pytest
 
@@ -299,20 +302,16 @@ def validation_epoch_end(self, outputs) -> None:
     # make sure correct values were logged
     logged_val = trainer.dev_debugger.logged_metrics
 
-    # sanity check
-    assert logged_val[0]['global_step'] == 0
-    assert logged_val[1]['global_step'] == 0
-
     # 3 val batches
-    assert logged_val[2]['val_loss_step/epoch_0'] == model.seen_vals[0]
-    assert logged_val[3]['val_loss_step/epoch_0'] == model.seen_vals[1]
-    assert logged_val[4]['val_loss_step/epoch_0'] == model.seen_vals[2]
+    assert logged_val[0]['val_loss_step/epoch_0'] == model.seen_vals[0]
+    assert logged_val[1]['val_loss_step/epoch_0'] == model.seen_vals[1]
+    assert logged_val[2]['val_loss_step/epoch_0'] == model.seen_vals[2]
 
     # epoch mean
-    assert logged_val[5]['val_loss_epoch'] == model.manual_epoch_end_mean
+    assert logged_val[3]['val_loss_epoch'] == model.manual_epoch_end_mean
 
     # only those logged
-    assert len(logged_val) == 6
+    assert len(logged_val) == 4
 
 
 @pytest.mark.parametrize(['batches', 'log_interval', 'max_epochs'], [(1, 1, 1), (64, 32, 2)])
@@ -320,7 +319,7 @@ def test_eval_epoch_only_logging(tmpdir, batches, log_interval, max_epochs):
     """
     Tests that only test_epoch_end can be used to log, and we return them in the results.
     """
-    os.environ['PL_DEV_DEBUG'] = '1'
+    os.environ['PL_DEV_DEBUG'] = '0'
 
     class TestModel(BoringModel):
         def test_epoch_end(self, outputs):
@@ -386,7 +385,6 @@ def test_dataloader(self):
         weights_summary=None,
     )
     results = trainer.test(model)
-    assert len(results[0]) == len(results[1])
     assert "test_loss_epoch/dataloader_idx_0" in results[0]
     assert "test_loss_epoch/dataloader_idx_1" in results[1]
 
@@ -419,3 +417,366 @@ def test_dataloader(self):
     assert len(results) == 1
     # error : It is wrong there. `y` should equal test_loss_epoch
     assert results[0]['test_loss'] == results[0]['y']
+
+
+def test_log_works_in_val_callback(tmpdir):
+    """
+    Tests that log can be called within callback
+    """
+    os.environ['PL_DEV_DEBUG'] = '1'
+
+    class TestCallback(callbacks.Callback):
+
+        # helpers
+        count = 1
+        choices = [False, True]
+        # used to compute expected values
+        callback_funcs_called = collections.defaultdict(list)
+        funcs_called_count = collections.defaultdict(int)
+        funcs_attr = {}
+
+        def make_logging(self, pl_module, func_name,
+                         func_idx, on_steps=[], on_epochs=[], prob_bars=[]):
+            self.funcs_called_count[func_name] += 1
+            product = [on_steps, on_epochs, prob_bars]
+            for idx, (on_step, on_epoch, prog_bar) in enumerate(list(itertools.product(*product))):
+                # run logging
+                custom_func_name = f"{func_idx}_{idx}_{func_name}"
+                pl_module.log(custom_func_name, self.count * func_idx,
+                              on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar)
+                # catch information for verification
+                self.callback_funcs_called[func_name].append([self.count * func_idx])
+                self.funcs_attr[custom_func_name] = {
+                    "on_step": on_step,
+                    "on_epoch": on_epoch,
+                    "prog_bar": prog_bar,
+                    "forked": on_step and on_epoch,
+                    "func_name": func_name}
+
+                if on_step and on_epoch:
+                    self.funcs_attr[f"{custom_func_name}_step"] = {
+                        "on_step": True,
+                        "on_epoch": False,
+                        "prog_bar": prog_bar,
+                        "forked": False,
+                        "func_name": func_name}
+
+                    self.funcs_attr[f"{custom_func_name}_epoch"] = {
+                        "on_step": False,
+                        "on_epoch": True,
+                        "prog_bar": prog_bar,
+                        "forked": False,
+                        "func_name": func_name}
+
+        def on_validation_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_validation_start', 1, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_epoch_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_epoch_start', 2, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_validation_epoch_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_validation_epoch_start', 3, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_batch_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_batch_start', 4, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_validation_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
+            self.make_logging(pl_module, 'on_validation_batch_start', 5, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_batch_end(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_batch_end', 6, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
+            self.make_logging(pl_module, 'on_validation_batch_end', 7, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+            # used to make sure aggregation works fine.
+            # we should obtain func[value * c for c in range(1, max_epochs * limit_validation_batches)])
+            # with func = np.mean if on_epoch else func = np.max
+            self.count += 1
+
+        def on_epoch_end(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_epoch_end', 8, on_steps=[False],
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_validation_epoch_end(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_validation_epoch_end', 9, on_steps=[False],
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+    class TestModel(BoringModel):
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('val_loss', loss)
+
+    max_epochs = 1
+    model = TestModel()
+    model.validation_epoch_end = None
+    test_callback = TestCallback()
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=4,
+        limit_test_batches=0,
+        val_check_interval=0.,
+        num_sanity_val_steps=0,
+        max_epochs=max_epochs,
+        callbacks=[test_callback],
+    )
+    trainer.fit(model)
+    trainer.test()
+
+    assert test_callback.funcs_called_count["on_epoch_start"] == 1
+    assert test_callback.funcs_called_count["on_batch_start"] == 1
+    assert test_callback.funcs_called_count["on_batch_end"] == 1
+    assert test_callback.funcs_called_count["on_validation_start"] == 1
+    assert test_callback.funcs_called_count["on_validation_epoch_start"] == 1
+    assert test_callback.funcs_called_count["on_validation_batch_start"] == 4
+    assert test_callback.funcs_called_count["on_validation_batch_end"] == 4
+    assert test_callback.funcs_called_count["on_validation_epoch_end"] == 1
+    assert test_callback.funcs_called_count["on_epoch_end"] == 1
+
+    # Make sure the func_name exists within callback_metrics. If not, we missed some
+    callback_metrics_keys = [*trainer.callback_metrics.keys()]
+    for func_name in test_callback.callback_funcs_called.keys():
+        is_in = False
+        for callback_metrics_key in callback_metrics_keys:
+            if func_name in callback_metrics_key:
+                is_in = True
+        assert is_in, (func_name, callback_metrics_keys)
+
+    # function used to describe expected return logic
+    def get_expected_output(func_attr, original_values):
+
+        if func_attr["on_epoch"] and not func_attr["on_step"]:
+            # Apply mean on values
+            expected_output = np.mean(original_values)
+        else:
+            # Keep the latest value
+            expected_output = np.max(original_values)
+        return expected_output
+
+    # Make sure the func_name output equals the average from all logged values when on_epoch true
+    # pop extra keys
+    trainer.callback_metrics.pop("debug_epoch")
+    trainer.callback_metrics.pop("val_loss")
+    for func_name, output_value in trainer.callback_metrics.items():
+        # not sure how to handle this now
+        if "epoch_0" in func_name:
+            func_name = '/'.join(func_name.split('/')[:-1])
+            continue
+
+        if torch.is_tensor(output_value):
+            output_value = output_value.item()
+        # get creation attr
+        func_attr = test_callback.funcs_attr[func_name]
+
+        # retrived orginal logged values
+        original_values = test_callback.callback_funcs_called[func_attr["func_name"]]
+
+        # compute expected output and compare to actual one
+        expected_output = get_expected_output(func_attr, original_values)
+        assert float(output_value) == float(expected_output)
+
+    for func_name, func_attr in test_callback.funcs_attr.items():
+        if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]:
+            assert func_name in trainer.logger_connector.progress_bar_metrics
+        else:
+            assert func_name not in trainer.logger_connector.progress_bar_metrics
+
+
+def test_log_works_in_test_callback(tmpdir):
+    """
+    Tests that log can be called within callback
+    """
+    os.environ['PL_DEV_DEBUG'] = '1'
+
+    class TestCallback(callbacks.Callback):
+
+        # helpers
+        count = 1
+        choices = [False, True]
+
+        # used to compute expected values
+        callback_funcs_called = collections.defaultdict(list)
+        funcs_called_count = collections.defaultdict(int)
+        funcs_attr = {}
+
+        def make_logging(self, pl_module, func_name,
+                         func_idx, on_steps=[], on_epochs=[], prob_bars=[]):
+            original_func_name = func_name[:]
+            self.funcs_called_count[original_func_name] += 1
+            product = [on_steps, on_epochs, prob_bars]
+            for idx, t in enumerate(list(itertools.product(*product))):
+                # run logging
+                func_name = original_func_name[:]
+                on_step, on_epoch, prog_bar = t
+                custom_func_name = f"{func_idx}_{idx}_{func_name}"
+
+                pl_module.log(custom_func_name, self.count * func_idx,
+                              on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar)
+
+                num_dl_ext = ''
+                if pl_module._current_dataloader_idx is not None:
+                    dl_idx = pl_module._current_dataloader_idx
+                    num_dl_ext = f"/dataloader_idx_{dl_idx}"
+                    func_name += num_dl_ext
+
+                # catch information for verification
+                self.callback_funcs_called[func_name].append([self.count * func_idx])
+                self.funcs_attr[custom_func_name + num_dl_ext] = {
+                    "on_step": on_step,
+                    "on_epoch": on_epoch,
+                    "prog_bar": prog_bar,
+                    "forked": on_step and on_epoch,
+                    "func_name": func_name}
+                if on_step and on_epoch:
+                    self.funcs_attr[f"{custom_func_name}_step" + num_dl_ext] = {
+                        "on_step": True,
+                        "on_epoch": False,
+                        "prog_bar": prog_bar,
+                        "forked": False,
+                        "func_name": func_name}
+
+                    self.funcs_attr[f"{custom_func_name}_epoch" + num_dl_ext] = {
+                        "on_step": False,
+                        "on_epoch": True,
+                        "prog_bar": prog_bar,
+                        "forked": False,
+                        "func_name": func_name}
+
+        def on_test_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_test_start', 1, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_epoch_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_epoch_start', 2, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_test_epoch_start(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_test_epoch_start', 3, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_test_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
+            self.make_logging(pl_module, 'on_test_batch_start', 4, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
+            self.make_logging(pl_module, 'on_test_batch_end', 5, on_steps=self.choices,
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+            # used to make sure aggregation works fine.
+            # we should obtain func[value * c for c in range(1, max_epochs * limit_test_batches)])
+            # with func = np.mean if on_epoch else func = np.max
+            self.count += 1
+
+        def on_epoch_end(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_epoch_end', 6, on_steps=[False],
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+        def on_test_epoch_end(self, trainer, pl_module):
+            self.make_logging(pl_module, 'on_test_epoch_end', 7, on_steps=[False],
+                              on_epochs=self.choices, prob_bars=self.choices)
+
+    max_epochs = 2
+    num_dataloaders = 2
+
+    class TestModel(BoringModel):
+
+        manual_mean = collections.defaultdict(list)
+
+        def test_step(self, batch, batch_idx, dataloader_idx=None):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('test_loss', loss)
+            self.manual_mean[str(dataloader_idx)].append(loss)
+
+        def test_dataloader(self):
+            return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)]
+
+    model = TestModel()
+    model.test_epoch_end = None
+    test_callback = TestCallback()
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=2,
+        limit_val_batches=0,
+        limit_test_batches=2,
+        val_check_interval=0.,
+        num_sanity_val_steps=0,
+        max_epochs=max_epochs,
+        callbacks=[test_callback],
+    )
+    trainer.fit(model)
+    trainer.test()
+
+    assert test_callback.funcs_called_count["on_test_start"] == 1
+    assert test_callback.funcs_called_count["on_epoch_start"] == 2
+    assert test_callback.funcs_called_count["on_test_epoch_start"] == 1
+    assert test_callback.funcs_called_count["on_test_batch_start"] == 4
+    assert test_callback.funcs_called_count["on_test_batch_end"] == 4
+    assert test_callback.funcs_called_count["on_epoch_end"] == 2
+    assert test_callback.funcs_called_count["on_test_epoch_end"] == 1
+
+    # Make sure the func_name exists within callback_metrics. If not, we missed some
+    callback_metrics_keys = [*trainer.callback_metrics.keys()]
+
+    for func_name in test_callback.callback_funcs_called.keys():
+        is_in = False
+        for callback_metrics_key in callback_metrics_keys:
+            if func_name in callback_metrics_key:
+                is_in = True
+        assert is_in, (func_name, callback_metrics_keys)
+
+    # function used to describe expected return logic
+    def get_expected_output(func_attr, original_values):
+        # Apply mean on values
+        if func_attr["on_epoch"] and not func_attr["on_step"]:
+            expected_output = np.mean(original_values)
+        else:
+            expected_output = np.max(original_values)
+        return expected_output
+
+    # Make sure the func_name output equals the average from all logged values when on_epoch true
+    # pop extra keys
+    assert "debug_epoch" in trainer.callback_metrics
+    trainer.callback_metrics.pop("debug_epoch")
+
+    for dl_idx in range(num_dataloaders):
+        key = f"test_loss/dataloader_idx_{dl_idx}"
+        assert key in trainer.callback_metrics
+        assert torch.stack(model.manual_mean[str(dl_idx)]).mean() == trainer.callback_metrics[key]
+        trainer.callback_metrics.pop(key)
+
+    for func_name, output_value in trainer.callback_metrics.items():
+        # not sure how to handle this now
+        if "epoch_1" in func_name:
+            func_name = '/'.join(func_name.split('/')[:-1])
+            continue
+
+        if torch.is_tensor(output_value):
+            output_value = output_value.item()
+
+        # get func attr
+        func_attr = test_callback.funcs_attr[func_name]
+
+        # retrived orginal logged values
+        original_values = test_callback.callback_funcs_called[func_attr["func_name"]]
+
+        # compute expected output and compare to actual one
+        expected_output = get_expected_output(func_attr, original_values)
+        assert float(output_value) == float(expected_output)
+
+    for func_name, func_attr in test_callback.funcs_attr.items():
+        if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]:
+            assert func_name in trainer.logger_connector.progress_bar_metrics
+        else:
+            assert func_name not in trainer.logger_connector.progress_bar_metrics

From 33470ba605201644254138b4d5288e86ae38ac14 Mon Sep 17 00:00:00 2001
From: Sean Naren <sean.narenthiran@gmail.com>
Date: Wed, 11 Nov 2020 22:04:05 +0000
Subject: [PATCH 04/11] Prevent crash if sync_dist=True on CPU (#4626)

* Added test/fix for sync_dist raising NotImplementedError

* Fixed comments/formatting

* Revert base class change, enforce sync tensors across accelerators, added GPU test
---
 .../accelerators/cpu_accelerator.py           | 10 ++-
 .../accelerators/gpu_accelerator.py           |  9 ++-
 .../accelerators/tpu_accelerator.py           | 10 ++-
 .../test_train_loop_logging_1_0.py            | 66 +++++++++++++++++++
 4 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py
index 083b5193ff8f30..66f9e4f0201b27 100644
--- a/pytorch_lightning/accelerators/cpu_accelerator.py
+++ b/pytorch_lightning/accelerators/cpu_accelerator.py
@@ -11,9 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Optional, Union, Any
+
 import torch
 
-from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.utilities import AMPType, rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 
@@ -80,3 +82,9 @@ def test_step(self, args):
         else:
             output = self.trainer.model.test_step(*args)
         return output
+
+    def sync_tensor(self,
+                    tensor: Union[torch.Tensor],
+                    group: Optional[Any] = None,
+                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
+        return tensor
diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py
index e66f5bcb8b48c1..1a52c4037c8d32 100644
--- a/pytorch_lightning/accelerators/gpu_accelerator.py
+++ b/pytorch_lightning/accelerators/gpu_accelerator.py
@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Union, Optional, Any
 
 import torch
 
-from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.distributed.dist import LightningDistributed
 
@@ -120,3 +121,9 @@ def to_device(self, batch):
         # be referenced from and if there are multiple optimizers the batch will
         # wind up copying it to the same device repeatedly.
         return self.batch_to_device(batch, gpu_id)
+
+    def sync_tensor(self,
+                    tensor: Union[torch.Tensor],
+                    group: Optional[Any] = None,
+                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
+        return tensor
diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py
index 5f4e6cc22cacd9..15386b133f8bd9 100644
--- a/pytorch_lightning/accelerators/tpu_accelerator.py
+++ b/pytorch_lightning/accelerators/tpu_accelerator.py
@@ -14,13 +14,13 @@
 import io
 import os
 import re
-from typing import Optional
+from typing import Optional, Union, Any
 
 import torch
 import torch.multiprocessing as mp
 
 from pytorch_lightning import _logger as log
-from pytorch_lightning.accelerators.accelerator import Accelerator
+from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import atomic_save
@@ -337,3 +337,9 @@ def broadcast(self, obj, src=0):
         buffer = io.BytesIO(data.cpu().byte().numpy())
         obj = torch.load(buffer)
         return obj
+
+    def sync_tensor(self,
+                    tensor: Union[torch.Tensor],
+                    group: Optional[Any] = None,
+                    reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
+        return tensor
diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
index 60ff33b402e4b3..cd8afd268cba81 100644
--- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
+++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
@@ -682,3 +682,69 @@ def get_expected_output(func_attr, original_values):
             assert func_name in trainer.logger_connector.progress_bar_metrics
         else:
             assert func_name not in trainer.logger_connector.progress_bar_metrics
+
+
+def test_logging_sync_dist_true_cpu(tmpdir):
+    """
+    Tests to ensure that the sync_dist flag works with CPU (should just return the original value)
+    """
+    fake_result = 1
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            acc = self.step(batch[0])
+            self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
+            return acc
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
+            return {"x": loss}
+
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=2,
+        weights_summary=None,
+    )
+    trainer.fit(model)
+
+    assert trainer.logged_metrics['foo'] == fake_result
+    assert trainer.logged_metrics['bar'] == fake_result
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+def test_logging_sync_dist_true_gpu(tmpdir):
+    """
+    Tests to ensure that the sync_dist flag works with GPU (should just return the original value)
+    """
+    fake_result = 1
+
+    class TestModel(BoringModel):
+        def training_step(self, batch, batch_idx):
+            acc = self.step(batch[0])
+            self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
+            return acc
+
+        def validation_step(self, batch, batch_idx):
+            output = self.layer(batch)
+            loss = self.loss(batch, output)
+            self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
+            return {"x": loss}
+
+    model = TestModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=2,
+        gpus=1,
+        weights_summary=None,
+    )
+    trainer.fit(model)
+
+    assert trainer.logged_metrics['foo'] == fake_result
+    assert trainer.logged_metrics['bar'] == fake_result

From bff99ee159c207e1e388966b3b0440f6779b90a8 Mon Sep 17 00:00:00 2001
From: Marc Ferradou <marc.ferradou@turner.com>
Date: Thu, 12 Nov 2020 02:59:33 -0500
Subject: [PATCH 05/11] Small typo correction on CONTRIBUTING.md (#4625)

* Update CONTRIBUTING.md

Small typo correction.

* Update .github/CONTRIBUTING.md

Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>

Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
Co-authored-by: Roger Shieh <sh.rog@protonmail.ch>
---
 .github/CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index bed729e02baeb1..868507df405205 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -112,8 +112,8 @@ In case you adding new dependencies, make sure that they are compatible with the
 
 ### Coding Style
 
-1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!`, name);
-2. Black code formatter is used using `pre-commit` hook.
+1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!", name)`.
+2. Black code formatter is used using a `pre-commit` hook.
 
 ### Documentation
 

From 396a18eb78568c8676947de25e79525ef9a56512 Mon Sep 17 00:00:00 2001
From: Jirka Borovec <Borda@users.noreply.github.com>
Date: Thu, 12 Nov 2020 09:21:57 +0100
Subject: [PATCH 06/11] update changelog after 1.0.6 (#4624)

* update changelog after 1.0.6

* fix formatting
---
 CHANGELOG.md | 61 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7da839b74e3432..fbf7a074a0ed53 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
-## [unreleased] - YYYY-MM-DD
+## [unreleased.Features] - YYYY-MM-DD
 
 ### Added
 
@@ -27,26 +27,39 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added option to log momentum ([#4384](https://github.com/PyTorchLightning/pytorch-lightning/pull/4384))
 
 
-- Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458))
+- Added logging using `self.log` in train and evaluation for most callbacks and model hooks (
+    [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552),
+    [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495),
+    [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439))
 
 
-- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775))
+### Changed
 
+- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903))
 
-- Added logging using `self.log` in train and evaluation for most callbacks and model hooks (
-    [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552),
-    [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495),
-    [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)
-)
 
-- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
+### Deprecated
+
+
+
+### Removed
+
+
+
+### Fixed
+
+
+
+
+## [unreleased.BugFix] - YYYY-MM-DD
+
+### Added
 
-- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
 
 
 ### Changed
 
-- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903))
+
 
 ### Deprecated
 
@@ -58,13 +71,35 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- Fixed feature-lack in hpc load ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526))
 
 
-- Fixed metrics states being overridden in ddp mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
+## [1.0.6] - 2020-11-11
 
+### Added
+
+- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) 
+- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
+- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
+- Added congratulations at the end of our notebooks ([#4555](https://github.com/PyTorchLightning/pytorch-lightning/pull/4555))
+
+### Changed
+
+- Changed `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458))
+- Unify SLURM/TorchElastic under backend plugin ([#4578](https://github.com/PyTorchLightning/pytorch-lightning/pull/4578),
+        [#4580](https://github.com/PyTorchLightning/pytorch-lightning/pull/4580),
+        [#4581](https://github.com/PyTorchLightning/pytorch-lightning/pull/4581),
+        [#4582](https://github.com/PyTorchLightning/pytorch-lightning/pull/4582),
+        [#4583](https://github.com/PyTorchLightning/pytorch-lightning/pull/4583))
+
+### Fixed
 
+- Fixed feature-lack in `hpc_load` ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526))
+- Fixed metrics states being overridden in DDP mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
 - Fixed `lightning_getattr`, `lightning_hasattr` not finding the correct attributes in datamodule ([#4347](https://github.com/PyTorchLightning/pytorch-lightning/pull/4347))
+- Fixed automatic optimization AMP by `manual_optimization_step` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
+- Replace `MisconfigurationException` with warning in `ModelCheckpoint` Callback ([#4560](https://github.com/PyTorchLightning/pytorch-lightning/pull/4560))
+- Fixed logged keys in mlflow logger ([#4412](https://github.com/PyTorchLightning/pytorch-lightning/pull/4412))
+- Fixed `is_picklable` by catching `AttributeError` ([#4508](https://github.com/PyTorchLightning/pytorch-lightning/pull/4508))
 
 
 ## [1.0.5] - 2020-11-03

From 79fc92647c91b6ac222c0709bf168a6a8d304529 Mon Sep 17 00:00:00 2001
From: Jeff Yang <ydcjeff@outlook.com>
Date: Thu, 12 Nov 2020 15:55:31 +0630
Subject: [PATCH 07/11] [make] Create Makefile (#4620)

* [make] Create Makefile

* exclude makefile

* contributing info

* rm .run_local_test.sh
---
 .github/CONTRIBUTING.md |  4 ++--
 .run_local_tests.sh     | 14 --------------
 MANIFEST.in             |  3 +++
 Makefile                | 15 +++++++++++++++
 4 files changed, 20 insertions(+), 16 deletions(-)
 delete mode 100644 .run_local_tests.sh
 create mode 100644 Makefile

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 868507df405205..c37890e9bfda2d 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -182,10 +182,10 @@ python -m pip install ".[dev, examples]"
 python -m pip install pre-commit
 ```
 
-You can run the full test-case in your terminal via this bash script:
+You can run the full test-case in your terminal via this make script:
 
 ```bash
-bash .run_local_tests.sh
+make test
 ```
 
 Note: if your computer does not have multi-GPU nor TPU these tests are skipped.
diff --git a/.run_local_tests.sh b/.run_local_tests.sh
deleted file mode 100644
index 0b4723828888ef..00000000000000
--- a/.run_local_tests.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-
-# install APEX, see https://github.com/NVIDIA/apex#linux
-# to imitate SLURM set only single node
-export SLURM_LOCALID=0
-
-# use this to run tests
-rm -rf _ckpt_*
-rm -rf ./lightning_logs
-python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --flake8
-python -m coverage report -m
-
-# specific file
-# python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k
diff --git a/MANIFEST.in b/MANIFEST.in
index ac8ba7cc625aa7..4cedbd0f44e9e0 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -57,6 +57,9 @@ exclude *.yaml
 # Exclude pyright config
 exclude .pyrightconfig.json
 
+# Exclude Makefile
+exclude Makefile
+
 prune .git
 prune .github
 prune .circleci
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000000000..76e8bac4e3748a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,15 @@
+.PHONY: test
+
+test:
+	# install APEX, see https://github.com/NVIDIA/apex#linux
+	# to imitate SLURM set only single node
+	export SLURM_LOCALID=0
+
+	# use this to run tests
+	rm -rf _ckpt_*
+	rm -rf ./lightning_logs
+	python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --flake8
+	python -m coverage report -m
+
+	# specific file
+	# python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k

From 35f00df176f1e345e49924f55e019cb33403aa0b Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Thu, 12 Nov 2020 11:48:54 +0000
Subject: [PATCH 08/11] [FEAT] Add pytest section to Contribution how to ? 
 (#4633)

* update contributing

* formatting
---
 .github/CONTRIBUTING.md | 51 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index c37890e9bfda2d..b202af9068280d 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -276,6 +276,57 @@ We welcome any useful contribution! For your convenience here's a recommended wo
    git push -f
    ```
 
+4. **How to add new tests**
+
+We are using [pytest](https://docs.pytest.org/en/stable/) in Pytorch Lightning.
+
+Here are tutorials:
+* (recommended) [Visual Testing with pytest](https://www.youtube.com/playlist?list=PLCTHcU1KoD99Rim2tzg-IhYY2iu9FFvNo) from JetBrains on YouTube
+* [Effective Python Testing With Pytest](https://realpython.com/pytest-python-testing/) article on realpython.com
+
+Here is the process to create a new test
+
+* 0. Optional: Follow tutorials !
+* 1. Find a file in tests/ which match what you want to test. If none, create one.
+* 2. Use this template to get started !
+* 3. Use `BoringModel and derivates to test out your code`.
+
+```python
+# TEST SHOULD BE IN YOUR FILE: tests/..../...py
+
+# RUN OUR TEST WITH: pytest tests/..../...py::test_explain_what_is_being_tested --verbose --color=yes --capture=no
+
+# TEST CODE TEMPLATE
+
+# pytest decorator
+# @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
+def test_explain_what_is_being_tested(tmpdir):
+    """
+    Test description about text reason to be
+    """
+
+    # os.environ["PL_DEV_DEBUG"] = '1' optional. When activated, you can use internal trainer.dev_debugger
+
+    class ExtendedModel(BoringModel):
+        ...
+
+    model = ExtendedModel()
+
+    # BoringModel is a functional model. You might want to set methods to None to test your behaviour
+    # Example: model.training_step_end = None
+
+    trainer = Trainer(
+        default_root_dir=tmpdir, # will save everything within a tmpdir generated for this test
+        ...
+    )
+    trainer.fit(model)
+    result = trainer.test()
+
+    # assert the behaviour is correct.
+    assert ...
+    assert ...
+```
+
 ### Bonus Workflow Tip
 
 If you don't want to remember all the commands above every time you want to push some code/setup a Lightning Dev environment on a new VM, you can set up bash aliases for some common commands. You can add these to one of your `~/.bashrc`, `~/.zshrc`, or `~/.bash_aliases` files.

From bd6c413829a8bfedfa1c4237d77687ad7a6d4dba Mon Sep 17 00:00:00 2001
From: Jirka Borovec <Borda@users.noreply.github.com>
Date: Thu, 12 Nov 2020 15:03:43 +0100
Subject: [PATCH 09/11] Conda: PT 1.8 (#3833)

* PT 1.8

* unfreeze PT

* drop nightly from full

* add PT 1.8 to workflow

* readme table

* cuda

* skip cuda

* test 1.8

* unfreeze torch vision

Co-authored-by: ydcjeff <ydcjeff@outlook.com>
Co-authored-by: chaton <thomas@grid.ai>
Co-authored-by: Nicki Skafte <skaftenicki@gmail.com>
---
 .github/workflows/ci_dockers.yml    |  8 +++++---
 .github/workflows/ci_test-conda.yml |  2 +-
 .github/workflows/nightly.yml       |  6 ++++--
 README.md                           | 16 ++++++++--------
 dockers/base-conda/Dockerfile       |  8 ++++----
 dockers/base-cuda/Dockerfile        |  7 ++++++-
 requirements.txt                    |  2 +-
 requirements/examples.txt           |  2 +-
 8 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index ef410a9afd9a85..0a7f326dfce8e3 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -66,6 +66,9 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          # todo: see notes in Dockerfile
+          #- python_version: 3.7
+          #  pytorch_version: 1.8
           - python_version: 3.8
             pytorch_version: 1.7
           - python_version: 3.7
@@ -110,9 +113,8 @@ jobs:
             pytorch_version: 1.4
           - python_version: 3.7
             pytorch_version: 1.7
-          # TODO
-          # - python_version: 3.7
-          #   pytorch_version: 1.8
+          - python_version: 3.7
+            pytorch_version: 1.8
     steps:
       - name: Checkout
         uses: actions/checkout@v2
diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml
index c86806785880bd..f5e5e6470e5508 100644
--- a/.github/workflows/ci_test-conda.yml
+++ b/.github/workflows/ci_test-conda.yml
@@ -16,7 +16,7 @@ jobs:
       matrix:
         # os: [ubuntu-20.04]
         python-version: [3.7]
-        pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7]
+        pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7, 1.8]
 
     # Timeout: https://stackoverflow.com/a/59076067/4521646
     timeout-minutes: 35
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1395b7ede4b1df..7d4490f435ee85 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -70,13 +70,13 @@ jobs:
           tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}
         timeout-minutes: 55
 
-  docker-cuda:
+  docker-cuda-conda:
     runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
         python_version: [3.6, 3.7, 3.8]
-        pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7]
+        pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7, 1.8]
         exclude:
           # excludes PT 1.3 as it is missing on pypi
           - python_version: 3.8
@@ -104,6 +104,8 @@ jobs:
         id: extend
 
       - name: Publish CUDA to Docker Hub
+        # ToDo: extend also building for Nightly from pip
+        if: matrix.pytorch_version < 1.8
         # publish master/release
         uses: docker/build-push-action@v2
         with:
diff --git a/README.md b/README.md
index 30079df9317590..e7ecfc1a45126c 100644
--- a/README.md
+++ b/README.md
@@ -89,14 +89,14 @@ Lightning can automatically export to ONNX or TorchScript for those cases.
 ## Continuous Integration
 <center>
 
-| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) |
-| :---: | :---: | :---: | :---: | :---: | :---: |
-| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
-| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - |
-| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - |
-| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
-| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
-| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
+| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | 1.8 (nightly) |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
+| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - |
+| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | - |
+| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
+| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
+| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
 
 - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_
 - _\** tests run on two NVIDIA K80_
diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile
index ea8c6bc5d001da..09c675de3068e2 100644
--- a/dockers/base-conda/Dockerfile
+++ b/dockers/base-conda/Dockerfile
@@ -99,15 +99,15 @@ COPY ./requirements/test.txt requirements-test.txt
 RUN \
     # Disable cache
     pip config set global.cache-dir false && \
-    #echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
-    #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
-    #source ~/.bashrc && \
     # Install remaining requirements
     pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \
     pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \
-    pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \
     rm requirements*
 
+RUN \
+    # install DALI, needed for examples
+    pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0
+
 RUN \
     # install NVIDIA AMP
     git clone https://github.com/NVIDIA/apex && \
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index f886ccc30be7a9..2557e855cf3eed 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -93,10 +93,15 @@ RUN \
     python -c "import re ; fname = 'requirements.txt' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch==${PYTORCH_VERSION}.*', open(fname).read()) ; open(fname, 'w').write(req)" && \
 
     # Install all requirements
+    # todo: find a way how to install nightly PT version
+    #  --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${cuda_ver[0]}${cuda_ver[1]}/torch_nightly.html
     pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \
-    pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \
     rm -rf requirements*
 
+RUN \
+    # install DALI, needed for examples
+    pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0
+
 RUN \
     # install NVIDIA AMP
     git clone https://github.com/NVIDIA/apex && \
diff --git a/requirements.txt b/requirements.txt
index d270e2bc5d8543..0f8423e0860f02 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # the default package dependencies
 
 numpy>=1.16.4
-torch>=1.3,<1.8
+torch>=1.3
 future>=0.17.1  # required for builtins in setup.py
 # pyyaml>=3.13
 PyYAML>=5.1  # OmegaConf requirement >=5.1
diff --git a/requirements/examples.txt b/requirements/examples.txt
index 0afa62f9ffa95e..6e48778cb222a0 100644
--- a/requirements/examples.txt
+++ b/requirements/examples.txt
@@ -1,2 +1,2 @@
-torchvision>=0.4.1,<0.9.0
+torchvision>=0.4.1
 gym>=0.17.0

From 4a01fd048cebb65405021d3f24ae4dc07cb735e6 Mon Sep 17 00:00:00 2001
From: chaton <thomas@grid.ai>
Date: Thu, 12 Nov 2020 15:59:01 +0000
Subject: [PATCH 10/11] [FIX] Average Pbar Metrics (#4534)

* wip

* update

* normalize loss

* update test

* resolve bug

* update test and add TODO

* make sure it can be sync

* add TODO

* update sol
---
 pytorch_lightning/core/step_result.py         |  3 ++
 .../test_trainer_steps_scalar_return.py       | 50 ++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index 8f8a517d544f01..12f1b57f836f27 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -134,6 +134,9 @@ def log(
         # sync across workers when using distributed training
         sync_fn = sync_fn or sync_ddp_if_available
         if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)):
+            is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized()
+            # TODO: Find a way to make the reduction only once, so we don't need to clone.
+            value = value.clone() if is_dist_initialized else value
             value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)
 
         if 'meta' not in self:
diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py
index 2a66f743a49ef8..b85646e1c290f0 100644
--- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py
+++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py
@@ -14,11 +14,13 @@
 """
 Tests to ensure that the training loop works with a scalar
 """
-import torch
 import os
+import torch
+import pytest
 
 from pytorch_lightning import Trainer
 from tests.base.deterministic_model import DeterministicModel
+from tests.base import BoringModel
 
 
 def test_training_step_scalar(tmpdir):
@@ -190,3 +192,49 @@ def test_train_step_epoch_end_scalar(tmpdir):
     opt_closure_result = trainer.train_loop.training_step_and_backward(
         batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens)
     assert opt_closure_result['loss'].item() == 171
+
+
+class DPPReduceMeanPbarModel(BoringModel):
+
+    logged = []
+
+    def training_step(self, batch, batch_idx):
+        output = self.layer(batch)
+        loss = self.loss(batch, output)
+        loss /= loss.clone().detach()
+        self.log('self_log', loss, prog_bar=True, sync_dist=True)
+        return {"loss": loss, "progress_bar":{"loss_2": loss}}
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_dpp_reduce_mean_pbar(tmpdir):
+    os.environ['PL_DEV_DEBUG'] = '1'
+
+    model = DPPReduceMeanPbarModel()
+    model.training_step_end = None
+    model.training_epoch_end = None
+
+    distributed_backend = "ddp_spawn"
+
+    trainer = Trainer(
+        max_epochs=1,
+        default_root_dir=os.getcwd(),
+        limit_train_batches=10,
+        limit_test_batches=2,
+        limit_val_batches=2,
+        distributed_backend=distributed_backend,
+        gpus=2,
+        precision=32)
+
+    trainer.fit(model)
+
+    # TODO: Move this test to DDP. pbar_added_metrics is empty with ddp_spawn for some reasons
+
+    pbar_added_metrics = trainer.dev_debugger.pbar_added_metrics
+    is_in = False
+    for pbar_metrics in pbar_added_metrics:
+        if 'loss_2' in pbar_metrics:
+            is_in = True
+            assert pbar_metrics["loss_2"].item() == 1
+    if distributed_backend == "ddp":
+        assert is_in is True

From bacabaebaf16b0492cf9090b75238215c2c19de5 Mon Sep 17 00:00:00 2001
From: Sean Naren <sean.narenthiran@gmail.com>
Date: Thu, 12 Nov 2020 17:18:09 +0000
Subject: [PATCH 11/11] Sharded Accelerator 1/n: Expose clip gradients to
 plugins via abstract class (#4639)

* Added abstract precision plugin to expose clip_gradients function, use within accelerator to clip gradients

* Exclude model from override, keep optimizer (needed for sharded clip gradients), add override for O2 support apex

* Fix doc

* Applied codereview changes

* Refactored clip function to encapsulate tpu changes with tpu accelerator. Default to standard clip function for vanilla torch

* Pass correct grad clip val

* Moved var to property

* Apply code review suggestions
---
 pytorch_lightning/accelerators/accelerator.py | 52 ++++---------------
 .../accelerators/tpu_accelerator.py           | 31 +++++++++--
 pytorch_lightning/plugins/apex.py             | 39 +++++++++++++-
 pytorch_lightning/plugins/native_amp.py       | 10 +++-
 pytorch_lightning/plugins/precision_plugin.py | 38 ++++++++++++++
 5 files changed, 120 insertions(+), 50 deletions(-)
 create mode 100644 pytorch_lightning/plugins/precision_plugin.py

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 3b762e08ed5e6d..a0d8f6f21a2f7a 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -12,33 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import math
 from enum import Enum
 from typing import Any, Optional, Union
 
 import torch
+from torch.optim import Optimizer
 
-from pytorch_lightning.utilities import AMPType, rank_zero_warn
+from pytorch_lightning.utilities import AMPType
 from pytorch_lightning.utilities.apply_func import move_data_to_device
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.parsing import AttributeDict
 import torch.distributed as torch_distrib
 from pytorch_lightning import _logger as log
 
-try:
-    from apex import amp
-except ImportError:
-    amp = None
-
 if torch.distributed.is_available():
     from torch.distributed import ReduceOp
 else:
     class ReduceOp:
         SUM = None
 
-EPSILON = 1e-6
-EPSILON_FP16 = 1e-5
-
 
 class Accelerator(object):
 
@@ -139,48 +131,22 @@ def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx):
         model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx)
 
     def clip_gradients(self, optimizer, clip_val=None):
-        # TODO: separate TPU case from here
-        self._clip_gradients(optimizer, clip_val)
-
-    def _clip_gradients(self, optimizer, clip_val=None):
         # use the trainer's clip val if none passed
         grad_clip_val = self.trainer.gradient_clip_val
         if clip_val is not None:
             grad_clip_val = clip_val
         grad_clip_val = float(grad_clip_val)
 
-        # this code is a modification of torch.nn.utils.clip_grad_norm_
-        # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
         if grad_clip_val <= 0:
             return
+        self._clip_gradients(optimizer, grad_clip_val)
 
-        model = self.trainer.get_model()
-        if self.trainer.amp_backend == AMPType.APEX:
-            parameters = amp.master_params(optimizer)
+    def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0):
+        if self.trainer.amp_backend:
+            self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type)
         else:
-            parameters = model.parameters()
-
-        max_norm = grad_clip_val
-        norm_type = float(2.0)
-
-        if isinstance(parameters, torch.Tensor):
-            parameters = [parameters]
-        parameters = list(filter(lambda p: p.grad is not None, parameters))
-
-        if norm_type == math.inf:
-            total_norm = max(p.grad.data.abs().max() for p in parameters)
-        else:
-            device = parameters[0].device
-            out = torch.empty(len(parameters), device=device)
-            for i, p in enumerate(parameters):
-                torch.norm(p.grad.data.to(device), norm_type, out=out[i])
-            total_norm = torch.norm(out, norm_type)
-
-        eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON
-        clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps)
-        clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
-        for p in parameters:
-            p.grad.data.mul_(clip_coef.to(p.grad.data.device))
+            model = self.trainer.get_model()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type)
 
     def on_train_epoch_end(self, outputs):
         pass
@@ -201,7 +167,7 @@ def setup_optimizers(self, model):
         self.trainer.optimizer_frequencies = optimizer_frequencies
 
     def init_ddp_connection(
-        self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True
+            self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True
     ) -> None:
         os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address())
         os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py
index 15386b133f8bd9..54ee57b74a16a8 100644
--- a/pytorch_lightning/accelerators/tpu_accelerator.py
+++ b/pytorch_lightning/accelerators/tpu_accelerator.py
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
+import math
 import os
 import re
 from typing import Optional, Union, Any
 
 import torch
 import torch.multiprocessing as mp
+from torch.optim import Optimizer
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
@@ -261,10 +263,27 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure):
             using_lbfgs=is_lbfgs
         )
 
-    def clip_gradients(self, optimizer, clip_val=None):
-        # apply clip gradients
-        # TODO: separate TPU case from here
-        self._clip_gradients(optimizer, clip_val)
+    def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0):
+        # this code is a modification of torch.nn.utils.clip_grad_norm_
+        # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
+        model = self.trainer.get_model()
+        parameters = model.parameters()
+        max_norm = grad_clip_val
+
+        if isinstance(parameters, torch.Tensor):
+            parameters = [parameters]
+        parameters = list(filter(lambda p: p.grad is not None, parameters))
+
+        device = parameters[0].device
+        out = torch.empty(len(parameters), device=device)
+        for i, p in enumerate(parameters):
+            torch.norm(p.grad.data.to(device), norm_type, out=out[i])
+        total_norm = torch.norm(out, norm_type)
+
+        clip_coef = torch.tensor(max_norm, device=device) / (total_norm + self.norm_clipping_epsilon)
+        clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef))
+        for p in parameters:
+            p.grad.data.mul_(clip_coef.to(p.grad.data.device))
 
     def barrier(self, name: Optional[str] = None):
         torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}")
@@ -343,3 +362,7 @@ def sync_tensor(self,
                     group: Optional[Any] = None,
                     reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
         return tensor
+
+    @property
+    def norm_clipping_epsilon(self):
+        return 1e-6
diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py
index 0c8665e3719f35..654f7202fb9d1b 100644
--- a/pytorch_lightning/plugins/apex.py
+++ b/pytorch_lightning/plugins/apex.py
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Tuple
+import math
+from typing import List, Tuple, Union
 
+import torch
 from torch.optim.optimizer import Optimizer
 
 from pytorch_lightning.core.lightning import LightningModule
+from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin
 from pytorch_lightning.utilities.distributed import rank_zero_warn
 from pytorch_lightning.utilities import AMPType
 
@@ -25,7 +28,7 @@
     amp = None
 
 
-class ApexPlugin:
+class ApexPlugin(PrecisionPlugin):
 
     def __init__(self, trainer=None):
         self.trainer = trainer
@@ -98,3 +101,35 @@ def configure_apex(self, amp, model, optimizers, amp_level):
         """
         model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level)
         return model, optimizers
+
+    def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float):
+        """
+        This code is a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights.
+        This is important when setting amp_level to O2, and the master weights are in fp16.
+        Args:
+            grad_clip_val: Maximum norm of gradients.
+            optimizer: Optimizer with gradients that will be clipped.
+            norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for
+            infinity norm.
+        """
+        model = self.trainer.get_model()
+        parameters = model.parameters()
+        max_norm = float(grad_clip_val)
+
+        if isinstance(parameters, torch.Tensor):
+            parameters = [parameters]
+        parameters = [p for p in parameters if p.grad is not None]
+
+        if len(parameters) == 0:
+            return torch.tensor(0.)
+        device = parameters[0].grad.device
+        total_norm = torch.norm(
+            torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type)
+        clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon)
+        if clip_coef < 1:
+            for p in parameters:
+                p.grad.detach().mul_(clip_coef.to(p.grad.device))
+
+    @property
+    def norm_clipping_epsilon(self):
+        return 1e-5
diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py
index 98bc8dfc87d25a..1a6649986132cb 100644
--- a/pytorch_lightning/plugins/native_amp.py
+++ b/pytorch_lightning/plugins/native_amp.py
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Union
 
 import torch
+from torch.optim import Optimizer
 
+from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin
 
-class NativeAMPPlugin:
+
+class NativeAMPPlugin(PrecisionPlugin):
 
     def __init__(self, trainer=None):
         """
@@ -51,3 +55,7 @@ def training_step(self, fx, args):
         with torch.cuda.amp.autocast():
             output = fx(*args)
         return output
+
+    def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float):
+        model = self.trainer.get_model()
+        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type)
diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py
new file mode 100644
index 00000000000000..0102f677391ff9
--- /dev/null
+++ b/pytorch_lightning/plugins/precision_plugin.py
@@ -0,0 +1,38 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import abc
+from typing import Union
+
+from torch.optim import Optimizer
+
+
+class PrecisionPlugin(abc.ABC):
+    """
+    Abstract class to extend for precision support (32/16 etc).
+
+    This is extended to cover any specific logic required for precision support such as AMP/APEX or sharded
+    training.
+    """
+
+    def connect(self, model, optimizers):
+        raise NotImplementedError
+
+    def training_step(self, fx, args):
+        raise NotImplementedError
+
+    def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
+        raise NotImplementedError
+
+    def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float):
+        raise NotImplementedError