From a5f2e6b20858dcba011e9631e9a1782b068cce99 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 01:08:07 +0200
Subject: [PATCH 01/39] Parametrize fit hook test with different precision
 plugins

---
 tests/models/test_hooks.py | 39 ++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 37e4867c7b6b9..d7a962b1a708c 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -283,7 +283,8 @@ def test_epoch_end(self, *args, **kwargs):
         pass
 
     @staticmethod
-    def _train_batch(trainer, model, batches):
+    def _train_batch(trainer, model, batches, device=torch.device('cpu'), **kwargs):
+        using_native_amp = kwargs.get('amp_backend') == 'native'
         out = []
         for i in range(batches):
             out.extend([
@@ -292,7 +293,7 @@ def _train_batch(trainer, model, batches):
                 dict(name='Callback.on_train_batch_start', args=(trainer, model, ANY, i, 0)),
                 dict(name='on_train_batch_start', args=(ANY, i, 0)),
                 dict(name='on_before_batch_transfer', args=(ANY, None)),
-                dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)),
+                dict(name='transfer_batch_to_device', args=(ANY, device, None)),
                 dict(name='on_after_batch_transfer', args=(ANY, None)),
                 dict(name='forward', args=(ANY, )),
                 dict(name='training_step', args=(ANY, i)),
@@ -308,7 +309,7 @@ def _train_batch(trainer, model, batches):
                 dict(
                     name='optimizer_step',
                     args=(0, i, ANY, 0, ANY),
-                    kwargs=dict(on_tpu=False, using_lbfgs=False, using_native_amp=False)
+                    kwargs=dict(on_tpu=False, using_lbfgs=False, using_native_amp=using_native_amp)
                 ),
                 dict(name='Callback.on_train_batch_end', args=(trainer, model, dict(loss=ANY), ANY, i, 0)),
                 dict(name='on_train_batch_end', args=(dict(loss=ANY), ANY, i, 0)),
@@ -317,14 +318,14 @@ def _train_batch(trainer, model, batches):
         return out
 
     @staticmethod
-    def _eval_epoch(fn, trainer, model, batches, key):
+    def _eval_epoch(fn, trainer, model, batches, key, device=torch.device('cpu')):
         outputs = {key: ANY}
         return [
             dict(name='Callback.on_epoch_start', args=(trainer, model)),
             dict(name='on_epoch_start'),
             dict(name=f'Callback.on_{fn}_epoch_start', args=(trainer, model)),
             dict(name=f'on_{fn}_epoch_start'),
-            *HookedModel._eval_batch(fn, trainer, model, batches, key),
+            *HookedModel._eval_batch(fn, trainer, model, batches, key, device=device),
             dict(name=f'{fn}_epoch_end', args=([outputs] * batches, )),
             dict(name=f'Callback.on_{fn}_epoch_end', args=(trainer, model)),
             dict(name=f'on_{fn}_epoch_end'),
@@ -333,7 +334,7 @@ def _eval_epoch(fn, trainer, model, batches, key):
         ]
 
     @staticmethod
-    def _eval_batch(fn, trainer, model, batches, key):
+    def _eval_batch(fn, trainer, model, batches, key, device=torch.device('cpu')):
         out = []
         outputs = {key: ANY}
         for i in range(batches):
@@ -342,7 +343,7 @@ def _eval_batch(fn, trainer, model, batches, key):
                 dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)),
                 dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)),
                 dict(name='on_before_batch_transfer', args=(ANY, None)),
-                dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)),
+                dict(name='transfer_batch_to_device', args=(ANY, device, None)),
                 dict(name='on_after_batch_transfer', args=(ANY, None)),
                 dict(name='forward', args=(ANY, )),
                 dict(name=f'{fn}_step', args=(ANY, i)),
@@ -372,7 +373,17 @@ def _predict_batch(trainer, model, batches):
         return out
 
 
-def test_trainer_model_hook_system_fit(tmpdir):
+@pytest.mark.parametrize(
+    'kwargs',
+    [
+        {},
+        # these precision plugins modify the optimization flow, so testing them explicitly
+        pytest.param(dict(gpus=1, precision=16, plugins='deepspeed'), marks=RunIf(deepspeed=True, min_gpus=1)),
+        pytest.param(dict(gpus=1, precision=16, amp_backend='native'), marks=RunIf(amp_native=True, min_gpus=1)),
+        pytest.param(dict(gpus=1, precision=16, amp_backend='apex'), marks=RunIf(amp_apex=True, min_gpus=1)),
+    ]
+)
+def test_trainer_model_hook_system_fit(tmpdir, kwargs):
     called = []
     model = HookedModel(called)
     callback = HookedCallback(called)
@@ -385,7 +396,8 @@ def test_trainer_model_hook_system_fit(tmpdir):
         limit_val_batches=val_batches,
         progress_bar_refresh_rate=0,
         weights_summary=None,
-        callbacks=[callback]
+        callbacks=[callback],
+        **kwargs,
     )
     assert called == [
         dict(name='Callback.on_init_start', args=(trainer, )),
@@ -401,6 +413,9 @@ def test_trainer_model_hook_system_fit(tmpdir):
         'pytorch-lightning_version': __version__,
         'state_dict': ANY,
     }
+    if kwargs.get('amp_backend') == 'native':
+        saved_ckpt['native_amp_scaling_state'] = ANY
+    device = torch.device('cuda:0' if 'gpus' in kwargs else 'cpu')
     expected = [
         dict(name='Callback.on_init_start', args=(trainer, )),
         dict(name='Callback.on_init_end', args=(trainer, )),
@@ -426,7 +441,7 @@ def test_trainer_model_hook_system_fit(tmpdir):
         dict(name='zero_grad'),
         dict(name='Callback.on_validation_start', args=(trainer, model)),
         dict(name='on_validation_start'),
-        *model._eval_epoch('validation', trainer, model, val_batches, 'x'),
+        *model._eval_epoch('validation', trainer, model, val_batches, 'x', device=device),
         dict(name='Callback.on_validation_end', args=(trainer, model)),
         dict(name='on_validation_end'),
         dict(name='train'),
@@ -442,13 +457,13 @@ def test_trainer_model_hook_system_fit(tmpdir):
         dict(name='on_epoch_start'),
         dict(name='Callback.on_train_epoch_start', args=(trainer, model)),
         dict(name='on_train_epoch_start'),
-        *model._train_batch(trainer, model, train_batches),
+        *model._train_batch(trainer, model, train_batches, device=device, **kwargs),
         dict(name='train', args=(False, )),
         dict(name='on_validation_model_eval'),
         dict(name='zero_grad'),
         dict(name='Callback.on_validation_start', args=(trainer, model)),
         dict(name='on_validation_start'),
-        *model._eval_epoch('validation', trainer, model, val_batches, 'x'),
+        *model._eval_epoch('validation', trainer, model, val_batches, 'x', device=device),
         dict(name='Callback.on_validation_end', args=(trainer, model)),
         # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_validation_end`
         dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)),

From 0ce2295bed423a97dfaae34173d0d1d56c271e01 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 02:07:25 +0200
Subject: [PATCH 02/39] Fix fit hook test expectations for DeepSpeed and apex

---
 tests/models/test_hooks.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index d7a962b1a708c..c8b2a79f893e7 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -265,6 +265,10 @@ def call(hook, fn, *args, **kwargs):
             d = {'name': hook}
             if args:
                 d['args'] = args
+            elif hook == 'train':
+                # DeepSpeed calls `train(mode)` but we do not. Standardize
+                # https://github.com/microsoft/DeepSpeed/pull/571
+                d['args'] = (True, )
             if kwargs:
                 d['kwargs'] = kwargs
             called.append(d)
@@ -302,7 +306,7 @@ def _train_batch(trainer, model, batches, device=torch.device('cpu'), **kwargs):
                 dict(name='on_before_zero_grad', args=(ANY, )),
                 dict(name='optimizer_zero_grad', args=(0, i, ANY, 0)),
                 # TODO: `on_before_backward`
-                dict(name='backward', args=(ANY, ANY, 0)),
+                *([dict(name='backward', args=(ANY, ANY, 0))] if kwargs.get('plugins') != 'deepspeed' else []),
                 dict(name='Callback.on_after_backward', args=(trainer, model)),
                 dict(name='on_after_backward'),
                 # TODO: `on_before_optimizer_step`
@@ -399,11 +403,14 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs):
         callbacks=[callback],
         **kwargs,
     )
+
     assert called == [
         dict(name='Callback.on_init_start', args=(trainer, )),
         dict(name='Callback.on_init_end', args=(trainer, )),
     ]
+
     trainer.fit(model)
+
     saved_ckpt = {
         'callbacks': ANY,
         'epoch': 1,
@@ -415,20 +422,28 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs):
     }
     if kwargs.get('amp_backend') == 'native':
         saved_ckpt['native_amp_scaling_state'] = ANY
+    elif kwargs.get('amp_backend') == 'apex':
+        saved_ckpt['amp_scaling_state'] = ANY
     device = torch.device('cuda:0' if 'gpus' in kwargs else 'cpu')
+
     expected = [
         dict(name='Callback.on_init_start', args=(trainer, )),
         dict(name='Callback.on_init_end', args=(trainer, )),
         dict(name='prepare_data'),
         dict(name='configure_callbacks'),
         dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)),
+        # FIXME
+        *([dict(name='train_dataloader')] if kwargs.get('plugins') == 'deepspeed' else []),
         dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='fit')),
         dict(name='setup', kwargs=dict(stage='fit')),
         dict(name='configure_sharded_model'),
         dict(name='Callback.on_configure_sharded_model', args=(trainer, model)),
-        dict(name='configure_optimizers'),
+        # FIXME
+        *([dict(name='configure_optimizers')] if kwargs.get('plugins') != 'deepspeed' else []),
         dict(name='Callback.on_fit_start', args=(trainer, model)),
         dict(name='on_fit_start'),
+        # FIXME
+        *([dict(name='configure_optimizers')] if kwargs.get('plugins') == 'deepspeed' else []),
         dict(name='Callback.on_pretrain_routine_start', args=(trainer, model)),
         dict(name='on_pretrain_routine_start'),
         dict(name='Callback.on_pretrain_routine_end', args=(trainer, model)),
@@ -444,11 +459,11 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs):
         *model._eval_epoch('validation', trainer, model, val_batches, 'x', device=device),
         dict(name='Callback.on_validation_end', args=(trainer, model)),
         dict(name='on_validation_end'),
-        dict(name='train'),
+        dict(name='train', args=(True, )),
         dict(name='on_validation_model_train'),
         dict(name='Callback.on_sanity_check_end', args=(trainer, model)),
         # duplicate `train` because `_run_train` calls it again in case validation wasn't run
-        dict(name='train'),
+        dict(name='train', args=(True, )),
         dict(name='on_train_dataloader'),
         dict(name='train_dataloader'),
         dict(name='Callback.on_train_start', args=(trainer, model)),
@@ -469,7 +484,7 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs):
         dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)),
         dict(name='on_save_checkpoint', args=(saved_ckpt, )),
         dict(name='on_validation_end'),
-        dict(name='train'),
+        dict(name='train', args=(True, )),
         dict(name='on_validation_model_train'),
         dict(name='training_epoch_end', args=([dict(loss=ANY)] * train_batches, )),
         dict(name='Callback.on_train_epoch_end', args=(trainer, model, [dict(loss=ANY)] * train_batches)),
@@ -582,7 +597,7 @@ def test_trainer_model_hook_system_eval(tmpdir, batches, verb, noun, dataloader,
         *model._eval_epoch(noun, trainer, model, batches, key),
         dict(name=f'Callback.on_{noun}_end', args=(trainer, model)),
         dict(name=f'on_{noun}_end'),
-        dict(name='train'),
+        dict(name='train', args=(True, )),
         dict(name=f'on_{noun}_model_train'),
     ]
     expected = [

From 72d5ee3b998f2c988a3c2c18915dce82e540170a Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 13:48:58 +0200
Subject: [PATCH 03/39] Replace FIXME markers with explanatory comments in hook test

---
 tests/models/test_hooks.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index c8b2a79f893e7..9e557abeb879e 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -306,6 +306,7 @@ def _train_batch(trainer, model, batches, device=torch.device('cpu'), **kwargs):
                 dict(name='on_before_zero_grad', args=(ANY, )),
                 dict(name='optimizer_zero_grad', args=(0, i, ANY, 0)),
                 # TODO: `on_before_backward`
+                # DeepSpeed handles backward internally
                 *([dict(name='backward', args=(ANY, ANY, 0))] if kwargs.get('plugins') != 'deepspeed' else []),
                 dict(name='Callback.on_after_backward', args=(trainer, model)),
                 dict(name='on_after_backward'),
@@ -432,17 +433,18 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs):
         dict(name='prepare_data'),
         dict(name='configure_callbacks'),
         dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)),
-        # FIXME
+        # DeepSpeed needs the batch size to figure out throughput logging
         *([dict(name='train_dataloader')] if kwargs.get('plugins') == 'deepspeed' else []),
         dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='fit')),
         dict(name='setup', kwargs=dict(stage='fit')),
         dict(name='configure_sharded_model'),
         dict(name='Callback.on_configure_sharded_model', args=(trainer, model)),
-        # FIXME
+        # DeepSpeed skips initializing optimizers here as they are handled via config
         *([dict(name='configure_optimizers')] if kwargs.get('plugins') != 'deepspeed' else []),
         dict(name='Callback.on_fit_start', args=(trainer, model)),
         dict(name='on_fit_start'),
-        # FIXME
+        # TODO: explore whether DeepSpeed can have the same flow for optimizers
+        # DeepSpeed did not find any optimizer in the config so they are loaded here
         *([dict(name='configure_optimizers')] if kwargs.get('plugins') == 'deepspeed' else []),
         dict(name='Callback.on_pretrain_routine_start', args=(trainer, model)),
         dict(name='on_pretrain_routine_start'),

From f34ee7e7dddb94fdf0deec008edf0749db1215fb Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 16:56:15 +0200
Subject: [PATCH 04/39] Fix DeepSpeed overridden `backward` warning message

---
 pytorch_lightning/plugins/precision/deepspeed_precision.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/plugins/precision/deepspeed_precision.py b/pytorch_lightning/plugins/precision/deepspeed_precision.py
index f05fd4d54b811..3edcc6866e219 100644
--- a/pytorch_lightning/plugins/precision/deepspeed_precision.py
+++ b/pytorch_lightning/plugins/precision/deepspeed_precision.py
@@ -64,15 +64,14 @@ def backward(
     ) -> Tensor:
         if is_overridden('backward', model):
             warning_cache.warn(
-                "Overridden backward hook in the LightningModule will be ignored since DeepSpeed handles"
-                "backward logic outside of the LightningModule"
+                "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles"
+                " the backward logic internally."
             )
         # todo: hack around for deepspeed engine to call backward
         deepspeed_engine = model.trainer.model
         deepspeed_engine.backward(closure_loss, *args, **kwargs)
         # once backward has been applied, release graph
         closure_loss = closure_loss.detach()
-
         return closure_loss
 
     def clip_gradients(

From 39c4a85a83cf32081b721f939ff83500b93f2dd3 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 16:32:38 +0200
Subject: [PATCH 05/39] Test CI error

---
 .azure-pipelines/gpu-tests.yml | 82 +++++++++++++++++-----------------
 tests/models/test_hooks.py     | 22 +++++++++
 2 files changed, 63 insertions(+), 41 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 5333bfd867da0..5a2276f315391 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -74,46 +74,46 @@ jobs:
       displayName: 'Get legacy checkpoints'
 
     - bash: |
-        python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+        pytest tests/accelerators/test_ddp.py::test_ddp_torch_dist_is_available_in_setup tests/models/test_hooks.py::test_ci_bug -v
       displayName: 'Testing: standard'
 
-    - bash: |
-        bash tests/special_tests.sh
-      displayName: 'Testing: special'
-
-    - bash: |
-        python -m coverage report
-        python -m coverage xml
-        python -m coverage html
-        python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
-        ls -l
-      displayName: 'Statistics'
-
-    - task: PublishTestResults@2
-      displayName: 'Publish test results'
-      inputs:
-        testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-        testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-      condition: succeededOrFailed()
-
-    - task: PublishCodeCoverageResults@1
-      displayName: 'Publish coverage report'
-      inputs:
-        codeCoverageTool: 'cobertura'
-        summaryFileLocation: 'coverage.xml'
-        reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
-        testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
-      condition: succeededOrFailed()
-
-    - bash: |
-        python -m pytest benchmarks -v --maxfail=2 --durations=0
-      displayName: 'Testing: benchmarks'
-
-    - script: |
-        set -e
-        python -m pytest pl_examples -v --maxfail=2 --durations=0
-        bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3
-        bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2
-      env:
-        PL_USE_MOCKED_MNIST: "1"
-      displayName: 'Examples'
+    #- bash: |
+    #    bash tests/special_tests.sh
+    #  displayName: 'Testing: special'
+#
+    #- bash: |
+    #    python -m coverage report
+    #    python -m coverage xml
+    #    python -m coverage html
+    #    python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+    #    ls -l
+    #  displayName: 'Statistics'
+#
+    #- task: PublishTestResults@2
+    #  displayName: 'Publish test results'
+    #  inputs:
+    #    testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
+    #    testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
+    #  condition: succeededOrFailed()
+#
+    #- task: PublishCodeCoverageResults@1
+    #  displayName: 'Publish coverage report'
+    #  inputs:
+    #    codeCoverageTool: 'cobertura'
+    #    summaryFileLocation: 'coverage.xml'
+    #    reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
+    #    testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
+    #  condition: succeededOrFailed()
+#
+    #- bash: |
+    #    python -m pytest benchmarks -v --maxfail=2 --durations=0
+    #  displayName: 'Testing: benchmarks'
+#
+    #- script: |
+    #    set -e
+    #    python -m pytest pl_examples -v --maxfail=2 --durations=0
+    #    bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3
+    #    bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2
+    #  env:
+    #    PL_USE_MOCKED_MNIST: "1"
+    #  displayName: 'Examples'
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 9e557abeb879e..08a6dd40f49ae 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -378,6 +378,28 @@ def _predict_batch(trainer, model, batches):
         return out
 
 
+@RunIf(deepspeed=True, min_gpus=1)
+def test_ci_bug(tmpdir):
+    called = []
+    model = HookedModel(called)
+    callback = HookedCallback(called)
+    train_batches = 2
+    val_batches = 2
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_train_batches=train_batches,
+        limit_val_batches=val_batches,
+        progress_bar_refresh_rate=0,
+        weights_summary=None,
+        callbacks=[callback],
+        gpus=1,
+        precision=16,
+        plugins='deepspeed',
+    )
+    trainer.fit(model)
+
+
 @pytest.mark.parametrize(
     'kwargs',
     [

From c3b458d1d9dd939d921f7e27864d023b11cac5a2 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 18:58:51 +0200
Subject: [PATCH 06/39] Revert "Test CI error"

This reverts commit 39c4a85a83cf32081b721f939ff83500b93f2dd3.
---
 .azure-pipelines/gpu-tests.yml | 82 +++++++++++++++++-----------------
 tests/models/test_hooks.py     | 22 ---------
 2 files changed, 41 insertions(+), 63 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 5a2276f315391..5333bfd867da0 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -74,46 +74,46 @@ jobs:
       displayName: 'Get legacy checkpoints'
 
     - bash: |
-        pytest tests/accelerators/test_ddp.py::test_ddp_torch_dist_is_available_in_setup tests/models/test_hooks.py::test_ci_bug -v
+        python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
       displayName: 'Testing: standard'
 
-    #- bash: |
-    #    bash tests/special_tests.sh
-    #  displayName: 'Testing: special'
-#
-    #- bash: |
-    #    python -m coverage report
-    #    python -m coverage xml
-    #    python -m coverage html
-    #    python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
-    #    ls -l
-    #  displayName: 'Statistics'
-#
-    #- task: PublishTestResults@2
-    #  displayName: 'Publish test results'
-    #  inputs:
-    #    testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-    #    testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-    #  condition: succeededOrFailed()
-#
-    #- task: PublishCodeCoverageResults@1
-    #  displayName: 'Publish coverage report'
-    #  inputs:
-    #    codeCoverageTool: 'cobertura'
-    #    summaryFileLocation: 'coverage.xml'
-    #    reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
-    #    testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
-    #  condition: succeededOrFailed()
-#
-    #- bash: |
-    #    python -m pytest benchmarks -v --maxfail=2 --durations=0
-    #  displayName: 'Testing: benchmarks'
-#
-    #- script: |
-    #    set -e
-    #    python -m pytest pl_examples -v --maxfail=2 --durations=0
-    #    bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3
-    #    bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2
-    #  env:
-    #    PL_USE_MOCKED_MNIST: "1"
-    #  displayName: 'Examples'
+    - bash: |
+        bash tests/special_tests.sh
+      displayName: 'Testing: special'
+
+    - bash: |
+        python -m coverage report
+        python -m coverage xml
+        python -m coverage html
+        python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+        ls -l
+      displayName: 'Statistics'
+
+    - task: PublishTestResults@2
+      displayName: 'Publish test results'
+      inputs:
+        testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
+        testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
+      condition: succeededOrFailed()
+
+    - task: PublishCodeCoverageResults@1
+      displayName: 'Publish coverage report'
+      inputs:
+        codeCoverageTool: 'cobertura'
+        summaryFileLocation: 'coverage.xml'
+        reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
+        testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
+      condition: succeededOrFailed()
+
+    - bash: |
+        python -m pytest benchmarks -v --maxfail=2 --durations=0
+      displayName: 'Testing: benchmarks'
+
+    - script: |
+        set -e
+        python -m pytest pl_examples -v --maxfail=2 --durations=0
+        bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3
+        bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2
+      env:
+        PL_USE_MOCKED_MNIST: "1"
+      displayName: 'Examples'
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 08a6dd40f49ae..9e557abeb879e 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -378,28 +378,6 @@ def _predict_batch(trainer, model, batches):
         return out
 
 
-@RunIf(deepspeed=True, min_gpus=1)
-def test_ci_bug(tmpdir):
-    called = []
-    model = HookedModel(called)
-    callback = HookedCallback(called)
-    train_batches = 2
-    val_batches = 2
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_train_batches=train_batches,
-        limit_val_batches=val_batches,
-        progress_bar_refresh_rate=0,
-        weights_summary=None,
-        callbacks=[callback],
-        gpus=1,
-        precision=16,
-        plugins='deepspeed',
-    )
-    trainer.fit(model)
-
-
 @pytest.mark.parametrize(
     'kwargs',
     [

From c700cabcc985f9780843d5a28f9d5974be588017 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 19:02:04 +0200
Subject: [PATCH 07/39] Add ddp training type teardown

---
 pytorch_lightning/plugins/training_type/ddp.py | 9 +++++++--
 tests/accelerators/test_ddp.py                 | 4 +---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index ed320f37d7006..1ebed7c55405b 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -227,7 +227,7 @@ def setup_distributed(self):
         self.init_ddp_connection()
 
         # on world_size=0 let everyone know training is starting
-        if self.is_global_zero and not torch.distributed.is_initialized():
+        if self.is_global_zero and not torch_distrib.is_initialized():
             log.info("-" * 100)
             log.info(f"distributed_backend={self.distributed_backend}")
             log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes")
@@ -297,7 +297,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         world_size = world_size if world_size is not None else self.cluster_environment.world_size()
         os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
         os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
-        if not torch.distributed.is_initialized():
+        if not torch_distrib.is_initialized():
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
             torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size)
 
@@ -373,3 +373,8 @@ def register_plugins(cls, plugin_registry: Dict) -> None:
             description="DDP Plugin with `find_unused_parameters` as False",
             find_unused_parameters=False
         )
+
+    def teardown(self) -> None:
+        if torch_distrib.is_initialized():
+            torch_distrib.destroy_process_group()
+        super().teardown()
diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py
index 3f335964a5eee..9f6b160567a84 100644
--- a/tests/accelerators/test_ddp.py
+++ b/tests/accelerators/test_ddp.py
@@ -109,7 +109,6 @@ class TestModel(BoringModel):
 
         def setup(self, stage: Optional[str] = None) -> None:
             assert torch.distributed.is_initialized()
-            raise SystemExit()
 
     model = TestModel()
     trainer = Trainer(
@@ -118,8 +117,7 @@ def setup(self, stage: Optional[str] = None) -> None:
         accelerator="ddp",
         gpus=1,
     )
-    with pytest.raises(SystemExit):
-        trainer.fit(model)
+    trainer.fit(model)
 
 
 @RunIf(min_gpus=2, min_torch="1.8.1", special=True)

From e5602c92942caf6d54b9e0b3ce7f2fd90783950a Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 19:10:42 +0200
Subject: [PATCH 08/39] Update CHANGELOG

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 176413cd55e76..d2cd3926381e3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -285,6 +285,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970))
 
 
+- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080))
+
+
 - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941))
 
 

From 52b2256164c08db4565d33b0aeded2678a781de8 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Tue, 22 Jun 2021 19:00:07 +0200
Subject: [PATCH 09/39] Adrian's fix: destroy the process group after each test

---
 tests/conftest.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index 7f6407ecfd82b..3f767d8b6fad2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,6 +18,7 @@
 from http.server import SimpleHTTPRequestHandler
 
 import pytest
+import torch.distributed
 import torch.multiprocessing as mp
 
 
@@ -41,6 +42,14 @@ def restore_env_variables():
     os.environ.update(env_backup)
 
 
+@pytest.fixture(scope="function", autouse=True)
+def teardown_process_group():
+    """ Ensures that the distributed process group gets closed before the next test runs. """
+    yield
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
+
 def pytest_configure(config):
     config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn")
 

From 0b94b6c269cd04c3ec495a0beebd58bcda949b29 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 23 Jun 2021 14:12:08 +0200
Subject: [PATCH 10/39] Use destructor

---
 pytorch_lightning/plugins/training_type/ddp.py | 3 +--
 tests/accelerators/test_ddp.py                 | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 1ebed7c55405b..2ea19fb0c781b 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -374,7 +374,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None:
             find_unused_parameters=False
         )
 
-    def teardown(self) -> None:
+    def __del__(self) -> None:
         if torch_distrib.is_initialized():
             torch_distrib.destroy_process_group()
-        super().teardown()
diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py
index 9f6b160567a84..3f335964a5eee 100644
--- a/tests/accelerators/test_ddp.py
+++ b/tests/accelerators/test_ddp.py
@@ -109,6 +109,7 @@ class TestModel(BoringModel):
 
         def setup(self, stage: Optional[str] = None) -> None:
             assert torch.distributed.is_initialized()
+            raise SystemExit()
 
     model = TestModel()
     trainer = Trainer(
@@ -117,7 +118,8 @@ def setup(self, stage: Optional[str] = None) -> None:
         accelerator="ddp",
         gpus=1,
     )
-    trainer.fit(model)
+    with pytest.raises(SystemExit):
+        trainer.fit(model)
 
 
 @RunIf(min_gpus=2, min_torch="1.8.1", special=True)

From aaf32abde54cfb1bf205ae8bea878e9ebe282ad3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= <carlossmocholi@gmail.com>
Date: Wed, 23 Jun 2021 14:14:54 +0200
Subject: [PATCH 11/39] Update CHANGELOG.md

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2cd3926381e3..1c6633fe88508 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -285,7 +285,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970))
 
 
-- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080))
+- Destroy the distributed process group on DDP destructor ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080))
 
 
 - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941))

From 0444d541c67e4f236c2de084924bbf4fef36c5e9 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 23 Jun 2021 16:28:22 +0200
Subject: [PATCH 12/39] RPC destructor

---
 pytorch_lightning/plugins/training_type/rpc.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py
index 3e0f57daef001..d8698e71bd261 100644
--- a/pytorch_lightning/plugins/training_type/rpc.py
+++ b/pytorch_lightning/plugins/training_type/rpc.py
@@ -83,3 +83,7 @@ def exit_rpc_process(self):
     @property
     def rpc_enabled(self) -> bool:
         return True
+
+    def __del__(self):
+        self.exit_rpc_process()
+        super().__del__()

From 5d4f811cd865ee8952b7f388c487af671e919bbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= <carlossmocholi@gmail.com>
Date: Wed, 23 Jun 2021 16:28:38 +0200
Subject: [PATCH 13/39] Update pytorch_lightning/plugins/training_type/ddp.py

---
 pytorch_lightning/plugins/training_type/ddp.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 2ea19fb0c781b..11601dec1fc6c 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -377,3 +377,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None:
     def __del__(self) -> None:
         if torch_distrib.is_initialized():
             torch_distrib.destroy_process_group()
+        # clean up memory
+        with torch.cuda.device(self.root_device):
+            torch.cuda.empty_cache()

From bf8766d392c021a4b20b4641ca9ee83bb042386c Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 23 Jun 2021 16:56:20 +0200
Subject: [PATCH 14/39] Make the RPC plugin destructor a no-op to avoid a hang

---
 pytorch_lightning/plugins/training_type/rpc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py
index d8698e71bd261..f825732f7e316 100644
--- a/pytorch_lightning/plugins/training_type/rpc.py
+++ b/pytorch_lightning/plugins/training_type/rpc.py
@@ -85,5 +85,5 @@ def rpc_enabled(self) -> bool:
         return True
 
     def __del__(self):
-        self.exit_rpc_process()
-        super().__del__()
+        # avoid hang
+        ...

From 48bcb7ed2ea7ac463e0dc39e5808416c56957404 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Wed, 23 Jun 2021 18:05:16 +0200
Subject: [PATCH 15/39] Missing condition

---
 pytorch_lightning/plugins/training_type/ddp.py | 7 ++++---
 pytorch_lightning/plugins/training_type/rpc.py | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 11601dec1fc6c..88fe86420069e 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -377,6 +377,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None:
     def __del__(self) -> None:
         if torch_distrib.is_initialized():
             torch_distrib.destroy_process_group()
-        # clean up memory
-        with torch.cuda.device(self.root_device):
-            torch.cuda.empty_cache()
+        if self.on_gpu:
+            # clean up memory
+            with torch.cuda.device(self.root_device):
+                torch.cuda.empty_cache()
diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py
index f825732f7e316..f20ece7ebbcf7 100644
--- a/pytorch_lightning/plugins/training_type/rpc.py
+++ b/pytorch_lightning/plugins/training_type/rpc.py
@@ -86,4 +86,4 @@ def rpc_enabled(self) -> bool:
 
     def __del__(self):
         # avoid hang
-        ...
+        pass

From 21ad2d8234e053e08d97efd74a261fa76d6e8b56 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 04:46:44 +0200
Subject: [PATCH 16/39] Fix deepspeed test

---
 tests/plugins/test_deepspeed_plugin.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index c5eaadd1e5985..2e96ced4c0c26 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,3 +1,4 @@
+import gc
 import json
 import os
 from typing import Any, Dict
@@ -265,6 +266,10 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
                                                     (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)])
 def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value):
     """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""
+    # the previous parametrization can impact the current one as it's not guaranteed that resources will be released
+    # between parametrizations. This is important as we call `destroy_process_group` in `DDPPlugin.__del__`.
+    # Another option would be to not use `parametrize`: https://github.com/pytest-dev/pytest/discussions/8153
+    gc.collect()
 
     class TestModel(BoringModel):
 

From bbc489e313dccc95018d2f750a89f00c656a43d2 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 04:57:54 +0200
Subject: [PATCH 17/39] GC collect in conftest

---
 tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index 7f6407ecfd82b..6cbdc3c3783c2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import gc
 import os
 import sys
 import threading
@@ -36,6 +37,9 @@ def restore_env_variables():
     """ Ensures that environment variables set during the test do not leak out. """
     env_backup = os.environ.copy()
     yield
+    # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared
+    # before `__del__` is called. Force the call by triggering garbage collection.
+    gc.collect()
     # restore environment as it was before running the test
     os.environ.clear()
     os.environ.update(env_backup)

From 5b06fd2c2528e29929e56fcc765ad708a8a77586 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 05:43:43 +0200
Subject: [PATCH 18/39] Do not show warnings for special tests

---
 tests/special_tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index 9fca3b62bad40..a87f50548d06b 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -17,7 +17,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUNNING_SPECIAL_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings'
 
 # find tests marked as `@RunIf(special=True)`
 grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True')

From 5e69ed84f9b09bcd25fe70909dc94470287982c6 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 05:44:05 +0200
Subject: [PATCH 19/39] Needs to run on 1.8

To avoid: "RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:32, unhandled cuda error, NCCL version 2.4.8"
---
 .azure-pipelines/gpu-tests.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 5333bfd867da0..bc7120bbc2ae6 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -32,12 +32,9 @@ jobs:
     #      python.version: '3.7'
 
     # ToDo: this need to have installed docker in the base image...
-    #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6
-    #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6"
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
-      #endpoint: azureContainerRegistryConnection
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all"
 
     workspace:

From aed51a2c09213267283455efefb50f111c33384b Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 15:45:22 +0200
Subject: [PATCH 20/39] Run torch 1.8

---
 .azure-pipelines/gpu-tests.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 5333bfd867da0..bc7120bbc2ae6 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -32,12 +32,9 @@ jobs:
     #      python.version: '3.7'
 
     # ToDo: this need to have installed docker in the base image...
-    #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6
-    #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6"
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
-      #endpoint: azureContainerRegistryConnection
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all"
 
     workspace:

From e0a3e8785d2fecd63667da433a648f958d60ef89 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 16:01:33 +0200
Subject: [PATCH 21/39] Skip test due to 'Python bus error'

---
 tests/helpers/test_models.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py
index e4bb7e7df0827..61b33265d1458 100644
--- a/tests/helpers/test_models.py
+++ b/tests/helpers/test_models.py
@@ -23,11 +23,12 @@
 
 
 @pytest.mark.parametrize(
-    "data_class,model_class", [
+    "data_class,model_class",
+    [
         (None, BoringModel),
         (None, BasicGAN),
         (None, ParityModuleRNN),
-        (None, ParityModuleMNIST),
+        # (None, ParityModuleMNIST),
         (ClassifDataModule, ClassificationModel),
         (RegressDataModule, RegressionModel),
     ]

From 9ee2d193832d022dd95096e932476dedcbd990d4 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 16:34:26 +0200
Subject: [PATCH 22/39] Debug NCCL

---
 .azure-pipelines/gpu-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index bc7120bbc2ae6..f1b57f9233ae3 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -71,7 +71,7 @@ jobs:
       displayName: 'Get legacy checkpoints'
 
     - bash: |
-        python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+        NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
       displayName: 'Testing: standard'
 
     - bash: |

From 3588aaa37723db12ee17969a80e4c90028c071ba Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 17:06:20 +0200
Subject: [PATCH 23/39] shm size

---
 .azure-pipelines/gpu-tests.yml | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index f1b57f9233ae3..421ad96688d5a 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -25,17 +25,11 @@ jobs:
 
     pool: gridai-spot-pool
 
-    #strategy:
-    #  matrix:
-    #    PT16:
-    #      torch.version: '1.6'
-    #      python.version: '3.7'
-
     # ToDo: this need to have installed docker in the base image...
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
       image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
-      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all"
+      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
 
     workspace:
       clean: all

From 067bf1ae9eee271aaf3c4e4ac6bf9a50ba807fa2 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 17:28:56 +0200
Subject: [PATCH 24/39] Disable warnings for special tests

---
 tests/special_tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index 9fca3b62bad40..a87f50548d06b 100755
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -17,7 +17,7 @@ set -e
 # this environment variable allows special tests to run
 export PL_RUNNING_SPECIAL_TESTS=1
 # python arguments
-defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no'
+defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings'
 
 # find tests marked as `@RunIf(special=True)`
 grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True')

From 6060b05215f0b824944bcabb2d7a4f3440625a96 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 17:29:25 +0200
Subject: [PATCH 25/39] Remove NCCL_DEBUG statement

---
 .azure-pipelines/gpu-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 421ad96688d5a..5499202bc690e 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -65,7 +65,7 @@ jobs:
       displayName: 'Get legacy checkpoints'
 
     - bash: |
-        NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+        python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
       displayName: 'Testing: standard'
 
     - bash: |

From f0fa1b74d0790a397702305a8cdd93ad7bcf18b7 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 17:30:06 +0200
Subject: [PATCH 26/39] Try smaller shm size

---
 .azure-pipelines/gpu-tests.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml
index 5499202bc690e..b1fedd578bc85 100644
--- a/.azure-pipelines/gpu-tests.yml
+++ b/.azure-pipelines/gpu-tests.yml
@@ -28,8 +28,11 @@ jobs:
     # ToDo: this need to have installed docker in the base image...
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
+      # run on torch 1.8 as it's the LTS version
       image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
-      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
+      # default shm size is 64m. Increase it to avoid:
+      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
+      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=256m"
 
     workspace:
       clean: all

From 6dd70381ce88f8ac3459de4b9795a875d596c9f5 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 17:31:05 +0200
Subject: [PATCH 27/39] Revert "Skip test due to 'Python bus error'"

This reverts commit e0a3e8785d2fecd63667da433a648f958d60ef89.
---
 tests/helpers/test_models.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py
index 61b33265d1458..e4bb7e7df0827 100644
--- a/tests/helpers/test_models.py
+++ b/tests/helpers/test_models.py
@@ -23,12 +23,11 @@
 
 
 @pytest.mark.parametrize(
-    "data_class,model_class",
-    [
+    "data_class,model_class", [
         (None, BoringModel),
         (None, BasicGAN),
         (None, ParityModuleRNN),
-        # (None, ParityModuleMNIST),
+        (None, ParityModuleMNIST),
         (ClassifDataModule, ClassificationModel),
         (RegressDataModule, RegressionModel),
     ]

From 73e62f8aba385a3cad540c438fb500a46ded9648 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 18:15:47 +0200
Subject: [PATCH 28/39] README and adjust versions

---
 README.md                       | 4 ++--
 requirements/adjust_versions.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7a540adadd327..78175f95c28fd 100644
--- a/README.md
+++ b/README.md
@@ -74,10 +74,10 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
 
   <center>
 
-  | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (latest) | 1.9 (nightly) |
+  | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (LTS) | 1.9 (latest) |
   | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
   | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
-  | Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | - |
+  | Linux py3.7 [GPUs**] | - | - | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - |
   | Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - |
   | Linux py3.{6,7,8,9} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
   | OSX py3.{6,7,8,9} | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py
index a09128c6200db..84879b4e48a34 100644
--- a/requirements/adjust_versions.py
+++ b/requirements/adjust_versions.py
@@ -4,7 +4,8 @@
 from typing import Dict, Optional
 
 VERSIONS = [
-    dict(torch="1.9.0", torchvision="", torchtext=""),  # nightly
+    dict(torch="1.10.0", torchvision="", torchtext=""),  # nightly
+    dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"),
     dict(torch="1.8.1", torchvision="0.9.1", torchtext="0.9.1"),
     dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"),
     dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"),

From 902ef02b95fee49275b60a04ac8dbe9d6f682933 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 18:22:21 +0200
Subject: [PATCH 29/39] Avoid self.on_gpu call

---
 pytorch_lightning/plugins/training_type/ddp.py | 6 ++----
 tests/conftest.py                              | 4 ----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index dcc78f7bc5d40..c04a4ab111a20 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -384,7 +384,5 @@ def register_plugins(cls, plugin_registry: Dict) -> None:
     def __del__(self) -> None:
         if torch_distrib.is_initialized():
             torch_distrib.destroy_process_group()
-        if self.on_gpu:
-            # clean up memory
-            with torch.cuda.device(self.root_device):
-                torch.cuda.empty_cache()
+        # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)`
+        torch.cuda.empty_cache()
diff --git a/tests/conftest.py b/tests/conftest.py
index 6cbdc3c3783c2..7f6407ecfd82b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import gc
 import os
 import sys
 import threading
@@ -37,9 +36,6 @@ def restore_env_variables():
     """ Ensures that environment variables set during the test do not leak out. """
     env_backup = os.environ.copy()
     yield
-    # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared
-    # before `__del__` is called. Force the call by triggering garbage collection.
-    gc.collect()
     # restore environment as it was before running the test
     os.environ.clear()
     os.environ.update(env_backup)

From 4ce0f9a1feaa8e85e536f5dead658d17c65611c8 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 18:41:53 +0200
Subject: [PATCH 30/39] empty cache cleanup

---
 pytorch_lightning/accelerators/gpu.py             |  5 +----
 .../plugins/training_type/parallel.py             |  3 +--
 .../plugins/training_type/single_device.py        |  3 +--
 .../trainer/connectors/checkpoint_connector.py    | 14 +++-----------
 pytorch_lightning/utilities/memory.py             | 15 +++++++--------
 5 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py
index 7543a2b794b5d..1c5ff56d805a6 100644
--- a/pytorch_lightning/accelerators/gpu.py
+++ b/pytorch_lightning/accelerators/gpu.py
@@ -42,10 +42,7 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None:
 
     def on_train_start(self) -> None:
         # clear cache before training
-        # use context because of:
-        # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898
-        with torch.cuda.device(self.root_device):
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
     @staticmethod
     def set_nvidia_flags(local_rank: int) -> None:
diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py
index 09e48a760e868..122a1423c2817 100644
--- a/pytorch_lightning/plugins/training_type/parallel.py
+++ b/pytorch_lightning/plugins/training_type/parallel.py
@@ -132,5 +132,4 @@ def teardown(self) -> None:
             # GPU teardown
             self.lightning_module.cpu()
             # clean up memory
-            with torch.cuda.device(self.root_device):
-                torch.cuda.empty_cache()
+            torch.cuda.empty_cache()
diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py
index 1816f5838c948..d4a328902eba0 100644
--- a/pytorch_lightning/plugins/training_type/single_device.py
+++ b/pytorch_lightning/plugins/training_type/single_device.py
@@ -85,5 +85,4 @@ def teardown(self) -> None:
             # GPU teardown
             self.lightning_module.cpu()
             # clean up memory
-            with torch.cuda.device(self.root_device):
-                torch.cuda.empty_cache()
+            torch.cuda.empty_cache()
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index c2a0411c0df36..0bc3145a99e59 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -21,13 +21,7 @@
 
 import pytorch_lightning
 from pytorch_lightning.core.lightning import LightningModule
-from pytorch_lightning.utilities import (
-    _OMEGACONF_AVAILABLE,
-    DeviceType,
-    rank_zero_deprecation,
-    rank_zero_info,
-    rank_zero_warn,
-)
+from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn
 from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS
@@ -69,8 +63,7 @@ def resume_start(self) -> None:
             return
 
         # clear cache before restore
-        if self.trainer._device_type == DeviceType.GPU:
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
         # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint.
         fs = get_filesystem(checkpoint_path)
@@ -88,8 +81,7 @@ def resume_end(self) -> None:
         self._loaded_checkpoint = dict()
 
         # clear cache after restore
-        if self.trainer._device_type == DeviceType.GPU:
-            torch.cuda.empty_cache()
+        torch.cuda.empty_cache()
 
         # wait for all to catch up
         self.trainer.training_type_plugin.barrier("CheckpointConnector.resume_end")
diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py
index 6c01390a8c81e..0ae88e8995614 100644
--- a/pytorch_lightning/utilities/memory.py
+++ b/pytorch_lightning/utilities/memory.py
@@ -76,11 +76,10 @@ def is_out_of_cpu_memory(exception):
 def garbage_collection_cuda():
     """Garbage collection Torch (CUDA) memory."""
     gc.collect()
-    if torch.cuda.is_available():
-        try:
-            # This is the last thing that should cause an OOM error, but seemingly it can.
-            torch.cuda.empty_cache()
-        except RuntimeError as exception:
-            if not is_oom_error(exception):
-                # Only handle OOM errors
-                raise
+    try:
+        # This is the last thing that should cause an OOM error, but seemingly it can.
+        torch.cuda.empty_cache()
+    except RuntimeError as exception:
+        if not is_oom_error(exception):
+            # Only handle OOM errors
+            raise

From 738daa5bbccd5e00f65ea5f4c8b9218fea15839d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Thu, 24 Jun 2021 19:06:25 +0200
Subject: [PATCH 31/39] More garbage collection

---
 tests/plugins/test_deepspeed_plugin.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 2e96ced4c0c26..b609bc78d74fc 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -650,6 +650,8 @@ def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_opt
     """
     Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.
     """
+    gc.collect()
+
     seed_everything(42)
 
     class VerificationCallback(Callback):

From 236aa97bf35af324a43ca4f729f7ebdecff5fa3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Thu, 24 Jun 2021 21:23:50 +0200
Subject: [PATCH 32/39] Unroll parametrizations

---
 tests/callbacks/test_pruning.py               | 41 ++++++++++++++++---
 .../test_checkpoint_callback_frequency.py     | 14 +++++--
 tests/plugins/test_deepspeed_plugin.py        | 16 +++++---
 3 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py
index f198b29d24e84..1a5ddad64106e 100644
--- a/tests/callbacks/test_pruning.py
+++ b/tests/callbacks/test_pruning.py
@@ -162,13 +162,44 @@ def test_pruning_callback(
 
 
 @RunIf(special=True, min_gpus=2)
-@pytest.mark.parametrize("parameters_to_prune", [False, True])
-@pytest.mark.parametrize("use_global_unstructured", [False, True])
-def test_pruning_callback_ddp(tmpdir, use_global_unstructured: bool, parameters_to_prune: bool):
+def test_pruning_callback_ddp_0(tmpdir):
     train_with_pruning_callback(
         tmpdir,
-        parameters_to_prune=parameters_to_prune,
-        use_global_unstructured=use_global_unstructured,
+        parameters_to_prune=False,
+        use_global_unstructured=False,
+        accelerator="ddp",
+        gpus=2,
+    )
+
+
+@RunIf(special=True, min_gpus=2)
+def test_pruning_callback_ddp_1(tmpdir):
+    train_with_pruning_callback(
+        tmpdir,
+        parameters_to_prune=False,
+        use_global_unstructured=True,
+        accelerator="ddp",
+        gpus=2,
+    )
+
+
+@RunIf(special=True, min_gpus=2)
+def test_pruning_callback_ddp_2(tmpdir):
+    train_with_pruning_callback(
+        tmpdir,
+        parameters_to_prune=True,
+        use_global_unstructured=False,
+        accelerator="ddp",
+        gpus=2,
+    )
+
+
+@RunIf(special=True, min_gpus=2)
+def test_pruning_callback_ddp_3(tmpdir):
+    train_with_pruning_callback(
+        tmpdir,
+        parameters_to_prune=True,
+        use_global_unstructured=True,
         accelerator="ddp",
         gpus=2,
     )
diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 9fdd69dba7a9a..c5afecc2b4bf3 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -105,10 +105,18 @@ def training_step(self, batch, batch_idx):
     assert save_mock.call_count == expected
 
 
-@mock.patch('torch.save')
 @RunIf(special=True, min_gpus=2)
-@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
-def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
+def test_top_k_ddp_0(tmpdir):
+    _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1)
+
+
+@RunIf(special=True, min_gpus=2)
+def test_top_k_ddp_1(tmpdir):
+    _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5)
+
+
+@mock.patch('torch.save')
+def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
 
     class TestModel(BoringModel):
 
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index b609bc78d74fc..b443827cac70c 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -644,14 +644,10 @@ def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir):
     run_checkpoint_test(tmpdir, save_full_weights=True, automatic_optimization=False, accumulate_grad_batches=1)
 
 
-@RunIf(min_gpus=2, deepspeed=True, special=True)
-@pytest.mark.parametrize('offload_optimizer', [True, False])
-def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
+def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer):
     """
     Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works.
     """
-    gc.collect()
-
     seed_everything(42)
 
     class VerificationCallback(Callback):
@@ -678,6 +674,16 @@ def on_train_batch_start(
     trainer.fit(model, datamodule=dm)
 
 
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir):
+    _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False)
+
+
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir):
+    _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True)
+
+
 @RunIf(min_gpus=2, deepspeed=True, special=True)
 def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
     """

From ffa532d3bb888606438577f98e7ae512fa28a0cd Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Fri, 25 Jun 2021 00:30:21 +0200
Subject: [PATCH 33/39] Do not reuse mock

---
 .../test_checkpoint_callback_frequency.py             | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index c5afecc2b4bf3..67db594aa2539 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -105,17 +105,18 @@ def training_step(self, batch, batch_idx):
     assert save_mock.call_count == expected
 
 
+@mock.patch('torch.save')
 @RunIf(special=True, min_gpus=2)
-def test_top_k_ddp_0(tmpdir):
-    _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1)
+def test_top_k_ddp_0(save_mock, tmpdir):
+    _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1)
 
 
+@mock.patch('torch.save')
 @RunIf(special=True, min_gpus=2)
-def test_top_k_ddp_1(tmpdir):
-    _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5)
+def test_top_k_ddp_1(save_mock, tmpdir):
+    _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5)
 
 
-@mock.patch('torch.save')
 def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
 
     class TestModel(BoringModel):

From e190089054817f6d62cfd7433774146ac82f9a81 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Fri, 25 Jun 2021 01:35:15 +0200
Subject: [PATCH 34/39] Undo changes

---
 tests/conftest.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3f767d8b6fad2..7f6407ecfd82b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -18,7 +18,6 @@
 from http.server import SimpleHTTPRequestHandler
 
 import pytest
-import torch.distributed
 import torch.multiprocessing as mp
 
 
@@ -42,14 +41,6 @@ def restore_env_variables():
     os.environ.update(env_backup)
 
 
-@pytest.fixture(scope="function", autouse=True)
-def teardown_process_group():
-    """ Ensures that the distributed process group gets closed before the next test runs. """
-    yield
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
-
-
 def pytest_configure(config):
     config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn")
 

From 261a166194872fb3d031496dd552d463a9a135ed Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Fri, 25 Jun 2021 01:40:40 +0200
Subject: [PATCH 35/39] Undo notebooks modification

---
 _notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_notebooks b/_notebooks
index 3321b468e7816..29aea106edefc 160000
--- a/_notebooks
+++ b/_notebooks
@@ -1 +1 @@
-Subproject commit 3321b468e78167aaf056894e92ed6d649c76e89e
+Subproject commit 29aea106edefc9d1904c0c17223a8ac2b15c48e7

From 33a68d43e2e99cfe5aa130dee0cc8451dff9e713 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Sat, 3 Jul 2021 20:07:23 +0200
Subject: [PATCH 36/39] Undo

---
 CHANGELOG.md                                   | 3 ---
 pytorch_lightning/plugins/training_type/ddp.py | 2 +-
 tests/plugins/test_deepspeed_plugin.py         | 5 -----
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a3e162ae1bea4..2256dcefeac31 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -327,9 +327,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970))
 
 
-- Destroy the distributed process group on DDP destructor ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080))
-
-
 - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941))
 
 
diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py
index 98ac485c205c2..a882390b78b0d 100644
--- a/pytorch_lightning/plugins/training_type/ddp.py
+++ b/pytorch_lightning/plugins/training_type/ddp.py
@@ -321,7 +321,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt
         world_size = world_size if world_size is not None else self.cluster_environment.world_size()
         os.environ["MASTER_ADDR"] = self.cluster_environment.master_address()
         os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port())
-        if not torch_distrib.is_initialized():
+        if not torch.distributed.is_initialized():
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}")
             torch.distributed.init_process_group(
                 self.torch_distributed_backend, rank=global_rank, world_size=world_size
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 6c238ab747350..efe8da981c9eb 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,4 +1,3 @@
-import gc
 import json
 import os
 from typing import Any, Dict
@@ -266,10 +265,6 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
                                                     (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)])
 def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value):
     """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes."""
-    # the previous parametrization can impact the current one as it's not guaranteed that resources will be released
-    # between parametrizations. This is important as we call `destroy_process_group` in `DDPPlugin.__del__`.
-    # Another option would be to not use `parametrize`: https://github.com/pytest-dev/pytest/discussions/8153
-    gc.collect()
 
     class TestModel(BoringModel):
 

From ac006c75a6105ebd461c7461d1e9258c69f5e8fb Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Sat, 3 Jul 2021 20:09:24 +0200
Subject: [PATCH 37/39] Fix test

---
 tests/models/test_hooks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 859ca504ad5f6..630166f4e40a9 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -574,7 +574,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir):
         dict(name='on_pretrain_routine_start'),
         dict(name='Callback.on_pretrain_routine_end', args=(trainer, model)),
         dict(name='on_pretrain_routine_end'),
-        dict(name='train'),
+        dict(name='train', args=(True, )),
         dict(name='on_train_dataloader'),
         dict(name='train_dataloader'),
         # even though no validation runs, we initialize the val dataloader for properties like `num_val_batches`

From a5becf4afdd619753f3d6533a52385871270b4e6 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Sat, 3 Jul 2021 20:34:06 +0200
Subject: [PATCH 38/39] Update test

---
 tests/plugins/test_deepspeed_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index efe8da981c9eb..dcb4ff00b219b 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -256,7 +256,7 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
         gpus=1,
         precision=16,
     )
-    with pytest.warns(UserWarning, match='Overridden backward hook in the LightningModule will be ignored'):
+    with pytest.warns(UserWarning, match='will be ignored since DeepSpeed handles the backward'):
         trainer.fit(model)
 
 

From 88b3183122525c7e66196204479bfa77b0731c0c Mon Sep 17 00:00:00 2001
From: Carlos Mocholi <carlossmocholi@gmail.com>
Date: Mon, 5 Jul 2021 12:19:21 +0200
Subject: [PATCH 39/39] Fix merge

---
 tests/models/test_hooks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 08d4d86b3c931..789959e38908a 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -293,7 +293,7 @@ def _train_batch(trainer, model, batches, device=torch.device('cpu'), current_ep
         for i in range(batches):
             out.extend([
                 dict(name='on_before_batch_transfer', args=(ANY, 0)),
-                dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)),
+                dict(name='transfer_batch_to_device', args=(ANY, device, 0)),
                 dict(name='on_after_batch_transfer', args=(ANY, 0)),
                 # TODO: `on_batch_{start,end}`
                 dict(name='Callback.on_batch_start', args=(trainer, model)),
@@ -345,7 +345,7 @@ def _eval_batch(fn, trainer, model, batches, key, device=torch.device('cpu')):
         for i in range(batches):
             out.extend([
                 dict(name='on_before_batch_transfer', args=(ANY, 0)),
-                dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)),
+                dict(name='transfer_batch_to_device', args=(ANY, device, 0)),
                 dict(name='on_after_batch_transfer', args=(ANY, 0)),
                 # TODO: `{,Callback}.on_batch_{start,end}`
                 dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)),