From a5f2e6b20858dcba011e9631e9a1782b068cce99 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 01:08:07 +0200 Subject: [PATCH 01/39] Parametrize fit hook test with different precision plugins --- tests/models/test_hooks.py | 39 ++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 37e4867c7b6b9..d7a962b1a708c 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -283,7 +283,8 @@ def test_epoch_end(self, *args, **kwargs): pass @staticmethod - def _train_batch(trainer, model, batches): + def _train_batch(trainer, model, batches, device=torch.device('cpu'), **kwargs): + using_native_amp = kwargs.get('amp_backend') == 'native' out = [] for i in range(batches): out.extend([ @@ -292,7 +293,7 @@ def _train_batch(trainer, model, batches): dict(name='Callback.on_train_batch_start', args=(trainer, model, ANY, i, 0)), dict(name='on_train_batch_start', args=(ANY, i, 0)), dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='transfer_batch_to_device', args=(ANY, device, None)), dict(name='on_after_batch_transfer', args=(ANY, None)), dict(name='forward', args=(ANY, )), dict(name='training_step', args=(ANY, i)), @@ -308,7 +309,7 @@ def _train_batch(trainer, model, batches): dict( name='optimizer_step', args=(0, i, ANY, 0, ANY), - kwargs=dict(on_tpu=False, using_lbfgs=False, using_native_amp=False) + kwargs=dict(on_tpu=False, using_lbfgs=False, using_native_amp=using_native_amp) ), dict(name='Callback.on_train_batch_end', args=(trainer, model, dict(loss=ANY), ANY, i, 0)), dict(name='on_train_batch_end', args=(dict(loss=ANY), ANY, i, 0)), @@ -317,14 +318,14 @@ def _train_batch(trainer, model, batches): return out @staticmethod - def _eval_epoch(fn, trainer, model, batches, key): + def _eval_epoch(fn, trainer, model, batches, key, device=torch.device('cpu')): outputs = {key: ANY} return [ dict(name='Callback.on_epoch_start', args=(trainer, model)), dict(name='on_epoch_start'), dict(name=f'Callback.on_{fn}_epoch_start', args=(trainer, model)), dict(name=f'on_{fn}_epoch_start'), - *HookedModel._eval_batch(fn, trainer, model, batches, key), + *HookedModel._eval_batch(fn, trainer, model, batches, key, device=device), dict(name=f'{fn}_epoch_end', args=([outputs] * batches, )), dict(name=f'Callback.on_{fn}_epoch_end', args=(trainer, model)), dict(name=f'on_{fn}_epoch_end'), @@ -333,7 +334,7 @@ def _eval_epoch(fn, trainer, model, batches, key): ] @staticmethod - def _eval_batch(fn, trainer, model, batches, key): + def _eval_batch(fn, trainer, model, batches, key, device=torch.device('cpu')): out = [] outputs = {key: ANY} for i in range(batches): @@ -342,7 +343,7 @@ def _eval_batch(fn, trainer, model, batches, key): dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)), dict(name=f'on_{fn}_batch_start', args=(ANY, i, 0)), dict(name='on_before_batch_transfer', args=(ANY, None)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), None)), + dict(name='transfer_batch_to_device', args=(ANY, device, None)), dict(name='on_after_batch_transfer', args=(ANY, None)), dict(name='forward', args=(ANY, )), dict(name=f'{fn}_step', args=(ANY, i)), @@ -372,7 +373,17 @@ def _predict_batch(trainer, model, batches): return out -def test_trainer_model_hook_system_fit(tmpdir): +@pytest.mark.parametrize( + 'kwargs', + [ + {}, + # these precision plugins modify the optimization flow, so testing them explicitly + pytest.param(dict(gpus=1, precision=16, plugins='deepspeed'), marks=RunIf(deepspeed=True, min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend='native'), marks=RunIf(amp_native=True, min_gpus=1)), + pytest.param(dict(gpus=1, precision=16, amp_backend='apex'), marks=RunIf(amp_apex=True, min_gpus=1)), + ] +) +def test_trainer_model_hook_system_fit(tmpdir, kwargs): called = [] model = HookedModel(called) callback = HookedCallback(called) @@ -385,7 +396,8 @@ def test_trainer_model_hook_system_fit(tmpdir): limit_val_batches=val_batches, progress_bar_refresh_rate=0, weights_summary=None, - callbacks=[callback] + callbacks=[callback], + **kwargs, ) assert called == [ dict(name='Callback.on_init_start', args=(trainer, )), @@ -401,6 +413,9 @@ def test_trainer_model_hook_system_fit(tmpdir): 'pytorch-lightning_version': __version__, 'state_dict': ANY, } + if kwargs.get('amp_backend') == 'native': + saved_ckpt['native_amp_scaling_state'] = ANY + device = torch.device('cuda:0' if 'gpus' in kwargs else 'cpu') expected = [ dict(name='Callback.on_init_start', args=(trainer, )), dict(name='Callback.on_init_end', args=(trainer, )), @@ -426,7 +441,7 @@ def test_trainer_model_hook_system_fit(tmpdir): dict(name='zero_grad'), dict(name='Callback.on_validation_start', args=(trainer, model)), dict(name='on_validation_start'), - *model._eval_epoch('validation', trainer, model, val_batches, 'x'), + *model._eval_epoch('validation', trainer, model, val_batches, 'x', device=device), dict(name='Callback.on_validation_end', args=(trainer, model)), dict(name='on_validation_end'), dict(name='train'), @@ -442,13 +457,13 @@ def test_trainer_model_hook_system_fit(tmpdir): dict(name='on_epoch_start'), dict(name='Callback.on_train_epoch_start', args=(trainer, model)), dict(name='on_train_epoch_start'), - *model._train_batch(trainer, model, train_batches), + *model._train_batch(trainer, model, train_batches, device=device, **kwargs), dict(name='train', args=(False, )), dict(name='on_validation_model_eval'), dict(name='zero_grad'), dict(name='Callback.on_validation_start', args=(trainer, model)), dict(name='on_validation_start'), - *model._eval_epoch('validation', trainer, model, val_batches, 'x'), + *model._eval_epoch('validation', trainer, model, val_batches, 'x', device=device), dict(name='Callback.on_validation_end', args=(trainer, model)), # `ModelCheckpoint.save_checkpoint` is called here from `Callback.on_validation_end` dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)), From 0ce2295bed423a97dfaae34173d0d1d56c271e01 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 02:07:25 +0200 Subject: [PATCH 02/39] Fix tests --- tests/models/test_hooks.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index d7a962b1a708c..c8b2a79f893e7 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -265,6 +265,10 @@ def call(hook, fn, *args, **kwargs): d = {'name': hook} if args: d['args'] = args + elif hook == 'train': + # DeepSpeed calls `train(mode)` but we do not. Standardize + # https://github.com/microsoft/DeepSpeed/pull/571 + d['args'] = (True, ) if kwargs: d['kwargs'] = kwargs called.append(d) @@ -302,7 +306,7 @@ def _train_batch(trainer, model, batches, device=torch.device('cpu'), **kwargs): dict(name='on_before_zero_grad', args=(ANY, )), dict(name='optimizer_zero_grad', args=(0, i, ANY, 0)), # TODO: `on_before_backward` - dict(name='backward', args=(ANY, ANY, 0)), + *([dict(name='backward', args=(ANY, ANY, 0))] if kwargs.get('plugins') != 'deepspeed' else []), dict(name='Callback.on_after_backward', args=(trainer, model)), dict(name='on_after_backward'), # TODO: `on_before_optimizer_step` @@ -399,11 +403,14 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs): callbacks=[callback], **kwargs, ) + assert called == [ dict(name='Callback.on_init_start', args=(trainer, )), dict(name='Callback.on_init_end', args=(trainer, )), ] + trainer.fit(model) + saved_ckpt = { 'callbacks': ANY, 'epoch': 1, @@ -415,20 +422,28 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs): } if kwargs.get('amp_backend') == 'native': saved_ckpt['native_amp_scaling_state'] = ANY + elif kwargs.get('amp_backend') == 'apex': + saved_ckpt['amp_scaling_state'] = ANY device = torch.device('cuda:0' if 'gpus' in kwargs else 'cpu') + expected = [ dict(name='Callback.on_init_start', args=(trainer, )), dict(name='Callback.on_init_end', args=(trainer, )), dict(name='prepare_data'), dict(name='configure_callbacks'), dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)), + # FIXME + *([dict(name='train_dataloader')] if kwargs.get('plugins') == 'deepspeed' else []), dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='fit')), dict(name='setup', kwargs=dict(stage='fit')), dict(name='configure_sharded_model'), dict(name='Callback.on_configure_sharded_model', args=(trainer, model)), - dict(name='configure_optimizers'), + # FIXME + *([dict(name='configure_optimizers')] if kwargs.get('plugins') != 'deepspeed' else []), dict(name='Callback.on_fit_start', args=(trainer, model)), dict(name='on_fit_start'), + # FIXME + *([dict(name='configure_optimizers')] if kwargs.get('plugins') == 'deepspeed' else []), dict(name='Callback.on_pretrain_routine_start', args=(trainer, model)), dict(name='on_pretrain_routine_start'), dict(name='Callback.on_pretrain_routine_end', args=(trainer, model)), @@ -444,11 +459,11 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs): *model._eval_epoch('validation', trainer, model, val_batches, 'x', device=device), dict(name='Callback.on_validation_end', args=(trainer, model)), dict(name='on_validation_end'), - dict(name='train'), + dict(name='train', args=(True, )), dict(name='on_validation_model_train'), dict(name='Callback.on_sanity_check_end', args=(trainer, model)), # duplicate `train` because `_run_train` calls it again in case validation wasn't run - dict(name='train'), + dict(name='train', args=(True, )), dict(name='on_train_dataloader'), dict(name='train_dataloader'), dict(name='Callback.on_train_start', args=(trainer, model)), @@ -469,7 +484,7 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs): dict(name='Callback.on_save_checkpoint', args=(trainer, model, saved_ckpt)), dict(name='on_save_checkpoint', args=(saved_ckpt, )), dict(name='on_validation_end'), - dict(name='train'), + dict(name='train', args=(True, )), dict(name='on_validation_model_train'), dict(name='training_epoch_end', args=([dict(loss=ANY)] * train_batches, )), dict(name='Callback.on_train_epoch_end', args=(trainer, model, [dict(loss=ANY)] * train_batches)), @@ -582,7 +597,7 @@ def test_trainer_model_hook_system_eval(tmpdir, batches, verb, noun, dataloader, *model._eval_epoch(noun, trainer, model, batches, key), dict(name=f'Callback.on_{noun}_end', args=(trainer, model)), dict(name=f'on_{noun}_end'), - dict(name='train'), + dict(name='train', args=(True, )), dict(name=f'on_{noun}_model_train'), ] expected = [ From 72d5ee3b998f2c988a3c2c18915dce82e540170a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 13:48:58 +0200 Subject: [PATCH 03/39] Comments --- tests/models/test_hooks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index c8b2a79f893e7..9e557abeb879e 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -306,6 +306,7 @@ def _train_batch(trainer, model, batches, device=torch.device('cpu'), **kwargs): dict(name='on_before_zero_grad', args=(ANY, )), dict(name='optimizer_zero_grad', args=(0, i, ANY, 0)), # TODO: `on_before_backward` + # DeepSpeed handles backward internally *([dict(name='backward', args=(ANY, ANY, 0))] if kwargs.get('plugins') != 'deepspeed' else []), dict(name='Callback.on_after_backward', args=(trainer, model)), dict(name='on_after_backward'), @@ -432,17 +433,18 @@ def test_trainer_model_hook_system_fit(tmpdir, kwargs): dict(name='prepare_data'), dict(name='configure_callbacks'), dict(name='Callback.on_before_accelerator_backend_setup', args=(trainer, model)), - # FIXME + # DeepSpeed needs the batch size to figure out throughput logging *([dict(name='train_dataloader')] if kwargs.get('plugins') == 'deepspeed' else []), dict(name='Callback.setup', args=(trainer, model), kwargs=dict(stage='fit')), dict(name='setup', kwargs=dict(stage='fit')), dict(name='configure_sharded_model'), dict(name='Callback.on_configure_sharded_model', args=(trainer, model)), - # FIXME + # DeepSpeed skips initializing optimizers here as they are handled via config *([dict(name='configure_optimizers')] if kwargs.get('plugins') != 'deepspeed' else []), dict(name='Callback.on_fit_start', args=(trainer, model)), dict(name='on_fit_start'), - # FIXME + # TODO: explore whether DeepSpeed can have the same flow for optimizers + # DeepSpeed did not find any optimizer in the config so they are loaded here *([dict(name='configure_optimizers')] if kwargs.get('plugins') == 'deepspeed' else []), dict(name='Callback.on_pretrain_routine_start', args=(trainer, model)), dict(name='on_pretrain_routine_start'), From f34ee7e7dddb94fdf0deec008edf0749db1215fb Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 16:56:15 +0200 Subject: [PATCH 04/39] Fix message --- pytorch_lightning/plugins/precision/deepspeed_precision.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/precision/deepspeed_precision.py b/pytorch_lightning/plugins/precision/deepspeed_precision.py index f05fd4d54b811..3edcc6866e219 100644 --- a/pytorch_lightning/plugins/precision/deepspeed_precision.py +++ b/pytorch_lightning/plugins/precision/deepspeed_precision.py @@ -64,15 +64,14 @@ def backward( ) -> Tensor: if is_overridden('backward', model): warning_cache.warn( - "Overridden backward hook in the LightningModule will be ignored since DeepSpeed handles" - "backward logic outside of the LightningModule" + "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles" + " the backward logic internally." ) # todo: hack around for deepspeed engine to call backward deepspeed_engine = model.trainer.model deepspeed_engine.backward(closure_loss, *args, **kwargs) # once backward has been applied, release graph closure_loss = closure_loss.detach() - return closure_loss def clip_gradients( From 39c4a85a83cf32081b721f939ff83500b93f2dd3 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 16:32:38 +0200 Subject: [PATCH 05/39] Test CI error --- .azure-pipelines/gpu-tests.yml | 82 +++++++++++++++++----------------- tests/models/test_hooks.py | 22 +++++++++ 2 files changed, 63 insertions(+), 41 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..5a2276f315391 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -74,46 +74,46 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + pytest tests/accelerators/test_ddp.py::test_ddp_torch_dist_is_available_in_setup tests/models/test_hooks.py::test_ci_bug -v displayName: 'Testing: standard' - - bash: | - bash tests/special_tests.sh - displayName: 'Testing: special' - - - bash: | - python -m coverage report - python -m coverage xml - python -m coverage html - python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure - ls -l - displayName: 'Statistics' - - - task: PublishTestResults@2 - displayName: 'Publish test results' - inputs: - testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' - testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: succeededOrFailed() - - - task: PublishCodeCoverageResults@1 - displayName: 'Publish coverage report' - inputs: - codeCoverageTool: 'cobertura' - summaryFileLocation: 'coverage.xml' - reportDirectory: '$(Build.SourcesDirectory)/htmlcov' - testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' - condition: succeededOrFailed() - - - bash: | - python -m pytest benchmarks -v --maxfail=2 --durations=0 - displayName: 'Testing: benchmarks' - - - script: | - set -e - python -m pytest pl_examples -v --maxfail=2 --durations=0 - bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3 - bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2 - env: - PL_USE_MOCKED_MNIST: "1" - displayName: 'Examples' + #- bash: | + # bash tests/special_tests.sh + # displayName: 'Testing: special' +# + #- bash: | + # python -m coverage report + # python -m coverage xml + # python -m coverage html + # python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure + # ls -l + # displayName: 'Statistics' +# + #- task: PublishTestResults@2 + # displayName: 'Publish test results' + # inputs: + # testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' + # testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + # condition: succeededOrFailed() +# + #- task: PublishCodeCoverageResults@1 + # displayName: 'Publish coverage report' + # inputs: + # codeCoverageTool: 'cobertura' + # summaryFileLocation: 'coverage.xml' + # reportDirectory: '$(Build.SourcesDirectory)/htmlcov' + # testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' + # condition: succeededOrFailed() +# + #- bash: | + # python -m pytest benchmarks -v --maxfail=2 --durations=0 + # displayName: 'Testing: benchmarks' +# + #- script: | + # set -e + # python -m pytest pl_examples -v --maxfail=2 --durations=0 + # bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3 + # bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2 + # env: + # PL_USE_MOCKED_MNIST: "1" + # displayName: 'Examples' diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 9e557abeb879e..08a6dd40f49ae 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -378,6 +378,28 @@ def _predict_batch(trainer, model, batches): return out +@RunIf(deepspeed=True, min_gpus=1) +def test_ci_bug(tmpdir): + called = [] + model = HookedModel(called) + callback = HookedCallback(called) + train_batches = 2 + val_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=train_batches, + limit_val_batches=val_batches, + progress_bar_refresh_rate=0, + weights_summary=None, + callbacks=[callback], + gpus=1, + precision=16, + plugins='deepspeed', + ) + trainer.fit(model) + + @pytest.mark.parametrize( 'kwargs', [ From c3b458d1d9dd939d921f7e27864d023b11cac5a2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 18:58:51 +0200 Subject: [PATCH 06/39] Revert "Test CI error" This reverts commit 39c4a85a83cf32081b721f939ff83500b93f2dd3. --- .azure-pipelines/gpu-tests.yml | 82 +++++++++++++++++----------------- tests/models/test_hooks.py | 22 --------- 2 files changed, 41 insertions(+), 63 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5a2276f315391..5333bfd867da0 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -74,46 +74,46 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - pytest tests/accelerators/test_ddp.py::test_ddp_torch_dist_is_available_in_setup tests/models/test_hooks.py::test_ci_bug -v + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - #- bash: | - # bash tests/special_tests.sh - # displayName: 'Testing: special' -# - #- bash: | - # python -m coverage report - # python -m coverage xml - # python -m coverage html - # python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure - # ls -l - # displayName: 'Statistics' -# - #- task: PublishTestResults@2 - # displayName: 'Publish test results' - # inputs: - # testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' - # testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - # condition: succeededOrFailed() -# - #- task: PublishCodeCoverageResults@1 - # displayName: 'Publish coverage report' - # inputs: - # codeCoverageTool: 'cobertura' - # summaryFileLocation: 'coverage.xml' - # reportDirectory: '$(Build.SourcesDirectory)/htmlcov' - # testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' - # condition: succeededOrFailed() -# - #- bash: | - # python -m pytest benchmarks -v --maxfail=2 --durations=0 - # displayName: 'Testing: benchmarks' -# - #- script: | - # set -e - # python -m pytest pl_examples -v --maxfail=2 --durations=0 - # bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3 - # bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2 - # env: - # PL_USE_MOCKED_MNIST: "1" - # displayName: 'Examples' + - bash: | + bash tests/special_tests.sh + displayName: 'Testing: special' + + - bash: | + python -m coverage report + python -m coverage xml + python -m coverage html + python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure + ls -l + displayName: 'Statistics' + + - task: PublishTestResults@2 + displayName: 'Publish test results' + inputs: + testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() + + - task: PublishCodeCoverageResults@1 + displayName: 'Publish coverage report' + inputs: + codeCoverageTool: 'cobertura' + summaryFileLocation: 'coverage.xml' + reportDirectory: '$(Build.SourcesDirectory)/htmlcov' + testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)' + condition: succeededOrFailed() + + - bash: | + python -m pytest benchmarks -v --maxfail=2 --durations=0 + displayName: 'Testing: benchmarks' + + - script: | + set -e + python -m pytest pl_examples -v --maxfail=2 --durations=0 + bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3 + bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2 + env: + PL_USE_MOCKED_MNIST: "1" + displayName: 'Examples' diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 08a6dd40f49ae..9e557abeb879e 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -378,28 +378,6 @@ def _predict_batch(trainer, model, batches): return out -@RunIf(deepspeed=True, min_gpus=1) -def test_ci_bug(tmpdir): - called = [] - model = HookedModel(called) - callback = HookedCallback(called) - train_batches = 2 - val_batches = 2 - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=train_batches, - limit_val_batches=val_batches, - progress_bar_refresh_rate=0, - weights_summary=None, - callbacks=[callback], - gpus=1, - precision=16, - plugins='deepspeed', - ) - trainer.fit(model) - - @pytest.mark.parametrize( 'kwargs', [ From c700cabcc985f9780843d5a28f9d5974be588017 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:02:04 +0200 Subject: [PATCH 07/39] Add ddp training type teardown --- pytorch_lightning/plugins/training_type/ddp.py | 9 +++++++-- tests/accelerators/test_ddp.py | 4 +--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index ed320f37d7006..1ebed7c55405b 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -227,7 +227,7 @@ def setup_distributed(self): self.init_ddp_connection() # on world_size=0 let everyone know training is starting - if self.is_global_zero and not torch.distributed.is_initialized(): + if self.is_global_zero and not torch_distrib.is_initialized(): log.info("-" * 100) log.info(f"distributed_backend={self.distributed_backend}") log.info(f"All DDP processes registered. Starting ddp with {self.world_size} processes") @@ -297,7 +297,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt world_size = world_size if world_size is not None else self.cluster_environment.world_size() os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - if not torch.distributed.is_initialized(): + if not torch_distrib.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch_distrib.init_process_group(self.torch_distributed_backend, rank=global_rank, world_size=world_size) @@ -373,3 +373,8 @@ def register_plugins(cls, plugin_registry: Dict) -> None: description="DDP Plugin with `find_unused_parameters` as False", find_unused_parameters=False ) + + def teardown(self) -> None: + if torch_distrib.is_initialized(): + torch_distrib.destroy_process_group() + super().teardown() diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 3f335964a5eee..9f6b160567a84 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,7 +109,6 @@ class TestModel(BoringModel): def setup(self, stage: Optional[str] = None) -> None: assert torch.distributed.is_initialized() - raise SystemExit() model = TestModel() trainer = Trainer( @@ -118,8 +117,7 @@ def setup(self, stage: Optional[str] = None) -> None: accelerator="ddp", gpus=1, ) - with pytest.raises(SystemExit): - trainer.fit(model) + trainer.fit(model) @RunIf(min_gpus=2, min_torch="1.8.1", special=True) From e5602c92942caf6d54b9e0b3ce7f2fd90783950a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:10:42 +0200 Subject: [PATCH 08/39] Update CHANGELOG --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 176413cd55e76..d2cd3926381e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -285,6 +285,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) +- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) + + - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) From 52b2256164c08db4565d33b0aeded2678a781de8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 22 Jun 2021 19:00:07 +0200 Subject: [PATCH 09/39] Adrian's fix --- tests/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 7f6407ecfd82b..3f767d8b6fad2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from http.server import SimpleHTTPRequestHandler import pytest +import torch.distributed import torch.multiprocessing as mp @@ -41,6 +42,14 @@ def restore_env_variables(): os.environ.update(env_backup) +@pytest.fixture(scope="function", autouse=True) +def teardown_process_group(): + """ Ensures that the distributed process group gets closed before the next test runs. """ + yield + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + def pytest_configure(config): config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn") From 0b94b6c269cd04c3ec495a0beebd58bcda949b29 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 14:12:08 +0200 Subject: [PATCH 10/39] Use destructor --- pytorch_lightning/plugins/training_type/ddp.py | 3 +-- tests/accelerators/test_ddp.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 1ebed7c55405b..2ea19fb0c781b 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -374,7 +374,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None: find_unused_parameters=False ) - def teardown(self) -> None: + def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - super().teardown() diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index 9f6b160567a84..3f335964a5eee 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -109,6 +109,7 @@ class TestModel(BoringModel): def setup(self, stage: Optional[str] = None) -> None: assert torch.distributed.is_initialized() + raise SystemExit() model = TestModel() trainer = Trainer( @@ -117,7 +118,8 @@ def setup(self, stage: Optional[str] = None) -> None: accelerator="ddp", gpus=1, ) - trainer.fit(model) + with pytest.raises(SystemExit): + trainer.fit(model) @RunIf(min_gpus=2, min_torch="1.8.1", special=True) From aaf32abde54cfb1bf205ae8bea878e9ebe282ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 23 Jun 2021 14:14:54 +0200 Subject: [PATCH 11/39] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2cd3926381e3..1c6633fe88508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -285,7 +285,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) -- Destroy the distributed process group on DDP teardown ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) +- Destroy the distributed process group on DDP destructor ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) From 0444d541c67e4f236c2de084924bbf4fef36c5e9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 16:28:22 +0200 Subject: [PATCH 12/39] RPC destructor --- pytorch_lightning/plugins/training_type/rpc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index 3e0f57daef001..d8698e71bd261 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -83,3 +83,7 @@ def exit_rpc_process(self): @property def rpc_enabled(self) -> bool: return True + + def __del__(self): + self.exit_rpc_process() + super().__del__() From 5d4f811cd865ee8952b7f388c487af671e919bbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 23 Jun 2021 16:28:38 +0200 Subject: [PATCH 13/39] Update pytorch_lightning/plugins/training_type/ddp.py --- pytorch_lightning/plugins/training_type/ddp.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 2ea19fb0c781b..11601dec1fc6c 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -377,3 +377,6 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() From bf8766d392c021a4b20b4641ca9ee83bb042386c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 16:56:20 +0200 Subject: [PATCH 14/39] Why do you not work :( --- pytorch_lightning/plugins/training_type/rpc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index d8698e71bd261..f825732f7e316 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -85,5 +85,5 @@ def rpc_enabled(self) -> bool: return True def __del__(self): - self.exit_rpc_process() - super().__del__() + # avoid hang + ... From 48bcb7ed2ea7ac463e0dc39e5808416c56957404 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 23 Jun 2021 18:05:16 +0200 Subject: [PATCH 15/39] Missing condition --- pytorch_lightning/plugins/training_type/ddp.py | 7 ++++--- pytorch_lightning/plugins/training_type/rpc.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 11601dec1fc6c..88fe86420069e 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -377,6 +377,7 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + if self.on_gpu: + # clean up memory + with torch.cuda.device(self.root_device): + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/rpc.py b/pytorch_lightning/plugins/training_type/rpc.py index f825732f7e316..f20ece7ebbcf7 100644 --- a/pytorch_lightning/plugins/training_type/rpc.py +++ b/pytorch_lightning/plugins/training_type/rpc.py @@ -86,4 +86,4 @@ def rpc_enabled(self) -> bool: def __del__(self): # avoid hang - ... + pass From 21ad2d8234e053e08d97efd74a261fa76d6e8b56 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 04:46:44 +0200 Subject: [PATCH 16/39] Fix deepspeed test --- tests/plugins/test_deepspeed_plugin.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index c5eaadd1e5985..2e96ced4c0c26 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,3 +1,4 @@ +import gc import json import os from typing import Any, Dict @@ -265,6 +266,10 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)]) def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value): """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes.""" + # the previous parametrization can impact the current one as it's not guaranteed that resources will be released + # between parametrizations. This is important as we call `destroy_process_group` in `DDPPlugin.__del__`. + # Another option would be to not use `parametrize`: https://github.com/pytest-dev/pytest/discussions/8153 + gc.collect() class TestModel(BoringModel): From bbc489e313dccc95018d2f750a89f00c656a43d2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 04:57:54 +0200 Subject: [PATCH 17/39] GC collect in conftest --- tests/conftest.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 7f6407ecfd82b..6cbdc3c3783c2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import gc import os import sys import threading @@ -36,6 +37,9 @@ def restore_env_variables(): """ Ensures that environment variables set during the test do not leak out. """ env_backup = os.environ.copy() yield + # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared + # before `__del__` is called. Force the call by triggering garbage collection. + gc.collect() # restore environment as it was before running the test os.environ.clear() os.environ.update(env_backup) From 5b06fd2c2528e29929e56fcc765ad708a8a77586 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 05:43:43 +0200 Subject: [PATCH 18/39] Do not show warnings for special tests --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9fca3b62bad40..a87f50548d06b 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,7 +17,7 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings' # find tests marked as `@RunIf(special=True)` grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') From 5e69ed84f9b09bcd25fe70909dc94470287982c6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 05:44:05 +0200 Subject: [PATCH 19/39] Needs to run on 1.8 To avoid: "RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:32, unhandled cuda error, NCCL version 2.4.8" --- .azure-pipelines/gpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..bc7120bbc2ae6 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,12 +32,9 @@ jobs: # python.version: '3.7' # ToDo: this need to have installed docker in the base image... - #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 - #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6" container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6" - #endpoint: azureContainerRegistryConnection + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" workspace: From aed51a2c09213267283455efefb50f111c33384b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 15:45:22 +0200 Subject: [PATCH 20/39] Run torch 1.8 --- .azure-pipelines/gpu-tests.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5333bfd867da0..bc7120bbc2ae6 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -32,12 +32,9 @@ jobs: # python.version: '3.7' # ToDo: this need to have installed docker in the base image... - #container: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 - #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6" container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6" - #endpoint: azureContainerRegistryConnection + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" workspace: From e0a3e8785d2fecd63667da433a648f958d60ef89 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 16:01:33 +0200 Subject: [PATCH 21/39] Skip test due to 'Python bus error' --- tests/helpers/test_models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py index e4bb7e7df0827..61b33265d1458 100644 --- a/tests/helpers/test_models.py +++ b/tests/helpers/test_models.py @@ -23,11 +23,12 @@ @pytest.mark.parametrize( - "data_class,model_class", [ + "data_class,model_class", + [ (None, BoringModel), (None, BasicGAN), (None, ParityModuleRNN), - (None, ParityModuleMNIST), + # (None, ParityModuleMNIST), (ClassifDataModule, ClassificationModel), (RegressDataModule, RegressionModel), ] From 9ee2d193832d022dd95096e932476dedcbd990d4 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 16:34:26 +0200 Subject: [PATCH 22/39] Debug NCCL --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index bc7120bbc2ae6..f1b57f9233ae3 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -71,7 +71,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | From 3588aaa37723db12ee17969a80e4c90028c071ba Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:06:20 +0200 Subject: [PATCH 23/39] shm size --- .azure-pipelines/gpu-tests.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index f1b57f9233ae3..421ad96688d5a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -25,17 +25,11 @@ jobs: pool: gridai-spot-pool - #strategy: - # matrix: - # PT16: - # torch.version: '1.6' - # python.version: '3.7' - # ToDo: this need to have installed docker in the base image... container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all" + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" workspace: clean: all From 067bf1ae9eee271aaf3c4e4ac6bf9a50ba807fa2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:28:56 +0200 Subject: [PATCH 24/39] Disable warnings for special tests --- tests/special_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/special_tests.sh b/tests/special_tests.sh index 9fca3b62bad40..a87f50548d06b 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -17,7 +17,7 @@ set -e # this environment variable allows special tests to run export PL_RUNNING_SPECIAL_TESTS=1 # python arguments -defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no' +defaults='-m coverage run --source pytorch_lightning --append -m pytest --verbose --capture=no --disable-warnings' # find tests marked as `@RunIf(special=True)` grep_output=$(grep --recursive --line-number --word-regexp 'tests' 'benchmarks' --regexp 'special=True') From 6060b05215f0b824944bcabb2d7a4f3440625a96 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:29:25 +0200 Subject: [PATCH 25/39] Remove NCCL_DEBUG statement --- .azure-pipelines/gpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 421ad96688d5a..5499202bc690e 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -65,7 +65,7 @@ jobs: displayName: 'Get legacy checkpoints' - bash: | - NCCL_DEBUG=INFO python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 displayName: 'Testing: standard' - bash: | From f0fa1b74d0790a397702305a8cdd93ad7bcf18b7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:30:06 +0200 Subject: [PATCH 26/39] Try smaller shm size --- .azure-pipelines/gpu-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 5499202bc690e..b1fedd578bc85 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -28,8 +28,11 @@ jobs: # ToDo: this need to have installed docker in the base image... container: # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 + # run on torch 1.8 as it's the LTS version image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" - options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" + # default shm size is 64m. Increase it to avoid: + # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' + options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=256m" workspace: clean: all From 6dd70381ce88f8ac3459de4b9795a875d596c9f5 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 17:31:05 +0200 Subject: [PATCH 27/39] Revert "Skip test due to 'Python bus error'" This reverts commit e0a3e8785d2fecd63667da433a648f958d60ef89. --- tests/helpers/test_models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/helpers/test_models.py b/tests/helpers/test_models.py index 61b33265d1458..e4bb7e7df0827 100644 --- a/tests/helpers/test_models.py +++ b/tests/helpers/test_models.py @@ -23,12 +23,11 @@ @pytest.mark.parametrize( - "data_class,model_class", - [ + "data_class,model_class", [ (None, BoringModel), (None, BasicGAN), (None, ParityModuleRNN), - # (None, ParityModuleMNIST), + (None, ParityModuleMNIST), (ClassifDataModule, ClassificationModel), (RegressDataModule, RegressionModel), ] From 73e62f8aba385a3cad540c438fb500a46ded9648 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:15:47 +0200 Subject: [PATCH 28/39] README and adjust versions --- README.md | 4 ++-- requirements/adjust_versions.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7a540adadd327..78175f95c28fd 100644 --- a/README.md +++ b/README.md @@ -74,10 +74,10 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
- | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (latest) | 1.9 (nightly) | + | System / PyTorch ver. | 1.4 (min. req.) | 1.5 | 1.6 | 1.7 | 1.8 (LTS) | 1.9 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | Linux py3.7 [GPUs**] | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | - | + | Linux py3.7 [GPUs**] | - | - | - | - | [![Build Status](https://dev.azure.com/PytorchLightning/pytorch-lightning/_apis/build/status/PL.pytorch-lightning%20(GPUs)?branchName=master)](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | | Linux py3.{6,7} [TPUs***] | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.{6,7,8,9} | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.{6,7,8,9} | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | diff --git a/requirements/adjust_versions.py b/requirements/adjust_versions.py index a09128c6200db..84879b4e48a34 100644 --- a/requirements/adjust_versions.py +++ b/requirements/adjust_versions.py @@ -4,7 +4,8 @@ from typing import Dict, Optional VERSIONS = [ - dict(torch="1.9.0", torchvision="", torchtext=""), # nightly + dict(torch="1.10.0", torchvision="", torchtext=""), # nightly + dict(torch="1.9.0", torchvision="0.10.0", torchtext="0.10.0"), dict(torch="1.8.1", torchvision="0.9.1", torchtext="0.9.1"), dict(torch="1.8.0", torchvision="0.9.0", torchtext="0.9.0"), dict(torch="1.7.1", torchvision="0.8.2", torchtext="0.8.1"), From 902ef02b95fee49275b60a04ac8dbe9d6f682933 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:22:21 +0200 Subject: [PATCH 29/39] Avoid self.on_gpu call --- pytorch_lightning/plugins/training_type/ddp.py | 6 ++---- tests/conftest.py | 4 ---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index dcc78f7bc5d40..c04a4ab111a20 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -384,7 +384,5 @@ def register_plugins(cls, plugin_registry: Dict) -> None: def __del__(self) -> None: if torch_distrib.is_initialized(): torch_distrib.destroy_process_group() - if self.on_gpu: - # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + # `is_initialized` is checked inside and we already set the default device with `set_device(self.root_device)` + torch.cuda.empty_cache() diff --git a/tests/conftest.py b/tests/conftest.py index 6cbdc3c3783c2..7f6407ecfd82b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import gc import os import sys import threading @@ -37,9 +36,6 @@ def restore_env_variables(): """ Ensures that environment variables set during the test do not leak out. """ env_backup = os.environ.copy() yield - # if a destructor accesses an environment variable, we need to make sure that `os.environ` is not cleared - # before `__del__` is called. Force the call by triggering garbage collection. - gc.collect() # restore environment as it was before running the test os.environ.clear() os.environ.update(env_backup) From 4ce0f9a1feaa8e85e536f5dead658d17c65611c8 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 18:41:53 +0200 Subject: [PATCH 30/39] empty cache cleanup --- pytorch_lightning/accelerators/gpu.py | 5 +---- .../plugins/training_type/parallel.py | 3 +-- .../plugins/training_type/single_device.py | 3 +-- .../trainer/connectors/checkpoint_connector.py | 14 +++----------- pytorch_lightning/utilities/memory.py | 15 +++++++-------- 5 files changed, 13 insertions(+), 27 deletions(-) diff --git a/pytorch_lightning/accelerators/gpu.py b/pytorch_lightning/accelerators/gpu.py index 7543a2b794b5d..1c5ff56d805a6 100644 --- a/pytorch_lightning/accelerators/gpu.py +++ b/pytorch_lightning/accelerators/gpu.py @@ -42,10 +42,7 @@ def setup(self, trainer: 'pl.Trainer', model: 'pl.LightningModule') -> None: def on_train_start(self) -> None: # clear cache before training - # use context because of: - # https://discuss.pytorch.org/t/out-of-memory-when-i-use-torch-cuda-empty-cache/57898 - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() @staticmethod def set_nvidia_flags(local_rank: int) -> None: diff --git a/pytorch_lightning/plugins/training_type/parallel.py b/pytorch_lightning/plugins/training_type/parallel.py index 09e48a760e868..122a1423c2817 100644 --- a/pytorch_lightning/plugins/training_type/parallel.py +++ b/pytorch_lightning/plugins/training_type/parallel.py @@ -132,5 +132,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/plugins/training_type/single_device.py b/pytorch_lightning/plugins/training_type/single_device.py index 1816f5838c948..d4a328902eba0 100644 --- a/pytorch_lightning/plugins/training_type/single_device.py +++ b/pytorch_lightning/plugins/training_type/single_device.py @@ -85,5 +85,4 @@ def teardown(self) -> None: # GPU teardown self.lightning_module.cpu() # clean up memory - with torch.cuda.device(self.root_device): - torch.cuda.empty_cache() + torch.cuda.empty_cache() diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index c2a0411c0df36..0bc3145a99e59 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -21,13 +21,7 @@ import pytorch_lightning from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import ( - _OMEGACONF_AVAILABLE, - DeviceType, - rank_zero_deprecation, - rank_zero_info, - rank_zero_warn, -) +from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS @@ -69,8 +63,7 @@ def resume_start(self) -> None: return # clear cache before restore - if self.trainer._device_type == DeviceType.GPU: - torch.cuda.empty_cache() + torch.cuda.empty_cache() # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint. fs = get_filesystem(checkpoint_path) @@ -88,8 +81,7 @@ def resume_end(self) -> None: self._loaded_checkpoint = dict() # clear cache after restore - if self.trainer._device_type == DeviceType.GPU: - torch.cuda.empty_cache() + torch.cuda.empty_cache() # wait for all to catch up self.trainer.training_type_plugin.barrier("CheckpointConnector.resume_end") diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 6c01390a8c81e..0ae88e8995614 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -76,11 +76,10 @@ def is_out_of_cpu_memory(exception): def garbage_collection_cuda(): """Garbage collection Torch (CUDA) memory.""" gc.collect() - if torch.cuda.is_available(): - try: - # This is the last thing that should cause an OOM error, but seemingly it can. - torch.cuda.empty_cache() - except RuntimeError as exception: - if not is_oom_error(exception): - # Only handle OOM errors - raise + try: + # This is the last thing that should cause an OOM error, but seemingly it can. + torch.cuda.empty_cache() + except RuntimeError as exception: + if not is_oom_error(exception): + # Only handle OOM errors + raise From 738daa5bbccd5e00f65ea5f4c8b9218fea15839d Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 24 Jun 2021 19:06:25 +0200 Subject: [PATCH 31/39] More garbage collection --- tests/plugins/test_deepspeed_plugin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 2e96ced4c0c26..b609bc78d74fc 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -650,6 +650,8 @@ def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_opt """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. """ + gc.collect() + seed_everything(42) class VerificationCallback(Callback): From 236aa97bf35af324a43ca4f729f7ebdecff5fa3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 24 Jun 2021 21:23:50 +0200 Subject: [PATCH 32/39] Unroll parametrizations --- tests/callbacks/test_pruning.py | 41 ++++++++++++++++--- .../test_checkpoint_callback_frequency.py | 14 +++++-- tests/plugins/test_deepspeed_plugin.py | 16 +++++--- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index f198b29d24e84..1a5ddad64106e 100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -162,13 +162,44 @@ def test_pruning_callback( @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize("parameters_to_prune", [False, True]) -@pytest.mark.parametrize("use_global_unstructured", [False, True]) -def test_pruning_callback_ddp(tmpdir, use_global_unstructured: bool, parameters_to_prune: bool): +def test_pruning_callback_ddp_0(tmpdir): train_with_pruning_callback( tmpdir, - parameters_to_prune=parameters_to_prune, - use_global_unstructured=use_global_unstructured, + parameters_to_prune=False, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_1(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=False, + use_global_unstructured=True, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_2(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=False, + accelerator="ddp", + gpus=2, + ) + + +@RunIf(special=True, min_gpus=2) +def test_pruning_callback_ddp_3(tmpdir): + train_with_pruning_callback( + tmpdir, + parameters_to_prune=True, + use_global_unstructured=True, accelerator="ddp", gpus=2, ) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index 9fdd69dba7a9a..c5afecc2b4bf3 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -105,10 +105,18 @@ def training_step(self, batch, batch_idx): assert save_mock.call_count == expected -@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)]) -def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): +def test_top_k_ddp_0(tmpdir): + _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) + + +@RunIf(special=True, min_gpus=2) +def test_top_k_ddp_1(tmpdir): + _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) + + +@mock.patch('torch.save') +def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index b609bc78d74fc..b443827cac70c 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -644,14 +644,10 @@ def test_deepspeed_multigpu_stage_3_checkpointing_full_weights_manual(tmpdir): run_checkpoint_test(tmpdir, save_full_weights=True, automatic_optimization=False, accumulate_grad_batches=1) -@RunIf(min_gpus=2, deepspeed=True, special=True) -@pytest.mark.parametrize('offload_optimizer', [True, False]) -def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): +def _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer): """ Test to ensure with Stage 2 and multiple GPUs, accumulated grad batches works. """ - gc.collect() - seed_everything(42) class VerificationCallback(Callback): @@ -678,6 +674,16 @@ def on_train_batch_start( trainer.fit(model, datamodule=dm) +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=False) + + +@RunIf(min_gpus=2, deepspeed=True, special=True) +def test_deepspeed_multigpu_stage_2_accumulated_grad_batches_offload_optimizer(tmpdir): + _deepspeed_multigpu_stage_2_accumulated_grad_batches(tmpdir, offload_optimizer=True) + + @RunIf(min_gpus=2, deepspeed=True, special=True) def test_deepspeed_multigpu_test(tmpdir, deepspeed_config): """ From ffa532d3bb888606438577f98e7ae512fa28a0cd Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 00:30:21 +0200 Subject: [PATCH 33/39] Do not reuse mock --- .../test_checkpoint_callback_frequency.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py index c5afecc2b4bf3..67db594aa2539 100644 --- a/tests/checkpointing/test_checkpoint_callback_frequency.py +++ b/tests/checkpointing/test_checkpoint_callback_frequency.py @@ -105,17 +105,18 @@ def training_step(self, batch, batch_idx): assert save_mock.call_count == expected +@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_0(tmpdir): - _top_k_ddp(tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) +def test_top_k_ddp_0(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=1, epochs=1, val_check_interval=1.0, expected=1) +@mock.patch('torch.save') @RunIf(special=True, min_gpus=2) -def test_top_k_ddp_1(tmpdir): - _top_k_ddp(tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) +def test_top_k_ddp_1(save_mock, tmpdir): + _top_k_ddp(save_mock, tmpdir, k=2, epochs=2, val_check_interval=0.3, expected=5) -@mock.patch('torch.save') def _top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected): class TestModel(BoringModel): From e190089054817f6d62cfd7433774146ac82f9a81 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 01:35:15 +0200 Subject: [PATCH 34/39] Undo changes --- tests/conftest.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3f767d8b6fad2..7f6407ecfd82b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,7 +18,6 @@ from http.server import SimpleHTTPRequestHandler import pytest -import torch.distributed import torch.multiprocessing as mp @@ -42,14 +41,6 @@ def restore_env_variables(): os.environ.update(env_backup) -@pytest.fixture(scope="function", autouse=True) -def teardown_process_group(): - """ Ensures that the distributed process group gets closed before the next test runs. """ - yield - if torch.distributed.is_available() and torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - def pytest_configure(config): config.addinivalue_line("markers", "spawn: spawn test in a separate process using torch.multiprocessing.spawn") From 261a166194872fb3d031496dd552d463a9a135ed Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 25 Jun 2021 01:40:40 +0200 Subject: [PATCH 35/39] Undo notebooks modification --- _notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_notebooks b/_notebooks index 3321b468e7816..29aea106edefc 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 3321b468e78167aaf056894e92ed6d649c76e89e +Subproject commit 29aea106edefc9d1904c0c17223a8ac2b15c48e7 From 33a68d43e2e99cfe5aa130dee0cc8451dff9e713 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sat, 3 Jul 2021 20:07:23 +0200 Subject: [PATCH 36/39] Undo --- CHANGELOG.md | 3 --- pytorch_lightning/plugins/training_type/ddp.py | 2 +- tests/plugins/test_deepspeed_plugin.py | 5 ----- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3e162ae1bea4..2256dcefeac31 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -327,9 +327,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Support manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) -- Destroy the distributed process group on DDP destructor ([#8080](https://github.com/PyTorchLightning/pytorch-lightning/pull/8080)) - - - Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) diff --git a/pytorch_lightning/plugins/training_type/ddp.py b/pytorch_lightning/plugins/training_type/ddp.py index 98ac485c205c2..a882390b78b0d 100644 --- a/pytorch_lightning/plugins/training_type/ddp.py +++ b/pytorch_lightning/plugins/training_type/ddp.py @@ -321,7 +321,7 @@ def init_ddp_connection(self, global_rank: Optional[int] = None, world_size: Opt world_size = world_size if world_size is not None else self.cluster_environment.world_size() os.environ["MASTER_ADDR"] = self.cluster_environment.master_address() os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) - if not torch_distrib.is_initialized(): + if not torch.distributed.is_initialized(): log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") torch.distributed.init_process_group( self.torch_distributed_backend, rank=global_rank, world_size=world_size diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 6c238ab747350..efe8da981c9eb 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,4 +1,3 @@ -import gc import json import os from typing import Any, Dict @@ -266,10 +265,6 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args (RandomIterableDataset, "auto"), (RandomIterableDataset, 10)]) def test_deepspeed_auto_batch_size_config_select(tmpdir, dataset_cls, value): """Test to ensure that the batch size is correctly set as expected for deepspeed logging purposes.""" - # the previous parametrization can impact the current one as it's not guaranteed that resources will be released - # between parametrizations. This is important as we call `destroy_process_group` in `DDPPlugin.__del__`. - # Another option would be to not use `parametrize`: https://github.com/pytest-dev/pytest/discussions/8153 - gc.collect() class TestModel(BoringModel): From ac006c75a6105ebd461c7461d1e9258c69f5e8fb Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sat, 3 Jul 2021 20:09:24 +0200 Subject: [PATCH 37/39] Fix test --- tests/models/test_hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 859ca504ad5f6..630166f4e40a9 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -574,7 +574,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir): dict(name='on_pretrain_routine_start'), dict(name='Callback.on_pretrain_routine_end', args=(trainer, model)), dict(name='on_pretrain_routine_end'), - dict(name='train'), + dict(name='train', args=(True, )), dict(name='on_train_dataloader'), dict(name='train_dataloader'), # even though no validation runs, we initialize the val dataloader for properties like `num_val_batches` From a5becf4afdd619753f3d6533a52385871270b4e6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Sat, 3 Jul 2021 20:34:06 +0200 Subject: [PATCH 38/39] Update test --- tests/plugins/test_deepspeed_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index efe8da981c9eb..dcb4ff00b219b 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -256,7 +256,7 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args gpus=1, precision=16, ) - with pytest.warns(UserWarning, match='Overridden backward hook in the LightningModule will be ignored'): + with pytest.warns(UserWarning, match='will be ignored since DeepSpeed handles the backward'): trainer.fit(model) From 88b3183122525c7e66196204479bfa77b0731c0c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 5 Jul 2021 12:19:21 +0200 Subject: [PATCH 39/39] Fix merge --- tests/models/test_hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 08d4d86b3c931..789959e38908a 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -293,7 +293,7 @@ def _train_batch(trainer, model, batches, device=torch.device('cpu'), current_ep for i in range(batches): out.extend([ dict(name='on_before_batch_transfer', args=(ANY, 0)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='transfer_batch_to_device', args=(ANY, device, 0)), dict(name='on_after_batch_transfer', args=(ANY, 0)), # TODO: `on_batch_{start,end}` dict(name='Callback.on_batch_start', args=(trainer, model)), @@ -345,7 +345,7 @@ def _eval_batch(fn, trainer, model, batches, key, device=torch.device('cpu')): for i in range(batches): out.extend([ dict(name='on_before_batch_transfer', args=(ANY, 0)), - dict(name='transfer_batch_to_device', args=(ANY, torch.device('cpu'), 0)), + dict(name='transfer_batch_to_device', args=(ANY, device, 0)), dict(name='on_after_batch_transfer', args=(ANY, 0)), # TODO: `{,Callback}.on_batch_{start,end}` dict(name=f'Callback.on_{fn}_batch_start', args=(trainer, model, ANY, i, 0)),