From de633d7e52d8575481f3eb39808ec49d8abe61df Mon Sep 17 00:00:00 2001 From: Jon Ashby Date: Thu, 2 Dec 2021 14:24:37 +0000 Subject: [PATCH 01/10] Forward cpu offloading args to deepspeed correctly and add tests for same --- .../plugins/training_type/deepspeed.py | 4 +-- tests/plugins/test_deepspeed_plugin.py | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 3a704fc5f1848..ad86222ea6a8d 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -526,8 +526,8 @@ def _set_deepspeed_activation_checkpointing(self): deepspeed.checkpointing.configure( mpu_=None, partition_activations=checkpoint_config.get("partition_activations"), - contiguous_checkpointing=checkpoint_config.get("contiguous_checkpointing"), - checkpoint_in_cpu=checkpoint_config.get("checkpoint_in_cpu"), + contiguous_checkpointing=checkpoint_config.get("contiguous_memory_optimization"), + checkpoint_in_cpu=checkpoint_config.get("cpu_checkpointing"), profile=checkpoint_config.get("profile"), ) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 7cca6f6724656..f7ce1992702f0 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -24,6 +24,7 @@ from tests.helpers.datamodules import ClassifDataModule from tests.helpers.runif import RunIf + if _DEEPSPEED_AVAILABLE: import deepspeed from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict @@ -361,6 +362,32 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config["synchronize_checkpoint_boundary"] +@RunIf(min_gpus=1, deepspeed=True) +@mock.patch("deepspeed.checkpointing.configure", autospec=True, wraps=deepspeed.checkpointing.configure) +def test_deepspeed_custom_activation_checkpointing_params_forwarded(deepspeed_checkpointing_configure, tmpdir): + """Ensure if we modify the activation checkpointing parameters, we pass these + to deepspeed.checkpointing.configure correctly.""" + ds = DeepSpeedPlugin( + partition_activations=True, + cpu_checkpointing=True, + contiguous_memory_optimization=True, + synchronize_checkpoint_boundary=True, + ) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=1, + strategy=ds, + precision=16, + gpus=1, + ) + trainer.fit(model) + + deepspeed_checkpointing_configure.assert_called_with(mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None) + + @RunIf(min_gpus=1, deepspeed=True) def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config): """Ensure if we use a config and turn off offload_optimizer, that this is set to False within the config.""" From c3ca703dff2c8ce2f4298f932e8a26f6d3530d01 Mon Sep 17 00:00:00 2001 From: Jon Ashby Date: Thu, 2 Dec 2021 14:36:01 +0000 Subject: [PATCH 02/10] whitespace fix --- tests/plugins/test_deepspeed_plugin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index f7ce1992702f0..fefa0de5426a2 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -24,7 +24,6 @@ from tests.helpers.datamodules import ClassifDataModule from tests.helpers.runif import RunIf - if _DEEPSPEED_AVAILABLE: import deepspeed from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict From 13ca64767a9c4bbbe7d9eb126527322d25039a9c Mon Sep 17 00:00:00 2001 From: Jon Ashby Date: Thu, 2 Dec 2021 14:56:43 +0000 Subject: [PATCH 03/10] add to changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ed0408bfbb15..56fe362b221aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -231,6 +231,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) +- Fixed a bug where the DeepSpeedPlugin arguments cpu_checkpointing and contiguous_memory_optimization were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) + + - From cb0e45daf359f745f371c81649866bc885e80086 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:08:15 +0000 Subject: [PATCH 04/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/plugins/test_deepspeed_plugin.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index fefa0de5426a2..6205200645bf7 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -364,8 +364,8 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): @RunIf(min_gpus=1, deepspeed=True) @mock.patch("deepspeed.checkpointing.configure", autospec=True, wraps=deepspeed.checkpointing.configure) def test_deepspeed_custom_activation_checkpointing_params_forwarded(deepspeed_checkpointing_configure, tmpdir): - """Ensure if we modify the activation checkpointing parameters, we pass these - to deepspeed.checkpointing.configure correctly.""" + """Ensure if we modify the activation checkpointing parameters, we pass these to + deepspeed.checkpointing.configure correctly.""" ds = DeepSpeedPlugin( partition_activations=True, cpu_checkpointing=True, @@ -384,7 +384,9 @@ def test_deepspeed_custom_activation_checkpointing_params_forwarded(deepspeed_ch ) trainer.fit(model) - deepspeed_checkpointing_configure.assert_called_with(mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None) + deepspeed_checkpointing_configure.assert_called_with( + mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None + ) @RunIf(min_gpus=1, deepspeed=True) From 7b967a0002e09644ffdeeef900e8e9a25cbd134f Mon Sep 17 00:00:00 2001 From: Jon Ashby Date: Thu, 2 Dec 2021 15:17:58 +0000 Subject: [PATCH 05/10] move mock inside test to avoid failures on machines without deepspeed --- tests/plugins/test_deepspeed_plugin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index fefa0de5426a2..6991991935e5a 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -362,8 +362,7 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): @RunIf(min_gpus=1, deepspeed=True) -@mock.patch("deepspeed.checkpointing.configure", autospec=True, wraps=deepspeed.checkpointing.configure) -def test_deepspeed_custom_activation_checkpointing_params_forwarded(deepspeed_checkpointing_configure, tmpdir): +def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): """Ensure if we modify the activation checkpointing parameters, we pass these to deepspeed.checkpointing.configure correctly.""" ds = DeepSpeedPlugin( @@ -382,7 +381,8 @@ def test_deepspeed_custom_activation_checkpointing_params_forwarded(deepspeed_ch precision=16, gpus=1, ) - trainer.fit(model) + with mock.patch("deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure) as deepspeed_checkpointing_configure: + trainer.fit(model) deepspeed_checkpointing_configure.assert_called_with(mpu_=None, partition_activations=True, contiguous_checkpointing=True, checkpoint_in_cpu=True, profile=None) From 36f9d407986774cc144a67bb9f04bba0e65e6cbc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:22:00 +0000 Subject: [PATCH 06/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/plugins/test_deepspeed_plugin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 9613244579557..8b1b14402a07f 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -381,7 +381,9 @@ def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): precision=16, gpus=1, ) - with mock.patch("deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure) as deepspeed_checkpointing_configure: + with mock.patch( + "deepspeed.checkpointing.configure", wraps=deepspeed.checkpointing.configure + ) as deepspeed_checkpointing_configure: trainer.fit(model) deepspeed_checkpointing_configure.assert_called_with( From 55b2b45843ca2dac878fdbe10f984b81558a3f8f Mon Sep 17 00:00:00 2001 From: jona-0 <77344640+jona-0@users.noreply.github.com> Date: Tue, 7 Dec 2021 08:52:34 +0000 Subject: [PATCH 07/10] Update tests/plugins/test_deepspeed_plugin.py Sounds good Co-authored-by: Sean Naren --- tests/plugins/test_deepspeed_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 8b1b14402a07f..3e017d91d0711 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -361,7 +361,7 @@ def test_deepspeed_custom_activation_checkpointing_params(tmpdir): assert checkpoint_config["synchronize_checkpoint_boundary"] -@RunIf(min_gpus=1, deepspeed=True) +@RunIf(min_gpus=1, deepspeed=True, standalone=True) def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): """Ensure if we modify the activation checkpointing parameters, we pass these to deepspeed.checkpointing.configure correctly.""" From cc3cf9131b11a5f00ce82be4096f128b3811c995 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 7 Dec 2021 20:30:26 +0000 Subject: [PATCH 08/10] Update CHANGELOG.md Co-authored-by: ananthsub --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index deedb5477e644..3e51ba3c436e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -259,7 +259,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) -- Fixed a bug where the DeepSpeedPlugin arguments cpu_checkpointing and contiguous_memory_optimization were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) +- Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) From e271481972555e0fa2799724786904226355eb2a Mon Sep 17 00:00:00 2001 From: Jon Ashby Date: Wed, 8 Dec 2021 09:23:17 +0000 Subject: [PATCH 09/10] bad automerge - this change is not in 1.5.5 so move to 1.6.0 section --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a4a99efa6678..3ccef7bfa0e47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -238,7 +238,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- +- Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - @@ -256,7 +256,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/PyTorchLightning/pytorch-lightning/pull/10369)) - Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/PyTorchLightning/pytorch-lightning/pull/10837)) - Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) -- Fixed a bug where the DeepSpeedPlugin arguments `cpu_checkpointing` and `contiguous_memory_optimization` were not being forwarded to deepspeed correctly ([#10874](https://github.com/PyTorchLightning/pytorch-lightning/issues/10874)) - Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917)) - Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/PyTorchLightning/pytorch-lightning/pull/10836)) - Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind ([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) From 24ded0fc70f95d211d8163d46af84c83b2a6ee05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 14 Dec 2021 16:23:32 +0100 Subject: [PATCH 10/10] Update tests/plugins/test_deepspeed_plugin.py --- tests/plugins/test_deepspeed_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 3e017d91d0711..4b56e9d389174 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -376,7 +376,7 @@ def test_deepspeed_custom_activation_checkpointing_params_forwarded(tmpdir): trainer = Trainer( default_root_dir=tmpdir, enable_progress_bar=False, - max_epochs=1, + fast_dev_run=1, strategy=ds, precision=16, gpus=1,