From 1c1a1d6546a7d5cd42de5ee2fb666510c24b2deb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Tue, 26 Jul 2022 16:28:32 +0200
Subject: [PATCH 01/11] Upgrade DeepSpeed

---
 .azure/gpu-tests.yml                | 1 +
 requirements/pytorch/strategies.txt | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index b5dbd9e3340c7..c3b8b22d1e286 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -73,6 +73,7 @@ jobs:
       CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
       pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
       pip install -e .[strategies]
+      pip install deepspeed -U # revert me
       pip install --requirement requirements/pytorch/devel.txt
       pip list
     env:
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 2b69c8ba76b81..2b7792f3c1ab2 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -1,5 +1,5 @@
 fairscale>=0.4.5, <=0.4.6
-deepspeed<0.6.0
+deepspeed<0.7.0
 # no need to install with [pytorch] as pytorch is already installed
 horovod>=0.21.2, !=0.24.0, <0.25.1
 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'

From 71892bcb3bbbccf4f36128e5b6de12e5d41f3da5 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Tue, 26 Jul 2022 20:01:39 +0200
Subject: [PATCH 02/11] skip test with deepspeed >= 0.6.5

---
 tests/tests_pytorch/lite/test_lite.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py
index c0439854013a2..16d4260c52252 100644
--- a/tests/tests_pytorch/lite/test_lite.py
+++ b/tests/tests_pytorch/lite/test_lite.py
@@ -29,6 +29,7 @@
 from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy
 from pytorch_lightning.utilities import _StrategyType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _RequirementAvailable
 from pytorch_lightning.utilities.seed import pl_worker_init_function
 from tests_pytorch.helpers.runif import RunIf

@@ -400,6 +401,8 @@ def test_autocast():
     lite._precision_plugin.forward_context().__exit__.assert_called()


+# https://github.com/microsoft/DeepSpeed/issues/2139
+@pytest.mark.skipif(_RequirementAvailable("deepspeed>=0.6.5"), reason="Lite does not support 0.6.5")
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multiple_models():
     class Lite(LightningLite):

From 1aa264c9972091fd2a1fa02be4afad5314cbf1fa Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 27 Jul 2022 15:39:15 +0200
Subject: [PATCH 03/11] one None too many

---
 src/pytorch_lightning/core/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index a66c7679b3ee0..7697d4160ea74 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1389,7 +1389,7 @@ def training_step(...):
             **kwargs: Additional keyword arguments to be forwarded to :meth:`~torch.Tensor.backward`
         """
         self._verify_is_manual_optimization("manual_backward")
-        self.trainer.strategy.backward(loss, None, None, *args, **kwargs)
+        self.trainer.strategy.backward(loss, None, *args, **kwargs)

     def backward(
         self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args, **kwargs

From ae046199c008bf4c85bfd44c9172d108faed10c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 27 Jul 2022 15:41:09 +0200
Subject: [PATCH 04/11] debug

---
 tests/tests_pytorch/run_standalone_tests.sh | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh
index 5297cbd033347..ee9488ee79ccd 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/tests_pytorch/run_standalone_tests.sh
@@ -18,16 +18,16 @@ set -e
 # Batch size for testing: Determines how many standalone test invocations run in parallel
 test_batch_size=6

-while getopts "b:" opt; do
-  case $opt in
-    b)
-      test_batch_size=$OPTARG;;
-    *)
-      echo "Usage: $(basename $0) [-b batch_size]"
-      exit 1;;
-  esac
-done
-shift $((OPTIND-1))
+#while getopts "b:" opt; do
+#  case $opt in
+#    b)
+#      test_batch_size=$OPTARG;;
+#    *)
+#      echo "Usage: $(basename $0) [-b batch_size]"
+#      exit 1;;
+#  esac
+#done
+#shift $((OPTIND-1))

 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1

From f3d6c5993dc69d198ba706260201bcce1bb11e05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 27 Jul 2022 15:41:49 +0200
Subject: [PATCH 05/11] debug

---
 src/pytorch_lightning/plugins/precision/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py
index 96458487c7420..c3b29973c1922 100644
--- a/src/pytorch_lightning/plugins/precision/deepspeed.py
+++ b/src/pytorch_lightning/plugins/precision/deepspeed.py
@@ -78,7 +78,7 @@ def backward(self, model: "pl.LightningModule", closure_loss: Tensor, *args: Any
                 " the backward logic internally."
             )
         deepspeed_engine: "deepspeed.DeepSpeedEngine" = model.trainer.model
-        deepspeed_engine.backward(closure_loss, *args, **kwargs)
+        deepspeed_engine.backward(closure_loss)

     def _run_backward(
         self, tensor: Tensor, model: Optional["deepspeed.DeepSpeedEngine"], *args: Any, **kwargs: Any

From 2c6fb988e606cf5b4ce465a4c2080eb51cc10ebe Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 27 Jul 2022 15:56:34 +0200
Subject: [PATCH 06/11] fixes

---
 src/pytorch_lightning/core/module.py                        | 2 +-
 src/pytorch_lightning/plugins/precision/apex_amp.py         | 3 ++-
 src/pytorch_lightning/plugins/precision/deepspeed.py        | 4 ++--
 src/pytorch_lightning/plugins/precision/ipu.py              | 2 +-
 src/pytorch_lightning/plugins/precision/precision_plugin.py | 3 ++-
 src/pytorch_lightning/strategies/strategy.py                | 4 ++--
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 7697d4160ea74..6beb6352390c7 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1389,7 +1389,7 @@ def training_step(...):
             **kwargs: Additional keyword arguments to be forwarded to :meth:`~torch.Tensor.backward`
         """
         self._verify_is_manual_optimization("manual_backward")
-        self.trainer.strategy.backward(loss, None, *args, **kwargs)
+        self.trainer.strategy.backward(loss, *args, **kwargs)

     def backward(
         self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args, **kwargs
diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py
index 15825dedd2ef6..e18f82dc27f6e 100644
--- a/src/pytorch_lightning/plugins/precision/apex_amp.py
+++ b/src/pytorch_lightning/plugins/precision/apex_amp.py
@@ -59,6 +59,7 @@ def backward(
         model: "pl.LightningModule",
         closure_loss: Tensor,
         optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
         *args: Any,
         **kwargs: Any,
     ) -> None:
@@ -71,7 +72,7 @@ def backward(
         """
         opt = optimizer or model.trainer.optimizers
         with amp.scale_loss(closure_loss, opt) as closure_loss:
-            super().backward(model, closure_loss, optimizer, *args, **kwargs)
+            super().backward(model, closure_loss, optimizer, optimizer_idx, *args, **kwargs)

     def optimizer_step(
         self,
diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py
index c3b29973c1922..9c8d33eb6acde 100644
--- a/src/pytorch_lightning/plugins/precision/deepspeed.py
+++ b/src/pytorch_lightning/plugins/precision/deepspeed.py
@@ -71,14 +71,14 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona
         self.amp_type = amp_type
         self.amp_level = amp_level

-    def backward(self, model: "pl.LightningModule", closure_loss: Tensor, *args: Any, **kwargs: Any) -> None:
+    def backward(self, model: "pl.LightningModule", closure_loss: Tensor, optimizer: Optional[Optimizer], *args: Any, **kwargs: Any) -> None:
         if is_overridden("backward", model):
             warning_cache.warn(
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles"
                 " the backward logic internally."
             )
         deepspeed_engine: "deepspeed.DeepSpeedEngine" = model.trainer.model
-        deepspeed_engine.backward(closure_loss)
+        deepspeed_engine.backward(closure_loss, *args, **kwargs)

     def _run_backward(
         self, tensor: Tensor, model: Optional["deepspeed.DeepSpeedEngine"], *args: Any, **kwargs: Any
diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py
index 329a8b8978e50..89f544575f63f 100644
--- a/src/pytorch_lightning/plugins/precision/ipu.py
+++ b/src/pytorch_lightning/plugins/precision/ipu.py
@@ -44,7 +44,7 @@ def __init__(self, precision: int) -> None:
         super().__init__()
         self.precision = precision

-    def backward(self, model: "pl.LightningModule", *args: Any, **kwargs: Any) -> None:
+    def backward(self, model: "pl.LightningModule", *_: Any, **__: Any) -> None:
         if is_overridden("backward", model):
             warning_cache.warn(
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since IPUs handle"
diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py
index cbf18b8c4fa41..02d343a0876b4 100644
--- a/src/pytorch_lightning/plugins/precision/precision_plugin.py
+++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py
@@ -64,6 +64,7 @@ def backward(
         model: "pl.LightningModule",
         closure_loss: Tensor,
         optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
         *args: Any,
         **kwargs: Any,
     ) -> None:
@@ -76,7 +77,7 @@ def backward(
         """
         # do backward pass
         if model is not None and isinstance(model, pl.LightningModule):
-            model.backward(closure_loss, optimizer, *args, **kwargs)
+            model.backward(closure_loss, optimizer, optimizer_idx, *args, **kwargs)
         else:
             self._run_backward(closure_loss, *args, **kwargs)

diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py
index f47afc890bcbb..69e0c8046833a 100644
--- a/src/pytorch_lightning/strategies/strategy.py
+++ b/src/pytorch_lightning/strategies/strategy.py
@@ -171,7 +171,7 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]:
         """
         return optimizer.state_dict()

-    def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor:
+    def backward(self, closure_loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args: Any, **kwargs: Any) -> Tensor:
         """Forwards backward-calls to the precision plugin.

         Args:
@@ -181,7 +181,7 @@ def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor:
         assert self.lightning_module is not None
         closure_loss = self.precision_plugin.pre_backward(self.lightning_module, closure_loss)

-        self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
+        self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, optimizer_idx, *args, **kwargs)

         closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss)
         self.post_backward(closure_loss)

From 46e5ff27355293858b5eaca73f8b8792f61d5938 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 27 Jul 2022 15:57:43 +0200
Subject: [PATCH 07/11] debug

---
 src/pytorch_lightning/core/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py
index 6beb6352390c7..a66c7679b3ee0 100644
--- a/src/pytorch_lightning/core/module.py
+++ b/src/pytorch_lightning/core/module.py
@@ -1389,7 +1389,7 @@ def training_step(...):
             **kwargs: Additional keyword arguments to be forwarded to :meth:`~torch.Tensor.backward`
         """
         self._verify_is_manual_optimization("manual_backward")
-        self.trainer.strategy.backward(loss, *args, **kwargs)
+        self.trainer.strategy.backward(loss, None, None, *args, **kwargs)

     def backward(
         self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args, **kwargs

From 96ab30ef40329a0ac38a36ab69a682471bd57684 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 27 Jul 2022 15:58:51 +0200
Subject: [PATCH 08/11] debug

---
 src/pytorch_lightning/plugins/precision/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py
index 9c8d33eb6acde..9b4f9e4e38247 100644
--- a/src/pytorch_lightning/plugins/precision/deepspeed.py
+++ b/src/pytorch_lightning/plugins/precision/deepspeed.py
@@ -71,7 +71,7 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona
         self.amp_type = amp_type
         self.amp_level = amp_level

-    def backward(self, model: "pl.LightningModule", closure_loss: Tensor, optimizer: Optional[Optimizer], *args: Any, **kwargs: Any) -> None:
+    def backward(self, model: "pl.LightningModule", closure_loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args: Any, **kwargs: Any) -> None:
         if is_overridden("backward", model):
             warning_cache.warn(
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles"

From 93fc582603be711f4af395584c437436bbcc3f43 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 27 Jul 2022 16:02:07 +0200
Subject: [PATCH 09/11] revert

---
 tests/tests_pytorch/run_standalone_tests.sh | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh
index ee9488ee79ccd..5297cbd033347 100644
--- a/tests/tests_pytorch/run_standalone_tests.sh
+++ b/tests/tests_pytorch/run_standalone_tests.sh
@@ -18,16 +18,16 @@ set -e
 # Batch size for testing: Determines how many standalone test invocations run in parallel
 test_batch_size=6

-#while getopts "b:" opt; do
-#  case $opt in
-#    b)
-#      test_batch_size=$OPTARG;;
-#    *)
-#      echo "Usage: $(basename $0) [-b batch_size]"
-#      exit 1;;
-#  esac
-#done
-#shift $((OPTIND-1))
+while getopts "b:" opt; do
+  case $opt in
+    b)
+      test_batch_size=$OPTARG;;
+    *)
+      echo "Usage: $(basename $0) [-b batch_size]"
+      exit 1;;
+  esac
+done
+shift $((OPTIND-1))

 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1

From a8828e1bec47a0081c3594faf2185c5cc0009307 Mon Sep 17 00:00:00 2001
From: awaelchli
Date: Wed, 27 Jul 2022 16:02:54 +0200
Subject: [PATCH 10/11] format

---
 src/pytorch_lightning/plugins/precision/deepspeed.py | 10 +++++++++-
 src/pytorch_lightning/strategies/strategy.py         |  9 ++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py
index 9b4f9e4e38247..7987c5958ab0a 100644
--- a/src/pytorch_lightning/plugins/precision/deepspeed.py
+++ b/src/pytorch_lightning/plugins/precision/deepspeed.py
@@ -71,7 +71,15 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona
         self.amp_type = amp_type
         self.amp_level = amp_level

-    def backward(self, model: "pl.LightningModule", closure_loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args: Any, **kwargs: Any) -> None:
+    def backward(
+        self,
+        model: "pl.LightningModule",
+        closure_loss: Tensor,
+        optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         if is_overridden("backward", model):
             warning_cache.warn(
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles"
diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py
index 69e0c8046833a..0de904ccbd283 100644
--- a/src/pytorch_lightning/strategies/strategy.py
+++ b/src/pytorch_lightning/strategies/strategy.py
@@ -171,7 +171,14 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]:
         """
         return optimizer.state_dict()

-    def backward(self, closure_loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args: Any, **kwargs: Any) -> Tensor:
+    def backward(
+        self,
+        closure_loss: Tensor,
+        optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
+        *args: Any,
+        **kwargs: Any,
+    ) -> Tensor:
         """Forwards backward-calls to the precision plugin.

         Args:

From 598f1e984b08933cffc3a7518c8401b937b05be8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 27 Jul 2022 18:09:30 +0200
Subject: [PATCH 11/11] Apply suggestions from code review

---
 tests/tests_pytorch/lite/test_lite.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py
index 7a2eff8483964..86a0a5a82195a 100644
--- a/tests/tests_pytorch/lite/test_lite.py
+++ b/tests/tests_pytorch/lite/test_lite.py
@@ -29,7 +29,6 @@
 from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy
 from pytorch_lightning.utilities import _StrategyType
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _RequirementAvailable
 from pytorch_lightning.utilities.seed import pl_worker_init_function
 from tests_pytorch.helpers.runif import RunIf

@@ -406,8 +405,6 @@ def test_autocast():
     lite._precision_plugin.forward_context().__exit__.assert_called()


-# https://github.com/microsoft/DeepSpeed/issues/2139
-@pytest.mark.skipif(_RequirementAvailable("deepspeed>=0.6.5"), reason="Lite does not support 0.6.5")
 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
 def test_deepspeed_multiple_models():
     class Lite(LightningLite):