From 1c1a1d6546a7d5cd42de5ee2fb666510c24b2deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 26 Jul 2022 16:28:32 +0200 Subject: [PATCH 1/6] Upgrade DeepSpeed --- .azure/gpu-tests.yml | 1 + requirements/pytorch/strategies.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index b5dbd9e3340c7..c3b8b22d1e286 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -73,6 +73,7 @@ jobs: CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] + pip install deepspeed -U # revert me pip install --requirement requirements/pytorch/devel.txt pip list env: diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 2b69c8ba76b81..2b7792f3c1ab2 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -1,5 +1,5 @@ fairscale>=0.4.5, <=0.4.6 -deepspeed<0.6.0 +deepspeed<0.7.0 # no need to install with [pytorch] as pytorch is already installed horovod>=0.21.2, !=0.24.0, <0.25.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' From 71892bcb3bbbccf4f36128e5b6de12e5d41f3da5 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Tue, 26 Jul 2022 20:01:39 +0200 Subject: [PATCH 2/6] skip test with deepspeed >= 0.6.5 --- tests/tests_pytorch/lite/test_lite.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index c0439854013a2..16d4260c52252 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -29,6 +29,7 @@ from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import pl_worker_init_function from tests_pytorch.helpers.runif import RunIf @@ -400,6 +401,8 @@ def test_autocast(): lite._precision_plugin.forward_context().__exit__.assert_called() +# https://github.com/microsoft/DeepSpeed/issues/2139 +@pytest.mark.skipif(_RequirementAvailable("deepspeed>=0.6.5"), reason="Lite does not support 0.6.5") @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multiple_models(): class Lite(LightningLite): From 16776c0bf2427f63bbc037e4a86a7e9dc89a1abf Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 27 Jul 2022 18:14:48 +0200 Subject: [PATCH 3/6] fix deepspeed summary --- .../utilities/deepspeed_model_summary.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/utilities/deepspeed_model_summary.py b/src/pytorch_lightning/utilities/deepspeed_model_summary.py index 89dd6a9f9a25f..45d55392df51d 100644 --- a/src/pytorch_lightning/utilities/deepspeed_model_summary.py +++ b/src/pytorch_lightning/utilities/deepspeed_model_summary.py @@ -17,7 +17,9 @@ from typing import Dict, List, Tuple import torch +from torch.nn import Parameter +from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_summary import ( _is_lazy_weight_tensor, get_human_readable_count, @@ -40,7 +42,11 @@ def num_parameters(self) -> int: @property def average_shard_parameters(self) -> int: """Returns the number of parameters in this module.""" - return sum(p.partitioned_size() if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) + + def partitioned_size(p: Parameter) -> int: + return p.partitioned_size() if _RequirementAvailable("deepspeed<0.6.6") else p.partition_numel() + + return sum(partitioned_size(p) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) class DeepSpeedSummary(ModelSummary): From 7bf92fcd96f12ae6907cfc314ec8f2765a1839d8 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 27 Jul 2022 18:35:52 +0200 Subject: [PATCH 4/6] add check for deepspeed support in Lite --- src/pytorch_lightning/lite/lite.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 0195e6852eb28..250ce7e9cfe15 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -40,6 +40,7 @@ has_iterable_dataset, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import seed_everything @@ -105,6 +106,8 @@ def __init__( self._precision_plugin = self._strategy.precision_plugin self._models_setup: int = 0 + self._check_deepspeed_support() + # wrap the run method so we can inject setup logic or spawn processes for the user setattr(self, "run", partial(self._run_impl, self.run)) @@ -454,6 +457,18 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N f" Choose one of {supported} or pass in a `Strategy` instance." ) + def _check_deepspeed_support(self) -> None: + if ( + isinstance(self._strategy, DeepSpeedStrategy) + and self._strategy.zero_stage_3 + and _RequirementAvailable("deepspeed>=0.6.5") + ): + # https://github.com/microsoft/DeepSpeed/issues/2139 + raise RuntimeError( + "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`." + " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available." + ) + @staticmethod def _supported_device_types() -> Sequence[_AcceleratorType]: return ( From d4ab9d1cc15867ebe61daf4b8a81cf9951708e9a Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 27 Jul 2022 18:49:47 +0200 Subject: [PATCH 5/6] test for compatibility error update --- tests/tests_pytorch/lite/test_lite.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 16d4260c52252..a8a7248ae5c0d 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import contextlib import os from copy import deepcopy from unittest import mock @@ -401,8 +402,6 @@ def test_autocast(): lite._precision_plugin.forward_context().__exit__.assert_called() -# https://github.com/microsoft/DeepSpeed/issues/2139 -@pytest.mark.skipif(_RequirementAvailable("deepspeed>=0.6.5"), reason="Lite does not support 0.6.5") @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multiple_models(): class Lite(LightningLite): @@ -466,4 +465,13 @@ def run(self): assert self.broadcast(True) assert self.is_global_zero == (self.local_rank == 0) - Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + if _RequirementAvailable("deepspeed>=0.6.5"): + # https://github.com/microsoft/DeepSpeed/issues/2139 + raise_if_deepspeed_incompatilbe = pytest.raises( + RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite" + ) + else: + raise_if_deepspeed_incompatilbe = contextlib.suppress() + + with raise_if_deepspeed_incompatilbe: + Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() From 80f7870cb0ae4e87d86aae223266bd44c351cee5 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Wed, 27 Jul 2022 18:50:34 +0200 Subject: [PATCH 6/6] fix typo --- tests/tests_pytorch/lite/test_lite.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index a8a7248ae5c0d..97603ed598fa6 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -467,11 +467,11 @@ def run(self): if _RequirementAvailable("deepspeed>=0.6.5"): # https://github.com/microsoft/DeepSpeed/issues/2139 - raise_if_deepspeed_incompatilbe = pytest.raises( + raise_if_deepspeed_incompatible = pytest.raises( RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite" ) else: - raise_if_deepspeed_incompatilbe = contextlib.suppress() + raise_if_deepspeed_incompatible = contextlib.suppress() - with raise_if_deepspeed_incompatilbe: + with raise_if_deepspeed_incompatible: Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()