Support DeepSpeed <0.7.0 (#13859)
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>
carmocca and awaelchli authored Jul 28, 2022
1 parent 2a65934 commit 406cea7
Showing 5 changed files with 36 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .azure/gpu-tests.yml
@@ -75,7 +75,7 @@ jobs:
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
pip install -e .[strategies]
-pip install deepspeed==0.6.4 # TODO: remove when docker images are upgraded
+pip install deepspeed>0.6.4 # TODO: remove when docker images are upgraded
pip install --requirement requirements/pytorch/devel.txt
pip list
env:
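A side note on the updated pip line (an observation about shell quoting, not part of the commit): in a POSIX shell an unquoted ">" is output redirection, so version specifiers passed to pip are normally quoted, as the bagua line earlier in the same script already does. A quoted form would look like:

pip install "deepspeed>0.6.4"  # quoting keeps ">" from being parsed as a shell redirect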
2 changes: 1 addition & 1 deletion requirements/pytorch/strategies.txt
@@ -1,5 +1,5 @@
fairscale>=0.4.5, <=0.4.6
-deepspeed>=0.6.0, <0.6.5
+deepspeed>=0.6.0, <0.7.0
# no need to install with [pytorch] as pytorch is already installed
horovod>=0.21.2, !=0.24.0, <0.25.1
hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
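For reference, a minimal sketch (not part of the commit) of which DeepSpeed releases the relaxed pin accepts, using the packaging library that pip itself builds on:

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=0.6.0, <0.7.0")  # the updated deepspeed requirement
for candidate in ("0.5.10", "0.6.4", "0.6.5", "0.6.9", "0.7.0"):
    print(candidate, candidate in spec)
# prints: 0.5.10 False, 0.6.4 True, 0.6.5 True, 0.6.9 True, 0.7.0 False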
15 changes: 15 additions & 0 deletions src/pytorch_lightning/lite/lite.py
@@ -40,6 +40,7 @@
has_iterable_dataset,
)
from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _RequirementAvailable
from pytorch_lightning.utilities.seed import seed_everything


@@ -105,6 +106,8 @@ def __init__(
self._precision_plugin = self._strategy.precision_plugin
self._models_setup: int = 0

+        self._check_deepspeed_support()

# wrap the run method so we can inject setup logic or spawn processes for the user
setattr(self, "run", partial(self._run_impl, self.run))

@@ -456,6 +459,18 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> None:
f" Choose one of {supported} or pass in a `Strategy` instance."
)

+    def _check_deepspeed_support(self) -> None:
+        if (
+            isinstance(self._strategy, DeepSpeedStrategy)
+            and self._strategy.zero_stage_3
+            and _RequirementAvailable("deepspeed>=0.6.5")
+        ):
+            # https://github.com/microsoft/DeepSpeed/issues/2139
+            raise RuntimeError(
+                "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`."
+                " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available."
+            )

@staticmethod
def _supported_device_types() -> Sequence[_AcceleratorType]:
return (
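The _check_deepspeed_support guard added in this file only fires when the installed deepspeed distribution satisfies the given specifier; _RequirementAvailable evaluates truthy in that case. A rough standalone equivalent of that version-gating pattern, assuming nothing about Lightning internals (the helper name _requirement_satisfied is made up for this sketch):

from importlib.metadata import PackageNotFoundError, version

from packaging.specifiers import SpecifierSet


def _requirement_satisfied(distribution: str, specifier: str) -> bool:
    """Return True only if the distribution is installed and its version matches the specifier."""
    try:
        installed = version(distribution)
    except PackageNotFoundError:
        return False  # not installed at all
    return installed in SpecifierSet(specifier)


# Mirrors the spirit of the new check: refuse ZeRO-3 on the affected releases.
if _requirement_satisfied("deepspeed", ">=0.6.5"):
    print("would raise: ZeRO-3 is unsupported with this deepspeed version")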
8 changes: 7 additions & 1 deletion src/pytorch_lightning/utilities/deepspeed_model_summary.py
@@ -17,7 +17,9 @@
from typing import Dict, List, Tuple

import torch
+from torch.nn import Parameter

+from pytorch_lightning.utilities.imports import _RequirementAvailable
from pytorch_lightning.utilities.model_summary import (
_is_lazy_weight_tensor,
get_human_readable_count,
@@ -40,7 +42,11 @@ def num_parameters(self) -> int:
@property
def average_shard_parameters(self) -> int:
"""Returns the number of parameters in this module."""
-        return sum(p.partitioned_size() if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters())
+
+        def partitioned_size(p: Parameter) -> int:
+            return p.partitioned_size() if _RequirementAvailable("deepspeed<0.6.6") else p.partition_numel()
+
+        return sum(partitioned_size(p) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters())


class DeepSpeedSummary(ModelSummary):
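The partitioned_size helper added in this file branches on the installed version because DeepSpeed renamed the per-parameter method (per this diff, partitioned_size() before 0.6.6 and partition_numel() afterwards). An alternative compatibility shim, sketched here and not what the commit does, probes for the attribute instead of checking the version:

def _partitioned_numel(param) -> int:
    # Prefer the newer method name if it exists, otherwise fall back to the old one.
    if hasattr(param, "partition_numel"):
        return param.partition_numel()
    return param.partitioned_size()

A version check keeps the intent explicit, while attribute probing also tolerates backports; the commit opts for the explicit version check.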
13 changes: 12 additions & 1 deletion tests/tests_pytorch/lite/test_lite.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import contextlib
import os
from copy import deepcopy
from unittest import mock
@@ -29,6 +30,7 @@
from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy
from pytorch_lightning.utilities import _StrategyType
from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _RequirementAvailable
from pytorch_lightning.utilities.seed import pl_worker_init_function
from tests_pytorch.helpers.runif import RunIf

@@ -478,4 +480,13 @@ def run(self):
assert self.broadcast(True)
assert self.is_global_zero == (self.local_rank == 0)

-    Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
+    if _RequirementAvailable("deepspeed>=0.6.5"):
+        # https://github.com/microsoft/DeepSpeed/issues/2139
+        raise_if_deepspeed_incompatible = pytest.raises(
+            RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite"
+        )
+    else:
+        raise_if_deepspeed_incompatible = contextlib.suppress()
+
+    with raise_if_deepspeed_incompatible:
+        Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
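The test uses a pattern worth calling out: it picks the expectation as a context manager up front (pytest.raises when the error should occur, a no-op context otherwise) so the same call runs under both conditions. A generic sketch of the same idea, assuming nothing Lightning-specific:

import contextlib

import pytest


def expectation(should_raise: bool):
    if should_raise:
        return pytest.raises(RuntimeError, match="not supported")
    # contextlib.suppress() with no arguments suppresses nothing, so it acts as a no-op
    # context; contextlib.nullcontext() is an equivalent, more explicit choice.
    return contextlib.nullcontext()


def test_maybe_raises():
    with expectation(should_raise=True):
        raise RuntimeError("feature not supported in this configuration")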
