diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 8e8e2edb91d85..74c1df4553fe0 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -73,6 +73,7 @@ jobs:
       CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
       pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
       pip install -e .[strategies]
+      pip install deepspeed==0.6.4 # TODO: remove when docker images are upgraded
       pip install --requirement requirements/pytorch/devel.txt
       pip list
     env:
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 2b69c8ba76b81..db29ce556e839 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -1,5 +1,5 @@
 fairscale>=0.4.5, <=0.4.6
-deepspeed<0.6.0
+deepspeed>=0.6.0, <0.6.5
 # no need to install with [pytorch] as pytorch is already installed
 horovod>=0.21.2, !=0.24.0, <0.25.1
 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py
index 15825dedd2ef6..e18f82dc27f6e 100644
--- a/src/pytorch_lightning/plugins/precision/apex_amp.py
+++ b/src/pytorch_lightning/plugins/precision/apex_amp.py
@@ -59,6 +59,7 @@ def backward(
         model: "pl.LightningModule",
         closure_loss: Tensor,
         optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
         *args: Any,
         **kwargs: Any,
     ) -> None:
@@ -71,7 +72,7 @@ def backward(
         """
         opt = optimizer or model.trainer.optimizers
         with amp.scale_loss(closure_loss, opt) as closure_loss:
-            super().backward(model, closure_loss, optimizer, *args, **kwargs)
+            super().backward(model, closure_loss, optimizer, optimizer_idx, *args, **kwargs)

     def optimizer_step(
         self,
diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py
index 96458487c7420..fa948520e1fd6 100644
--- a/src/pytorch_lightning/plugins/precision/deepspeed.py
+++ b/src/pytorch_lightning/plugins/precision/deepspeed.py
@@ -27,10 +27,8 @@
 from pytorch_lightning.utilities.warnings import WarningCache

 _DEEPSPEED_AVAILABLE = _RequirementAvailable("deepspeed")
-_DEEPSPEED_GREATER_EQUAL_0_6 = _RequirementAvailable("deepspeed>=0.6.0")
-if TYPE_CHECKING:
-    if _DEEPSPEED_AVAILABLE:
-        import deepspeed
+if TYPE_CHECKING and _DEEPSPEED_AVAILABLE:
+    import deepspeed

 warning_cache = WarningCache()

@@ -53,12 +51,6 @@ class DeepSpeedPrecisionPlugin(PrecisionPlugin):
     """

     def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optional[str] = None) -> None:
-        if precision == PrecisionType.BFLOAT and not _DEEPSPEED_GREATER_EQUAL_0_6:
-            raise MisconfigurationException(
-                f"`Trainer(strategy='deepspeed', precision={precision!r})` is not supported"
-                " with `deepspeed < v0.6`. Please upgrade it using `pip install -U deepspeed`."
-            )
-
         supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT, PrecisionType.MIXED)
         if precision not in supported_precision:
             raise ValueError(
@@ -71,7 +63,15 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona
         self.amp_type = amp_type
         self.amp_level = amp_level

-    def backward(self, model: "pl.LightningModule", closure_loss: Tensor, *args: Any, **kwargs: Any) -> None:
+    def backward(
+        self,
+        model: "pl.LightningModule",
+        closure_loss: Tensor,
+        optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         if is_overridden("backward", model):
             warning_cache.warn(
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles"
diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py
index 329a8b8978e50..89f544575f63f 100644
--- a/src/pytorch_lightning/plugins/precision/ipu.py
+++ b/src/pytorch_lightning/plugins/precision/ipu.py
@@ -44,7 +44,7 @@ def __init__(self, precision: int) -> None:
         super().__init__()
         self.precision = precision

-    def backward(self, model: "pl.LightningModule", *args: Any, **kwargs: Any) -> None:
+    def backward(self, model: "pl.LightningModule", *_: Any, **__: Any) -> None:
         if is_overridden("backward", model):
             warning_cache.warn(
                 "You have overridden the `LightningModule.backward` hook but it will be ignored since IPUs handle"
diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py
index cbf18b8c4fa41..02d343a0876b4 100644
--- a/src/pytorch_lightning/plugins/precision/precision_plugin.py
+++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py
@@ -64,6 +64,7 @@ def backward(
         model: "pl.LightningModule",
         closure_loss: Tensor,
         optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
         *args: Any,
         **kwargs: Any,
     ) -> None:
@@ -76,7 +77,7 @@ def backward(
         """
         # do backward pass
         if model is not None and isinstance(model, pl.LightningModule):
-            model.backward(closure_loss, optimizer, *args, **kwargs)
+            model.backward(closure_loss, optimizer, optimizer_idx, *args, **kwargs)
         else:
             self._run_backward(closure_loss, *args, **kwargs)

diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py
index f47afc890bcbb..0de904ccbd283 100644
--- a/src/pytorch_lightning/strategies/strategy.py
+++ b/src/pytorch_lightning/strategies/strategy.py
@@ -171,7 +171,14 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]:
         """
         return optimizer.state_dict()

-    def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor:
+    def backward(
+        self,
+        closure_loss: Tensor,
+        optimizer: Optional[Optimizer],
+        optimizer_idx: Optional[int],
+        *args: Any,
+        **kwargs: Any,
+    ) -> Tensor:
         """Forwards backward-calls to the precision plugin.

         Args:
@@ -181,7 +188,7 @@ def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor:
         assert self.lightning_module is not None
         closure_loss = self.precision_plugin.pre_backward(self.lightning_module, closure_loss)

-        self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
+        self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, optimizer_idx, *args, **kwargs)

         closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss)
         self.post_backward(closure_loss)
diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py
index ca2b06b6d695b..86a0a5a82195a 100644
--- a/tests/tests_pytorch/lite/test_lite.py
+++ b/tests/tests_pytorch/lite/test_lite.py
@@ -412,15 +412,21 @@ def run(self):
             model = BoringModel()
             optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
             model, optimizer = self.setup(model, optimizer)
-            state_dict = deepcopy(model.state_dict())

-            for _ in range(2):
+            for i in range(2):
                 optimizer.zero_grad()
                 x = model(torch.randn(1, 32).to(self.device))
                 loss = x.sum()
+                if i == 0:
+                    # the weights are not initialized with stage 3 until backward is run once
+                    assert all(w.nelement() == 0 for w in model.state_dict().values())
                 self.backward(loss, model=model)
+                if i == 0:
+                    # save for later to check that the weights were updated
+                    state_dict = deepcopy(model.state_dict())
                 optimizer.step()

+            # check that the model trained, the weights from step 1 do not match the weights from step 2
             for mw_b, mw_a in zip(state_dict.values(), model.state_dict().values()):
                 assert not torch.allclose(mw_b, mw_a)
@@ -438,6 +444,7 @@ def run(self):
             model_1, optimizer_1 = self.setup(model_1, optimizer_1)
             model_2, optimizer_2 = self.setup(model_2, optimizer_2)

+            # train model_1 first
             self.seed_everything(42)
             data_list = []
             for _ in range(2):
@@ -449,9 +456,11 @@ def run(self):
                 self.backward(loss, model=model_1)
                 optimizer_1.step()

-            for mw_1, mw_2 in zip(model_1.state_dict().values(), model_2.state_dict().values()):
-                assert not torch.allclose(mw_1, mw_2)
+            # the weights do not match
+            assert all(w.nelement() > 1 for w in model_1.state_dict().values())
+            assert all(w.nelement() == 0 for w in model_2.state_dict().values())

+            # now train model_2 with the same data
             for data in data_list:
                 optimizer_2.zero_grad()
                 x = model_2(data)
@@ -459,6 +468,7 @@ def run(self):
                 self.backward(loss, model=model_2)
                 optimizer_2.step()

+            # the weights should match
             for mw_1, mw_2 in zip(model_1.state_dict().values(), model_2.state_dict().values()):
                 assert torch.allclose(mw_1, mw_2)
diff --git a/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py b/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py
index 8a68f7c73209b..a4698e7c19c97 100644
--- a/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py
+++ b/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py
@@ -11,20 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from unittest import mock
-
 import pytest

 from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin
-from pytorch_lightning.utilities.exceptions import MisconfigurationException


 def test_invalid_precision_with_deepspeed_precision():
     with pytest.raises(ValueError, match="is not supported. `precision` must be one of"):
         DeepSpeedPrecisionPlugin(precision=64, amp_type="native")
-
-
-@mock.patch("pytorch_lightning.plugins.precision.deepspeed._DEEPSPEED_GREATER_EQUAL_0_6", False)
-def test_incompatible_bfloat16_raises_error_with_deepspeed_version():
-    with pytest.raises(MisconfigurationException, match="is not supported with `deepspeed < v0.6`"):
-        DeepSpeedPrecisionPlugin(precision="bf16", amp_type="native")
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 79562134f9ccb..6a2a2fc5d9111 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -30,11 +30,9 @@
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
 from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
-from pytorch_lightning.plugins.precision.deepspeed import _DEEPSPEED_GREATER_EQUAL_0_6
 from pytorch_lightning.strategies import DeepSpeedStrategy
 from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE, LightningDeepSpeedModule
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _RequirementAvailable
 from pytorch_lightning.utilities.meta import init_meta_context
 from tests_pytorch.helpers.datamodules import ClassifDataModule
 from tests_pytorch.helpers.datasets import RandomIterableDataset
@@ -42,14 +40,9 @@
 if _DEEPSPEED_AVAILABLE:
     import deepspeed
+    from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
     from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

-    _DEEPSPEED_GREATER_EQUAL_0_5_9 = _RequirementAvailable("deepspeed>=0.5.9")
-    if _DEEPSPEED_GREATER_EQUAL_0_5_9:
-        from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer
-    else:
-        from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer as DeepSpeedZeroOptimizer
-

 class ModelParallelBoringModel(BoringModel):
     def __init__(self):
@@ -1294,7 +1287,6 @@ def training_step(self, *args, **kwargs):


 @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
-@pytest.mark.skipif(not _DEEPSPEED_GREATER_EQUAL_0_6, reason="requires deepspeed >= 0.6")
 def test_deepspeed_with_bfloat16_precision(tmpdir):
     """Test that deepspeed works with bfloat16 precision."""
     model = BoringModel()