[bugfix] DeepSpeed with no schedulers #8580

Merged: 7 commits, Jul 27, 2021
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -559,6 +559,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `BackboneFinetuning` restoration ([#8501](https://github.com/PyTorchLightning/pytorch-lightning/pull/8501))


- Fixed `DeepSpeed` breaking with no schedulers ([#8580](https://github.com/PyTorchLightning/pytorch-lightning/pull/8580))



## [1.3.8] - 2021-07-01

8 changes: 5 additions & 3 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -397,7 +397,7 @@ def _init_optimizers(self) -> Tuple[Optimizer, Optional[Union[LRSchedulerTypeTup
     )
 return (
     optimizers[0],
-    schedulers[0] if schedulers else None,
+    schedulers[0] if schedulers else _get_default_scheduler_config(),
     optimizer_frequencies[0] if optimizer_frequencies else None,
 )
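With this fallback, `_init_optimizers` always hands back a scheduler *config dict* rather than `None`. A minimal sketch of roughly what that default config looks like (the exact keys are an assumption, mirroring the scheduler-config dict Lightning builds for user-provided schedulers); the part that matters for this fix is that `config["scheduler"]` is `None` while the dict itself stays subscriptable:

```python
# Sketch only: approximate shape of the dict returned by _get_default_scheduler_config().
# Key names beyond "scheduler" are assumptions, not copied from the Lightning source.
def _get_default_scheduler_config():
    return {
        "scheduler": None,           # no actual LR scheduler attached
        "interval": "epoch",         # default stepping interval
        "frequency": 1,              # step once per interval
        "reduce_on_plateau": False,  # not a ReduceLROnPlateau scheduler
        "monitor": None,             # nothing to monitor
    }
```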

@@ -414,6 +414,7 @@ def _initialize_deepspeed_train(self, model):
         "Using `configure_optimizers` to define optimizer and scheduler."
     )
     optimizer, lr_scheduler, _ = self._init_optimizers()
+
     scheduler = lr_scheduler["scheduler"]
 
 model_parameters = filter(lambda p: p.requires_grad, self.model.parameters())
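The `lr_scheduler["scheduler"]` subscript above is what broke before: with no scheduler configured, the old `_init_optimizers` returned plain `None` in the scheduler slot. A tiny, Lightning-free reproduction of that failure mode (inferred from the diff, not a traceback copied from the original issue):

```python
# Pre-fix behaviour in miniature: subscripting the None fallback raises immediately.
lr_scheduler = None  # what the old `schedulers[0] if schedulers else None` produced
try:
    scheduler = lr_scheduler["scheduler"]
except TypeError as err:
    print(err)  # "'NoneType' object is not subscriptable"
```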
@@ -430,8 +431,9 @@ def _initialize_deepspeed_train(self, model):
 
 # although we set these here, deepspeed manages the specific optimizer logic
 self.lightning_module.trainer.optimizers = [deepspeed_optimizer]
-lr_scheduler["scheduler"] = deepspeed_scheduler
-self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
+if deepspeed_scheduler is not None:
+    lr_scheduler["scheduler"] = deepspeed_scheduler
+    self.lightning_module.trainer.lr_schedulers = [lr_scheduler]
 self.model = model
 
 @contextlib.contextmanager
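The `is not None` guard is needed because DeepSpeed only returns a wrapped LR scheduler when one was actually passed to `deepspeed.initialize`; in the no-scheduler case it returns `None`, and the Trainer should simply not register a scheduler. For contrast, a hedged sketch of the scheduler-ful counterpart that still takes the original branch (hypothetical module, not part of this PR):

```python
import torch
from torch import nn
from pytorch_lightning import LightningModule


class WithSchedulerModel(LightningModule):  # hypothetical, for illustration only
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
        # A real scheduler is returned here, so deepspeed.initialize hands back a
        # wrapped scheduler and the branch above writes it into the Trainer.
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1)
        return [optimizer], [scheduler]
```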
19 changes: 19 additions & 0 deletions tests/plugins/test_deepspeed_plugin.py
@@ -33,6 +33,11 @@ def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
         self.configure_sharded_model()
 
 
+class ModelParallelBoringModelNoSchedulers(ModelParallelBoringModel):
+    def configure_optimizers(self):
+        return torch.optim.SGD(self.layer.parameters(), lr=0.1)
+
+
 class ModelParallelBoringModelManualOptim(BoringModel):
     def __init__(self):
         super().__init__()
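The new fixture returns a bare optimizer, which is one of several optimizer-only shapes `configure_optimizers` accepts; each of the equivalents below leaves Lightning without an LR scheduler and therefore exercises the fixed code path (a sketch, not code from the PR):

```python
# Equivalent optimizer-only returns for configure_optimizers (sketch).
def configure_optimizers(self):
    opt = torch.optim.SGD(self.layer.parameters(), lr=0.1)
    return opt                     # bare optimizer, as in the fixture above
    # return [opt]                 # single-element list
    # return {"optimizer": opt}    # dict form with no "lr_scheduler" entry
```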
@@ -687,3 +692,17 @@ def _assert_save_model_is_equal(model, tmpdir, trainer, cls=BoringModel):
     # Assert model parameters are identical after loading
     for orig_param, trained_model_param in zip(model.parameters(), saved_model.parameters()):
         assert torch.equal(orig_param, trained_model_param)
+
+
+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_deepspeed_multigpu_no_schedulers(tmpdir):
+    """
+    Test to ensure ZeRO Stage 3 works with a parallel model and no schedulers.
+    """
+    model = ModelParallelBoringModelNoSchedulers()
+    trainer = Trainer(
+        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
+    )
+    trainer.fit(model)
+
+    _assert_save_model_is_equal(model, tmpdir, trainer, cls=ModelParallelBoringModelNoSchedulers)
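For completeness, a self-contained, user-facing sketch of the scenario the test covers (hypothetical `MyModel` with toy data; assumes 2 GPUs and a DeepSpeed install): an optimizer-only `configure_optimizers` under the DeepSpeed plugin should now train without raising.

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.plugins import DeepSpeedPlugin


class MyModel(pl.LightningModule):  # hypothetical scheduler-free module
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch[0]).sum()

    def train_dataloader(self):
        return DataLoader(TensorDataset(torch.randn(64, 32)), batch_size=8)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)  # optimizer only, no scheduler


trainer = pl.Trainer(gpus=2, precision=16, plugins=[DeepSpeedPlugin(stage=3)], fast_dev_run=True)
trainer.fit(MyModel())
```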