Skip to content

Commit

Permalink
Expose DeepSpeed FP16 parameters due to loss instability (#6115)
Browse files Browse the repository at this point in the history
* Expose deepspeed config parameters to init function due to instability in parameters

* See if tests can run on normal CI, without special tests

* Add changelog

* Update pytorch_lightning/plugins/training_type/deepspeed.py

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>

(cherry picked from commit 432e563)

Add missing config
  • Loading branch information
SeanNaren authored and lexierule committed Mar 16, 2021
1 parent 953c873 commit f0ad06a
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Disabled batch transfer in DP mode ([#6093](https://github.com/PyTorchLightning/pytorch-lightning/pull/6093))


- Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/PyTorchLightning/pytorch-lightning/pull/6115))


## [1.2.0] - 2021-02-18

### Added
Expand Down
40 changes: 37 additions & 3 deletions pytorch_lightning/plugins/training_type/deepspeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ def __init__(
num_nodes: int = 1,
parallel_devices: Optional[List[torch.device]] = None,
cluster_environment: Optional[ClusterEnvironment] = None,
loss_scale: float = 0,
initial_scale_power: int = 32,
loss_scale_window: int = 1000,
hysteresis: int = 2,
min_loss_scale: int = 1
) -> None:
"""
Expand Down Expand Up @@ -127,6 +132,18 @@ def __init__(
logging_level: Set logging level for deepspeed. (Default: ``logging.WARN``)
loss_scale: Loss scaling value for FP16 training.
0.0 results in dynamic loss scaling, otherwise static (Default: 0)
initial_scale_power: Power of the initial dynamic loss scale value. Loss scale is computed
by ``2^initial_scale_power`` (Default: 32)
loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value (Default: 1000)
hysteresis: FP16 Delay shift in Dynamic Loss scaling (Default: 2)
min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1)
"""
if not _DEEPSPEED_AVAILABLE:
raise MisconfigurationException(
Expand Down Expand Up @@ -154,6 +171,13 @@ def __init__(
self._config_initialized = False
deepspeed.utils.logging.logger.setLevel(logging_level)

# default FP16 parameters.
self.loss_scale = loss_scale
self.initial_scale_power = initial_scale_power
self.loss_scale_window = loss_scale_window
self.hysteresis = hysteresis
self.min_loss_scale = min_loss_scale

def _load_config(self, config):
if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable")
Expand Down Expand Up @@ -299,9 +323,19 @@ def _format_precision_config(self):
amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
precision = self.lightning_module.trainer.accelerator_connector.precision
if precision == 16:
if "amp" not in self.config and amp_type == AMPType.NATIVE:
self.config["fp16"] = {"enabled": True}
elif "apex" not in self.config and amp_type == AMPType.APEX:
if "fp16" not in self.config and amp_type == AMPType.NATIVE:
# FP16 is a DeepSpeed standalone AMP implementation
rank_zero_info("Enabling DeepSpeed FP16.")
self.config["fp16"] = {
"enabled": True,
"loss_scale": self.loss_scale,
"initial_scale_power": self.initial_scale_power,
"loss_scale_window": self.loss_scale_window,
"hysteresis": self.hysteresis,
"min_loss_scale": self.min_loss_scale
}
elif "amp" not in self.config and amp_type == AMPType.APEX:
rank_zero_only("Enabling DeepSpeed APEX Implementation.")
self.config["amp"] = {
"enabled": True,
"opt_level": amp_level,
Expand Down
68 changes: 58 additions & 10 deletions tests/plugins/test_deepspeed_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def deepspeed_config():
}


@pytest.fixture
def deepspeed_zero_config(deepspeed_config):
    """Return the base DeepSpeed config extended with ZeRO stage 2 optimization settings."""
    zero_overrides = {'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2}}
    return {**deepspeed_config, **zero_overrides}


@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
def test_deepspeed_plugin_string(tmpdir):
"""
Expand Down Expand Up @@ -165,9 +170,6 @@ def test_invalid_deepspeed_defaults_no_precision(tmpdir):

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(
not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_warn_deepspeed_override_backward(tmpdir):
"""
Test to ensure that if the backward hook in the LightningModule is overridden, we throw a warning.
Expand All @@ -191,9 +193,6 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(
not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_deepspeed_run_configure_optimizers(tmpdir):
"""
Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
Expand Down Expand Up @@ -223,10 +222,7 @@ def on_train_start(self) -> None:

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(
not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_deepspeed_config(tmpdir, deepspeed_config):
def test_deepspeed_config(tmpdir, deepspeed_zero_config):
"""
Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
and saves the model weights to load correctly.
Expand Down Expand Up @@ -255,6 +251,58 @@ def on_train_start(self) -> None:
_assert_save_model_is_equal(model, tmpdir, trainer)


@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_deepspeed_custom_precision_params(tmpdir):
    """
    Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.
    """

    class TestModel(BoringModel):

        def on_train_start(self) -> None:
            # Every FP16 knob was passed to the plugin as 10; each must land in the generated config.
            fp16_config = self.trainer.training_type_plugin.config['fp16']
            assert fp16_config['loss_scale'] == 10
            assert fp16_config['initial_scale_power'] == 10
            assert fp16_config['loss_scale_window'] == 10
            assert fp16_config['hysteresis'] == 10
            assert fp16_config['min_loss_scale'] == 10
            # Abort the run immediately once the config has been verified.
            raise SystemExit()

    plugin = DeepSpeedPlugin(
        loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10
    )
    trainer = Trainer(plugins=[plugin], precision=16, gpus=1)
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())


@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
    """
    Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.
    """

    # Explicitly disable CPU offloading in the user-supplied config before handing it to the plugin.
    deepspeed_zero_config['zero_optimization']['cpu_offload'] = False

    class TestModel(BoringModel):

        def on_train_start(self) -> None:
            zero_config = self.trainer.training_type_plugin.config['zero_optimization']
            assert zero_config['cpu_offload'] is False
            # Stop training as soon as the assertion has been checked.
            raise SystemExit()

    trainer = Trainer(plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], precision=16, gpus=1)
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())


@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
Expand Down
3 changes: 0 additions & 3 deletions tests/special_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,6 @@ export PL_RUNNING_SPECIAL_TESTS=1
DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no"
python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp
python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_warn_deepspeed_override_backward
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_run_configure_optimizers
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_config
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu
python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual
Expand Down

0 comments on commit f0ad06a

Please sign in to comment.