Expose DeepSpeed FP16 parameters due to loss instability #6115

Merged · 4 commits · Feb 21, 2021

3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -33,6 +33,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))


- Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/PyTorchLightning/pytorch-lightning/pull/6115))


## [1.2.0] - 2021-02-18

### Added
40 changes: 37 additions & 3 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -79,6 +79,11 @@ def __init__(
num_nodes: int = 1,
parallel_devices: Optional[List[torch.device]] = None,
cluster_environment: Optional[ClusterEnvironment] = None,
loss_scale: float = 0,
initial_scale_power: int = 32,
loss_scale_window: int = 1000,
hysteresis: int = 2,
min_loss_scale: int = 1
) -> None:
"""

@@ -127,6 +132,18 @@ def __init__(

logging_level: Set logging level for deepspeed. (Default: ``logging.WARN``)

loss_scale: Loss scaling value for FP16 training.
0.0 results in dynamic loss scaling, otherwise static (Default: 0)

initial_scale_power: Power of the initial dynamic loss scale value. Loss scale is computed
by ``2^initial_scale_power`` (Default: 32)

loss_scale_window: Window in which to raise/lower the dynamic FP16 loss scaling value (Default: 1000)

hysteresis: FP16 Delay shift in Dynamic Loss scaling (Default: 2)

min_loss_scale: The minimum FP16 dynamic loss scaling value (Default: 1)

"""
if not _DEEPSPEED_AVAILABLE:
raise MisconfigurationException(
@@ -154,6 +171,13 @@ def __init__(
self._config_initialized = False
deepspeed.utils.logging.logger.setLevel(logging_level)

# default FP16 parameters.
self.loss_scale = loss_scale
self.initial_scale_power = initial_scale_power
self.loss_scale_window = loss_scale_window
self.hysteresis = hysteresis
self.min_loss_scale = min_loss_scale

def _load_config(self, config):
if config is None and self.DEEPSPEED_ENV_VAR in os.environ:
rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable")
@@ -297,9 +321,19 @@ def _format_precision_config(self):
amp_level = self.lightning_module.trainer.accelerator_connector.amp_level
precision = self.lightning_module.trainer.accelerator_connector.precision
if precision == 16:
if "amp" not in self.config and amp_type == AMPType.NATIVE:
self.config["fp16"] = {"enabled": True}
elif "apex" not in self.config and amp_type == AMPType.APEX:
if "fp16" not in self.config and amp_type == AMPType.NATIVE:
# FP16 is a DeepSpeed standalone AMP implementation
rank_zero_info("Enabling DeepSpeed FP16.")
self.config["fp16"] = {
"enabled": True,
"loss_scale": self.loss_scale,
"initial_scale_power": self.initial_scale_power,
"loss_scale_window": self.loss_scale_window,
"hysteresis": self.hysteresis,
"min_loss_scale": self.min_loss_scale
}
elif "amp" not in self.config and amp_type == AMPType.APEX:
rank_zero_info("Enabling DeepSpeed APEX Implementation.")
self.config["amp"] = {
"enabled": True,
"opt_level": amp_level,
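
The constructor arguments added above feed directly into the `fp16` section that `_format_precision_config` writes. As a quick orientation for reviewers, a minimal usage sketch follows (not part of this diff); the import path and the example values are assumptions based on the 1.2-era API exercised by the tests below.

```python
# Illustrative sketch, not part of this PR: tuning the newly exposed FP16
# loss-scaling knobs through DeepSpeedPlugin. `MyModel` is a stand-in for any
# LightningModule.
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

trainer = Trainer(
    gpus=1,
    precision=16,  # the "fp16" section is only generated when precision == 16
    plugins=[
        DeepSpeedPlugin(
            loss_scale=0,            # 0 keeps dynamic loss scaling; any other value is static
            initial_scale_power=16,  # initial dynamic scale is 2 ** initial_scale_power
            loss_scale_window=500,   # window for raising/lowering the dynamic scale
            hysteresis=2,            # delay shift applied in dynamic loss scaling
            min_loss_scale=1,        # floor for the dynamic loss scale
        )
    ],
)
trainer.fit(MyModel())
```

Lowering `initial_scale_power` or adjusting `loss_scale_window` is the kind of tweak the changelog entry has in mind when the default dynamic scaling leads to loss instability.
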
61 changes: 52 additions & 9 deletions tests/plugins/test_deepspeed_plugin.py
@@ -211,9 +211,6 @@ def test_invalid_deepspeed_defaults_no_precision(tmpdir):

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(
not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_warn_deepspeed_override_backward(tmpdir):
"""
Test to ensure that if the backward hook in the LightningModule is overridden, we throw a warning.
@@ -232,9 +229,6 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(
not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_deepspeed_run_configure_optimizers(tmpdir):
"""
Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
@@ -268,9 +262,6 @@ def on_train_start(self) -> None:

@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(
not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', reason="test should be run outside of pytest"
)
def test_deepspeed_config(tmpdir, deepspeed_zero_config):
"""
Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
@@ -304,6 +295,58 @@ def on_train_start(self) -> None:
_assert_save_model_is_equal(model, tmpdir, trainer)


@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_deepspeed_custom_precision_params(tmpdir):
"""
Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.
"""

class TestModel(BoringModel):

def on_train_start(self) -> None:
assert self.trainer.training_type_plugin.config['fp16']['loss_scale'] == 10
assert self.trainer.training_type_plugin.config['fp16']['initial_scale_power'] == 10
assert self.trainer.training_type_plugin.config['fp16']['loss_scale_window'] == 10
assert self.trainer.training_type_plugin.config['fp16']['hysteresis'] == 10
assert self.trainer.training_type_plugin.config['fp16']['min_loss_scale'] == 10
raise SystemExit()

model = TestModel()
trainer = Trainer(
plugins=[
DeepSpeedPlugin(
loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10
)
],
precision=16,
gpus=1
)
with pytest.raises(SystemExit):
trainer.fit(model)


@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
"""
Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.
"""

deepspeed_zero_config['zero_optimization']['cpu_offload'] = False

class TestModel(BoringModel):

def on_train_start(self) -> None:
assert self.trainer.training_type_plugin.config['zero_optimization']['cpu_offload'] is False
raise SystemExit()

model = TestModel()
trainer = Trainer(plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)], precision=16, gpus=1)
with pytest.raises(SystemExit):
trainer.fit(model)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
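
One detail worth calling out from the plugin change: the new branch is guarded by `if "fp16" not in self.config`, so a user-supplied DeepSpeed config that already contains an `fp16` block takes precedence and the new constructor arguments are not applied (which is why `test_deepspeed_custom_precision_params` passes no config). A sketch of that config-driven path, using a hypothetical config dict with illustrative values:

```python
# Hypothetical user-supplied config (illustrative values, not from this PR).
# Because it already contains an "fp16" block, _format_precision_config leaves
# it untouched and the loss_scale/initial_scale_power/... arguments are ignored.
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

deepspeed_config = {
    "zero_optimization": {"stage": 2, "cpu_offload": False},
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },
}

trainer = Trainer(
    gpus=1,
    precision=16,
    plugins=[DeepSpeedPlugin(config=deepspeed_config)],
)
```
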
3 changes: 0 additions & 3 deletions tests/special_tests.sh
@@ -17,9 +17,6 @@ export PL_RUNNING_SPECIAL_TESTS=1
DEFAULTS="-m coverage run --source pytorch_lightning -a -m pytest --verbose --capture=no"
python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp
python ${DEFAULTS} tests/models/test_sync_batchnorm.py::test_sync_batchnorm_ddp
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_warn_deepspeed_override_backward
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_run_configure_optimizers
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_config
python ${DEFAULTS} tests/plugins/test_deepspeed_plugin.py::test_deepspeed_multigpu
python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
python ${DEFAULTS} tests/plugins/test_rpc_sequential_plugin.py::test_rpc_sequential_plugin_manual