fix amp/apex misconfiguration error for cpu (#6107)
* fix weird test

* fix apex plugin test

* fix raise

* cpu test

* fix type

* add changelog
awaelchli authored Feb 22, 2021
1 parent 97b4b3e commit ae6ce17
Showing 5 changed files with 40 additions and 99 deletions.
9 changes: 6 additions & 3 deletions CHANGELOG.md
@@ -24,13 +24,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed incorrect yield logic for the amp autocast context manager ([#6080](https://github.com/PyTorchLightning/pytorch-lightning/pull/6080))


-- Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011)
+- Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011))


 - Fixed priority of plugin/accelerator when setting distributed mode ([#6089](https://github.com/PyTorchLightning/pytorch-lightning/pull/6089))


-- Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070)
+- Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070))


+- Fixed error message for AMP + CPU incompatibility ([#6107](https://github.com/PyTorchLightning/pytorch-lightning/pull/6107))
+
+
 - Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/PyTorchLightning/pytorch-lightning/pull/6115)
@@ -40,7 +43,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

 ### Added

-- Added `DataType`, `AverageMethod` and `MDMCAverageMethod` enum in metrics ([#5657](https://github.com/PyTorchLightning/pytorch-lightning/pull/5689)
+- Added `DataType`, `AverageMethod` and `MDMCAverageMethod` enum in metrics ([#5657](https://github.com/PyTorchLightning/pytorch-lightning/pull/5689))
 - Added support for summarized model total params size in megabytes ([#5590](https://github.com/PyTorchLightning/pytorch-lightning/pull/5590))
 - Added support for multiple train loaders ([#1959](https://github.com/PyTorchLightning/pytorch-lightning/pull/1959))
 - Added `Accuracy` metric now generalizes to Top-k accuracy for (multi-dimensional) multi-class inputs using the `top_k` parameter ([#4838](https://github.com/PyTorchLightning/pytorch-lightning/pull/4838))
2 changes: 1 addition & 1 deletion pytorch_lightning/accelerators/cpu.py
@@ -7,7 +7,7 @@ class CPUAccelerator(Accelerator):

     def setup(self, trainer, model):
         if isinstance(self.precision_plugin, MixedPrecisionPlugin):
-            MisconfigurationException("amp + cpu is not supported. Please use a GPU option")
+            raise MisconfigurationException("amp + cpu is not supported. Please use a GPU option")

         if "cpu" not in str(self.root_device):
             raise MisconfigurationException(f"Device should be CPU, got {self.root_device} instead")
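
For context (not part of the commit): the removed line constructed the MisconfigurationException but never raised it, so the unsupported configuration passed silently. Below is a minimal, self-contained sketch of that failure mode; the exception class and function names are stand-ins, not Lightning code.

class ConfigError(Exception):
    """Stand-in for Lightning's MisconfigurationException in this sketch."""


def setup_before_fix(use_amp: bool) -> None:
    if use_amp:
        # Bug: the exception object is created and immediately discarded,
        # so execution continues as if the configuration were valid.
        ConfigError("amp + cpu is not supported. Please use a GPU option")


def setup_after_fix(use_amp: bool) -> None:
    if use_amp:
        # Fix: `raise` actually propagates the error to the caller.
        raise ConfigError("amp + cpu is not supported. Please use a GPU option")


if __name__ == "__main__":
    setup_before_fix(use_amp=True)  # returns silently despite the unsupported config
    try:
        setup_after_fix(use_amp=True)
    except ConfigError as err:
        print(f"caught: {err}")
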
21 changes: 21 additions & 0 deletions tests/accelerators/test_cpu.py
@@ -0,0 +1,21 @@
+from unittest.mock import Mock
+
+import pytest
+import torch
+
+from pytorch_lightning.accelerators import CPUAccelerator
+from pytorch_lightning.plugins import SingleDevicePlugin
+from pytorch_lightning.plugins.precision import MixedPrecisionPlugin
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+def test_unsupported_precision_plugins():
+    """ Test error messages are raised for unsupported precision plugins with CPU. """
+    trainer = Mock()
+    model = Mock()
+    accelerator = CPUAccelerator(
+        training_type_plugin=SingleDevicePlugin(torch.device("cpu")),
+        precision_plugin=MixedPrecisionPlugin()
+    )
+    with pytest.raises(MisconfigurationException, match=r"amp \+ cpu is not supported."):
+        accelerator.setup(trainer=trainer, model=model)
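
A brief usage note (not part of the commit): from a development checkout with the test dependencies installed, the new test can presumably be run in isolation with

python -m pytest tests/accelerators/test_cpu.py::test_unsupported_precision_plugins -v
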
67 changes: 4 additions & 63 deletions tests/plugins/test_amp_plugin.py
@@ -5,10 +5,8 @@
 import torch

 from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.plugins import NativeMixedPrecisionPlugin
 from pytorch_lightning.utilities import _NATIVE_AMP_AVAILABLE
-from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel


@@ -25,78 +23,21 @@
 )
 @mock.patch('torch.cuda.device_count', return_value=2)
 @pytest.mark.parametrize(
-    ['ddp_backend', 'gpus', 'num_processes'],
-    [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)],
+    ['ddp_backend', 'gpus'],
+    [('ddp', 2), ('ddp2', 2), ('ddp_spawn', 2)],
 )
-def on_fit_start(tmpdir, ddp_backend, gpus, num_processes):
-
-    class CB(Callback):
-
-        def on_fit_start(self, trainer, pl_module):
-            assert isinstance(trainer.precision_plugin, NativeMixedPrecisionPlugin)
-            raise SystemExit()
-
-    def train():
-        model = BoringModel()
-        trainer = Trainer(
-            fast_dev_run=True,
-            precision=16,
-            amp_backend='native',
-            gpus=gpus,
-            num_processes=num_processes,
-            accelerator=ddp_backend,
-            callbacks=[CB()],
-        )
-        trainer.fit(model)
-
-    if ddp_backend == "ddp_cpu":
-        with pytest.raises(MisconfigurationException, match="MP is only available on GPU"):
-            train()
-    else:
-        with pytest.raises(SystemExit):
-            train()
-
-
-@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="Minimal PT version is set to 1.6")
-@mock.patch.dict(
-    os.environ, {
-        "CUDA_VISIBLE_DEVICES": "0,1",
-        "SLURM_NTASKS": "2",
-        "SLURM_JOB_NAME": "SOME_NAME",
-        "SLURM_NODEID": "0",
-        "LOCAL_RANK": "0",
-        "SLURM_LOCALID": "0"
-    }
-)
-@mock.patch('torch.cuda.device_count', return_value=2)
-@pytest.mark.parametrize(
-    ['ddp_backend', 'gpus', 'num_processes'],
-    [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)],
-)
-def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
+def test_amp_choice_custom_ddp_cpu(device_count_mock, ddp_backend, gpus):

     class MyNativeAMP(NativeMixedPrecisionPlugin):
         pass

-    class CB(Callback):
-
-        def on_fit_start(self, trainer, pl_module):
-            assert isinstance(trainer.precision_plugin, MyNativeAMP)
-            raise SystemExit()
-
-    model = BoringModel()
     trainer = Trainer(
         fast_dev_run=True,
         precision=16,
         amp_backend='native',
-        num_processes=num_processes,
         accelerator=ddp_backend,
         plugins=[MyNativeAMP()],
     )
-
-    with pytest.raises(SystemExit):
-        trainer.fit(model)
+    assert isinstance(trainer.precision_plugin, MyNativeAMP)


 class GradientUnscaleBoringModel(BoringModel):
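
The rewritten tests above, like the apex ones below, rely on the fact that the Trainer resolves its precision plugin during construction, so the choice can be asserted without calling fit() or aborting from an on_fit_start callback. A minimal sketch of that pattern, assuming the same mocked GPU count as the tests in this file (the helper name is illustrative, not part of the repository):

from unittest import mock

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import NativeMixedPrecisionPlugin


@mock.patch('torch.cuda.device_count', return_value=2)
def check_native_amp_is_selected(device_count_mock):
    # Plugin selection happens inside the Trainer constructor, so there is no
    # need to run fit() (or to raise SystemExit from a callback) just to
    # inspect which precision plugin was chosen.
    trainer = Trainer(
        fast_dev_run=True,
        precision=16,
        amp_backend='native',
        gpus=2,
        accelerator='ddp',
    )
    assert isinstance(trainer.precision_plugin, NativeMixedPrecisionPlugin)


if __name__ == '__main__':
    check_native_amp_is_selected()
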
40 changes: 8 additions & 32 deletions tests/plugins/test_apex_plugin.py
@@ -4,10 +4,8 @@
 import pytest

 from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import Callback
 from pytorch_lightning.plugins import ApexMixedPrecisionPlugin
 from pytorch_lightning.utilities import _APEX_AVAILABLE
-from tests.helpers.boring_model import BoringModel


@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
@@ -23,30 +21,19 @@
 )
 @mock.patch('torch.cuda.device_count', return_value=2)
 @pytest.mark.parametrize(
-    ['ddp_backend', 'gpus', 'num_processes'],
-    [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)],
+    ['ddp_backend', 'gpus'],
+    [('ddp', 2), ('ddp2', 2), ('ddp_spawn', 2)],
 )
-def test_amp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
+def test_amp_choice_default_ddp(mocked_device_count, ddp_backend, gpus):

-    class CB(Callback):
-
-        def on_fit_start(self, trainer, pl_module):
-            assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)
-            raise SystemExit()
-
-    model = BoringModel()
     trainer = Trainer(
         fast_dev_run=True,
         precision=16,
         amp_backend='apex',
         gpus=gpus,
-        num_processes=num_processes,
         accelerator=ddp_backend,
-        callbacks=[CB()],
     )
-
-    with pytest.raises(SystemExit):
-        trainer.fit(model)
+    assert isinstance(trainer.precision_plugin, ApexMixedPrecisionPlugin)


@pytest.mark.skipif(not _APEX_AVAILABLE, reason="test requires apex")
@@ -62,31 +49,20 @@ def on_fit_start(self, trainer, pl_module):
 )
 @mock.patch('torch.cuda.device_count', return_value=2)
 @pytest.mark.parametrize(
-    ['ddp_backend', 'gpus', 'num_processes'],
-    [('ddp_cpu', None, 2), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)],
+    ['ddp_backend', 'gpus'],
+    [('ddp', 2), ('ddp2', 2), ('ddp_spawn', 2)],
 )
-def test_amp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes):
+def test_amp_choice_custom_ddp(mocked_device_count, ddp_backend, gpus):

     class MyApexPlugin(ApexMixedPrecisionPlugin):
         pass

-    class CB(Callback):
-
-        def on_fit_start(self, trainer, pl_module):
-            assert isinstance(trainer.precision_plugin, MyApexPlugin)
-            raise SystemExit()
-
-    model = BoringModel()
     trainer = Trainer(
         fast_dev_run=True,
         precision=16,
         amp_backend='apex',
         gpus=gpus,
-        num_processes=num_processes,
         accelerator=ddp_backend,
         plugins=[MyApexPlugin(amp_level="O2")],
-        callbacks=[CB()],
     )
-
-    with pytest.raises(SystemExit):
-        trainer.fit(model)
+    assert isinstance(trainer.precision_plugin, MyApexPlugin)
