Fix validation when accelerator is a string (#13417)
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
awaelchli and rohitgr7 committed Jul 1, 2022
1 parent 9e0cc17 commit a6d1246
Showing 3 changed files with 16 additions and 247 deletions.
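
The behavioral effect of this commit, shown as a minimal sketch rather than an authoritative reproduction: an unknown accelerator string passed to `Trainer` is now rejected immediately with a `ValueError` during construction, while valid names and `"auto"` keep working. The error type and message follow the new test added in the diff below.

```python
import pytest
from pytorch_lightning import Trainer


def test_invalid_accelerator_string_rejected_early():
    # An unknown string now fails at Trainer construction time.
    with pytest.raises(ValueError, match="You selected an invalid accelerator name"):
        Trainer(accelerator="invalid")

    # Registered accelerator names and "auto" are still accepted.
    trainer = Trainer(accelerator="auto", fast_dev_run=True)
    assert isinstance(trainer, Trainer)
```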
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -247,6 +247,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed Model Summary when using DeepSpeed Stage 3 ([#13427](https://github.com/PyTorchLightning/pytorch-lightning/pull/13427))
- Fixed `pytorch_lightning.utilities.distributed.gather_all_tensors` to handle tensors of different dimensions ([#12630](https://github.com/PyTorchLightning/pytorch-lightning/pull/12630))
- The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396))
- Fixed the input validation for the accelerator Trainer argument when passed as a string ([#13417](https://github.com/PyTorchLightning/pytorch-lightning/pull/13417))


## [1.6.4] - 2022-06-01
29 changes: 12 additions & 17 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -290,17 +290,18 @@ def _check_config_and_set_final_flags(
f" and you can only specify one strategy, but you have passed {plugin} as a plugin."
)

if accelerator is not None:
if accelerator in self._accelerator_types or accelerator == "auto" or isinstance(accelerator, Accelerator):
self._accelerator_flag = accelerator
elif accelerator in self._registered_strategies or isinstance(accelerator, Strategy):
rank_zero_deprecation(
f"Passing `Trainer(accelerator={accelerator!r})` has been deprecated"
f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={accelerator!r})` instead."
)
self._strategy_flag = accelerator
elif accelerator == "ddp_cpu" and not self._strategy_flag:
self._strategy_flag = accelerator
if (
accelerator is not None
and accelerator not in self._accelerator_types
and accelerator != "auto"
and not isinstance(accelerator, Accelerator)
):
raise ValueError(
f"You selected an invalid accelerator name: `accelerator={accelerator!r}`."
f" Available names are: {', '.join(self._accelerator_types)}."
)

self._accelerator_flag = accelerator

if precision is not None:
if str(precision) not in self._precision_types:
@@ -504,12 +505,6 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
self.accelerator: Accelerator = self._accelerator_flag
else:
assert self._accelerator_flag is not None
self._accelerator_flag = self._accelerator_flag.lower()
if self._accelerator_flag not in AcceleratorRegistry:
raise MisconfigurationException(
"When passing string value for the `accelerator` argument of `Trainer`,"
f" it can only be one of {self._accelerator_types}."
)
self.accelerator = AcceleratorRegistry.get(self._accelerator_flag)

if not self.accelerator.is_available():
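
For orientation: in the first hunk above, the block beginning `if accelerator is not None:` is the code removed by this commit (it forwarded strategy names such as `"ddp"` to the strategy flag with a deprecation warning and special-cased `"ddp_cpu"`), and the block beginning `if (` is the replacement validation; the second hunk deletes the later `.lower()`/`MisconfigurationException` check that the new early `ValueError` supersedes. As a hedged sketch (assuming the PyTorch Lightning ~1.7 public API, and not an exhaustive list of valid values), these are the forms the new check lets through:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.accelerators import CPUAccelerator

Trainer(accelerator="cpu")             # a name registered in the accelerator registry
Trainer(accelerator="auto")            # the auto-detection keyword
Trainer(accelerator=CPUAccelerator())  # an Accelerator instance satisfies the isinstance check

# Any other value now fails at construction time, e.g.:
#   Trainer(accelerator="ddp")
#   ValueError: You selected an invalid accelerator name: `accelerator='ddp'`. Available names are: ...
```

Strategy names that the removed branch used to accept with a deprecation warning now fall through to this error; that deprecation message already pointed users at `Trainer(strategy=...)` as the replacement.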
233 changes: 3 additions & 230 deletions tests/accelerators/test_accelerator_connector.py
@@ -56,229 +56,9 @@ def test_accelerator_choice_cpu(tmpdir):
assert isinstance(trainer.strategy, SingleDeviceStrategy)


@pytest.mark.parametrize(("devices", "num_nodes"), ([(1, 1), (1, 2), (2, 1), (2, 2)]))
def test_accelerator_choice_ddp_cpu(tmpdir, devices: int, num_nodes: int):
trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", devices=devices, num_nodes=num_nodes)
assert isinstance(trainer.accelerator, CPUAccelerator)
no_spawn = devices == 1 and num_nodes > 1
assert isinstance(trainer.strategy, DDPStrategy if no_spawn else DDPSpawnStrategy)
assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment)


@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp(cuda_available_mock, device_count_mock):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1)
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment)


@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"})
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock):
with pytest.deprecated_call(match=r"accelerator='ddp_spawn'\)` has been deprecated"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp_spawn", gpus=1)
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDPSpawnStrategy)
assert isinstance(trainer.strategy.cluster_environment, LightningEnvironment)


@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0,1",
"SLURM_NTASKS": "2",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"SLURM_PROCID": "1",
"SLURM_LOCALID": "1",
},
)
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_slurm(*_):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2)
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 1
assert trainer.strategy.local_rank == 1


@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0,1",
"SLURM_NTASKS": "2",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"SLURM_PROCID": "1",
"SLURM_LOCALID": "1",
},
)
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=2)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp2_slurm(*_):
with pytest.deprecated_call(match=r"accelerator='ddp2'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2)
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDP2Strategy)
assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 1
assert trainer.strategy.local_rank == 1


@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0,1",
"WORLD_SIZE": "2",
"LOCAL_WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_RANK": "1",
"GROUP_RANK": "0",
"TORCHELASTIC_RUN_ID": "1", # present for torch >= 1.9.1
},
)
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_te(*_):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=2)
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 1
assert trainer.strategy.local_rank == 1


@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0,1",
"WORLD_SIZE": "2",
"LOCAL_WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_RANK": "1",
"GROUP_RANK": "0",
"TORCHELASTIC_RUN_ID": "1",
},
)
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp2_te(*_):
with pytest.deprecated_call(match=r"accelerator='ddp2'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp2", gpus=2)
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDP2Strategy)
assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 1
assert trainer.strategy.local_rank == 1


@mock.patch.dict(
os.environ,
{
"WORLD_SIZE": "2",
"LOCAL_WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_RANK": "1",
"GROUP_RANK": "0",
"TORCHELASTIC_RUN_ID": "1",
},
)
@mock.patch("torch.cuda.device_count", return_value=0)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
def test_accelerator_choice_ddp_cpu_te(*_):
trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", devices=2)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, TorchElasticEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 1
assert trainer.strategy.local_rank == 1


@mock.patch.dict(
os.environ,
{
"CUDA_VISIBLE_DEVICES": "0",
"KUBERNETES_PORT": "tcp://127.0.0.1:443",
"MASTER_ADDR": "1.2.3.4",
"MASTER_PORT": "500",
"WORLD_SIZE": "20",
"RANK": "1",
},
)
@mock.patch("torch.cuda.set_device")
@mock.patch("torch.cuda.device_count", return_value=1)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
@mock.patch("torch.cuda.is_available", return_value=True)
def test_accelerator_choice_ddp_kubeflow(*_):
with pytest.deprecated_call(match=r"accelerator='ddp'\)` has been deprecated in v1.5"):
trainer = Trainer(fast_dev_run=True, accelerator="ddp", gpus=1)
assert isinstance(trainer.accelerator, GPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 0
assert trainer.strategy.local_rank == 0


@mock.patch.dict(
os.environ,
{
"KUBERNETES_PORT": "tcp://127.0.0.1:443",
"MASTER_ADDR": "1.2.3.4",
"MASTER_PORT": "500",
"WORLD_SIZE": "20",
"RANK": "1",
},
)
@mock.patch("torch.cuda.device_count", return_value=0)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
def test_accelerator_choice_ddp_cpu_kubeflow(*_):
trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", devices=1)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, KubeflowEnvironment)
assert trainer.strategy.cluster_environment.local_rank() == 0
assert trainer.strategy.local_rank == 0


@mock.patch.dict(
os.environ,
{
"SLURM_NTASKS": "2",
"SLURM_JOB_NAME": "SOME_NAME",
"SLURM_NODEID": "0",
"LOCAL_RANK": "0",
"SLURM_PROCID": "0",
"SLURM_LOCALID": "0",
},
)
@mock.patch("torch.cuda.device_count", return_value=0)
@mock.patch("pytorch_lightning.strategies.DDPStrategy.setup_distributed", autospec=True)
def test_accelerator_choice_ddp_cpu_slurm(*_):
trainer = Trainer(fast_dev_run=True, accelerator="ddp_cpu", devices=2)
assert trainer._accelerator_connector._is_slurm_managing_tasks()
assert isinstance(trainer.accelerator, CPUAccelerator)
assert isinstance(trainer.strategy, DDPStrategy)
assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment)
assert trainer.strategy.local_rank == 0
def test_accelerator_invalid_choice():
with pytest.raises(ValueError, match="You selected an invalid accelerator name: `accelerator='invalid'`"):
Trainer(accelerator="invalid")


@RunIf(skip_windows=True, standalone=True)
@@ -551,13 +331,6 @@ def test_accelerator_auto_with_devices_gpu():
assert trainer.num_devices == 1


def test_validate_accelerator_and_devices():

trainer = Trainer(accelerator="ddp_cpu", devices=2)
assert isinstance(trainer.accelerator, CPUAccelerator)
assert trainer.num_devices == 2


def test_set_devices_if_none_cpu():

trainer = Trainer(accelerator="cpu", devices=3)
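
Most of the deleted tests exercised the previously deprecated pattern of passing strategy names through `accelerator=`. A hedged sketch of the equivalent configuration after this change, assuming the mapping implied by the removed deprecation path and the removed `ddp_cpu` tests:

```python
from pytorch_lightning import Trainer

# Previously deprecated, now rejected with a ValueError:
#   Trainer(accelerator="ddp_cpu", devices=2)
#   Trainer(accelerator="ddp", gpus=2)

# State the accelerator and the strategy separately instead:
Trainer(accelerator="cpu", strategy="ddp_spawn", devices=2)  # the old single-node "ddp_cpu" case
Trainer(accelerator="gpu", strategy="ddp", devices=2)        # the old accelerator="ddp" (requires CUDA devices)
```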
