From eb895aa1c9d12425cb7bcbb254c162b1a52bd114 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 6 Aug 2021 12:25:39 +0200 Subject: [PATCH 1/8] fix gpus -1 for CPU --- pytorch_lightning/utilities/device_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 6656b9765ba00..72cb16a3f56b5 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -168,11 +168,13 @@ def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int], Tuple[int, .. return list(range(gpus)) -def _get_all_available_gpus() -> List[int]: +def _get_all_available_gpus() -> Optional[List[int]]: """ Returns: a list of all available gpus """ + if not torch.cuda.is_available(): + return None return list(range(torch.cuda.device_count())) From 47ba4252923f8d1da7500afce4859a75eba1d7d4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 14 Oct 2021 23:42:55 +0200 Subject: [PATCH 2/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/utilities/device_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 72cb16a3f56b5..0895b963028d2 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -174,7 +174,7 @@ def _get_all_available_gpus() -> Optional[List[int]]: a list of all available gpus """ if not torch.cuda.is_available(): - return None + return [] return list(range(torch.cuda.device_count())) From 9bc5d602cc48ba433a7bdf0890c28a53cbcd5057 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 15 Oct 2021 00:22:30 +0200 Subject: [PATCH 3/8] fixing --- .../connectors/accelerator_connector.py | 15 ++++----- 
pytorch_lightning/trainer/trainer.py | 11 ++++--- pytorch_lightning/utilities/device_parser.py | 6 ++-- tests/trainer/flags/test_gpus.py | 32 +++++++++++++++++++ 4 files changed, 47 insertions(+), 17 deletions(-) create mode 100644 tests/trainer/flags/test_gpus.py diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index 7b86aab64130a..4db6b85c873b9 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -553,16 +553,14 @@ def is_distributed(self) -> bool: if hasattr(self.training_type_plugin, "is_distributed") and not self.use_tpu: return self.training_type_plugin.is_distributed is_distributed = self.use_ddp or self.use_ddp2 or self.use_horovod - if self.use_tpu: + if self.use_tpu and hasattr(self.training_type_plugin, "is_distributed"): is_distributed |= self.training_type_plugin.is_distributed return is_distributed @property def num_gpus(self) -> int: gpus = self.parallel_device_ids - if gpus is None: - return 0 - return len(gpus) + return 0 if gpus is None else len(gpus) @property def num_ipus(self) -> int: @@ -581,6 +579,8 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: # https://github.com/PyTorchLightning/pytorch-lightning/issues/3169 if isinstance(self.tpu_cores, int): devices = list(range(self.tpu_cores)) + else: + raise MisconfigurationException(f"`tpu_cores` has to be int, but {self.tpu_cores} given.") elif self.use_ipu: devices = list(range(self.num_ipus)) else: @@ -589,11 +589,8 @@ def parallel_devices(self) -> List[Union[torch.device, int]]: @property def root_gpu(self) -> Optional[int]: - return ( - self.accelerator.root_device.index - if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)) - else None - ) + if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)): + return self.accelerator.root_device.index @staticmethod def 
_is_plugin_training_type(plugin: Union[str, TrainingTypePlugin]) -> bool: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index be0a7728edddc..d585cfa55b693 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -24,6 +24,7 @@ from weakref import proxy import torch +from torch.cuda.amp import GradScaler from torch.optim import Optimizer import pytorch_lightning as pl @@ -1490,9 +1491,9 @@ def _on_exception(self): file_path = os.path.join(self.default_root_dir, ".pl_auto_save.ckpt") self.save_checkpoint(file_path) - """ - Accelerator properties - """ + ######################## + # Accelerator properties + ######################## @property def accelerator(self) -> Accelerator: @@ -1555,7 +1556,7 @@ def root_gpu(self) -> Optional[int]: return self.accelerator_connector.root_gpu @property - def tpu_cores(self) -> int: + def tpu_cores(self) -> Optional[Union[int, List[int]]]: return self.accelerator_connector.tpu_cores @property @@ -1616,7 +1617,7 @@ def precision(self) -> Union[str, int]: return self.accelerator.precision @property - def scaler(self): + def scaler(self) -> Optional[GradScaler]: return self.accelerator.scaler @property diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 72cb16a3f56b5..8d393e42aa350 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -70,7 +70,7 @@ def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[i _check_data_type(gpus) # Handle the case when no gpus are requested - if gpus is None or isinstance(gpus, int) and gpus == 0 or str(gpus).strip() == "0": + if gpus is None or str(gpus).strip() == "0" or (str(gpus).strip() == "-1" and not torch.cuda.is_available()): return None # We know user requested GPUs therefore if some of the @@ -168,13 +168,13 @@ def _normalize_parse_gpu_input_to_list(gpus: Union[int, 
List[int], Tuple[int, .. return list(range(gpus)) -def _get_all_available_gpus() -> Optional[List[int]]: +def _get_all_available_gpus() -> List[int]: """ Returns: a list of all available gpus """ if not torch.cuda.is_available(): - return None + return [] return list(range(torch.cuda.device_count())) diff --git a/tests/trainer/flags/test_gpus.py b/tests/trainer/flags/test_gpus.py new file mode 100644 index 0000000000000..bb8cd1c350e9a --- /dev/null +++ b/tests/trainer/flags/test_gpus.py @@ -0,0 +1,32 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest +import torch + +from pytorch_lightning.trainer import Trainer +from tests.helpers import BoringModel + + +@pytest.mark.parametrize("gpus", [-1, "-1"]) +def test_all_gpus(tmpdir, gpus): + """Testing that the -1 is stable for GPU machines also if GPU is missing.""" + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + gpus=gpus, + ) + trainer.fit(model) + assert trainer.accelerator_connector.use_gpu == torch.cuda.is_available() + assert trainer.accelerator_connector.num_gpus == torch.cuda.device_count() From 51cb32ffdf05b8948f3dccad5673c882acea3d70 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 15 Oct 2021 00:24:53 +0200 Subject: [PATCH 4/8] chlog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9131b0921015..429809e306455 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -483,6 +483,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed parsing argument `gpus=-1`in CPU machines ([#8766](https://github.com/PyTorchLightning/pytorch-lightning/pull/8766)) + - Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685)) From dd405443f66f2307254bfef90c7d71ac6a4a2303 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 15 Oct 2021 10:08:02 +0200 Subject: [PATCH 5/8] cleaning --- tests/__init__.py | 6 + tests/loggers/test_tensorboard.py | 7 +- tests/models/test_gpu.py | 201 ------------------------------ tests/trainer/flags/test_gpus.py | 193 ++++++++++++++++++++++++++++ 4 files changed, 201 insertions(+), 206 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 9039a6e4b16e9..d70a86f2e4fd5 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -12,16 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +import operator import os import numpy as np +from pytorch_lightning.utilities.imports import _compare_version + _TEST_ROOT = os.path.dirname(__file__) _PROJECT_ROOT = os.path.dirname(_TEST_ROOT) _TEMP_PATH = os.path.join(_PROJECT_ROOT, "test_temp") _PATH_DATASETS = os.path.join(_PROJECT_ROOT, "Datasets") _PATH_LEGACY = os.path.join(_PROJECT_ROOT, "legacy") +PL_VERSION_LT_1_5 = _compare_version("pytorch_lightning", operator.lt, "1.5") +TENSORBOARD_VERSION_GE_2_6 = _compare_version("tensorboard", operator.ge, "2.6.0") + # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages if _PROJECT_ROOT not in os.getenv("PYTHONPATH", ""): splitter = ":" if os.environ.get("PYTHONPATH", "") else "" diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index 02a809aa2ab30..19a5832e4afba 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging -import operator import os from argparse import Namespace from unittest import mock @@ -25,13 +24,11 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import TensorBoardLogger -from pytorch_lightning.utilities.imports import _compare_version +from tests import TENSORBOARD_VERSION_GE_2_6 from tests.helpers import BoringModel -@pytest.mark.skipif( - _compare_version("tensorboard", operator.ge, "2.6.0"), reason="cannot import EventAccumulator in >= 2.6.0" -) +@pytest.mark.skipif(TENSORBOARD_VERSION_GE_2_6, reason="cannot import EventAccumulator in >= 2.6.0") def test_tensorboard_hparams_reload(tmpdir): from tensorboard.backend.event_processing.event_accumulator import EventAccumulator diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index d0b25b2af9960..a80b8656c04c6 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -11,31 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import operator -import os from collections import namedtuple -from unittest import mock from unittest.mock import patch -import pytest import torch import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.utilities import device_parser -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version -from tests.helpers import BoringModel from tests.helpers.datamodules import ClassifDataModule from tests.helpers.imports import Batch, Dataset, Example, Field, LabelField from tests.helpers.runif import RunIf from tests.helpers.simple_models import ClassificationModel -PL_VERSION_LT_1_5 = _compare_version("pytorch_lightning", operator.lt, "1.5") -PRETEND_N_OF_GPUS = 16 - @RunIf(min_gpus=2) def test_multi_gpu_none_backend(tmpdir): @@ -55,195 +43,6 @@ def test_multi_gpu_none_backend(tmpdir): tpipes.run_model_test(trainer_options, model, dm) -@RunIf(min_gpus=2) -@pytest.mark.parametrize("gpus", [1, [0], [1]]) -def test_single_gpu_model(tmpdir, gpus): - """Make sure single GPU works (DP mode).""" - trainer_options = dict( - default_root_dir=tmpdir, - enable_progress_bar=False, - max_epochs=1, - limit_train_batches=0.1, - limit_val_batches=0.1, - gpus=gpus, - ) - - model = BoringModel() - tpipes.run_model_test(trainer_options, model) - - -@pytest.fixture -def mocked_device_count(monkeypatch): - def device_count(): - return PRETEND_N_OF_GPUS - - def is_available(): - return True - - monkeypatch.setattr(torch.cuda, "is_available", is_available) - monkeypatch.setattr(torch.cuda, "device_count", device_count) - - -@pytest.fixture -def mocked_device_count_0(monkeypatch): - def device_count(): - return 0 - - monkeypatch.setattr(torch.cuda, "device_count", device_count) - - -@pytest.mark.parametrize( - ["gpus", "expected_num_gpus", 
"distributed_backend"], - [ - pytest.param(None, 0, None, id="None - expect 0 gpu to use."), - pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."), - pytest.param(1, 1, None, id="1st gpu, expect 1 gpu to use."), - pytest.param(-1, PRETEND_N_OF_GPUS, "ddp", id="-1 - use all gpus"), - pytest.param("-1", PRETEND_N_OF_GPUS, "ddp", id="'-1' - use all gpus"), - pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"), - ], -) -def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus - - -@pytest.mark.parametrize( - ["gpus", "expected_num_gpus", "distributed_backend"], - [ - pytest.param(None, 0, None, id="None - expect 0 gpu to use."), - pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."), - ], -) -def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus - - -@pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "distributed_backend"], - [ - pytest.param(None, None, "ddp", id="None is None"), - pytest.param(0, None, "ddp", id="O gpus, expect gpu root device to be None."), - pytest.param(1, 0, "ddp", id="1 gpu, expect gpu root device to be 0."), - pytest.param(-1, 0, "ddp", id="-1 - use all gpus, expect gpu root device to be 0."), - pytest.param("-1", 0, "ddp", id="'-1' - use all gpus, expect gpu root device to be 0."), - pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), - ], -) -def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu - - -@pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "distributed_backend"], - [ - pytest.param(None, None, None, id="None is None"), - pytest.param(None, None, "ddp", id="None 
is None"), - pytest.param(0, None, "ddp", id="None is None"), - ], -) -def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu - - -# Asking for a gpu when non are available will result in a MisconfigurationException -@pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "distributed_backend"], - [ - (1, None, "ddp"), - (3, None, "ddp"), - (3, None, "ddp"), - ([1, 2], None, "ddp"), - ([0, 1], None, "ddp"), - (-1, None, "ddp"), - ("-1", None, "ddp"), - ], -) -def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend): - with pytest.raises(MisconfigurationException): - Trainer(gpus=gpus, accelerator=distributed_backend) - - -@pytest.mark.parametrize( - ["gpus", "expected_root_gpu"], - [ - pytest.param(None, None, id="No gpus, expect gpu root device to be None"), - pytest.param([0], 0, id="Oth gpu, expect gpu root device to be 0."), - pytest.param([1], 1, id="1st gpu, expect gpu root device to be 1."), - pytest.param([3], 3, id="3rd gpu, expect gpu root device to be 3."), - pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."), - ], -) -def test_determine_root_gpu_device(gpus, expected_root_gpu): - assert device_parser.determine_root_gpu_device(gpus) == expected_root_gpu - - -@pytest.mark.parametrize( - ["gpus", "expected_gpu_ids"], - [ - (None, None), - (0, None), - (1, [0]), - (3, [0, 1, 2]), - pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), - ([0], [0]), - ([1, 3], [1, 3]), - ((1, 3), [1, 3]), - ("0", None), - ("3", [0, 1, 2]), - ("1, 3", [1, 3]), - ("2,", [2]), - pytest.param("-1", list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), - ], -) -def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): - assert device_parser.parse_gpu_ids(gpus) == expected_gpu_ids - - -@pytest.mark.parametrize("gpus", [0.1, -2, 
False, [], [-1], [None], ["0"], [0, 0]]) -def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(gpus) - - -@pytest.mark.parametrize("gpus", [[1, 2, 19], -1, "-1"]) -def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(gpus) - - -def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids([1, 2, 19]) - - -@pytest.mark.parametrize("gpus", [-1, "-1"]) -def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, gpus): - with pytest.raises(MisconfigurationException): - device_parser.parse_gpu_ids(gpus) - - -@mock.patch.dict( - os.environ, - { - "CUDA_VISIBLE_DEVICES": "0", - "LOCAL_RANK": "1", - "GROUP_RANK": "1", - "RANK": "3", - "WORLD_SIZE": "4", - "LOCAL_WORLD_SIZE": "2", - }, -) -@mock.patch("torch.cuda.device_count", return_value=1) -@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"]) -def test_torchelastic_gpu_parsing(mocked_device_count, gpus): - """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit - sanitizing the gpus as only one of the GPUs is visible.""" - trainer = Trainer(gpus=gpus) - assert isinstance(trainer.accelerator_connector.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) - assert trainer.gpus == gpus - - @RunIf(min_gpus=1) def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) diff --git a/tests/trainer/flags/test_gpus.py b/tests/trainer/flags/test_gpus.py index bb8cd1c350e9a..69676ab19036f 100644 --- a/tests/trainer/flags/test_gpus.py +++ b/tests/trainer/flags/test_gpus.py @@ -11,11 +11,204 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import os +from unittest import mock + import pytest import torch +import tests.helpers.pipelines as tpipes +from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer import Trainer +from pytorch_lightning.utilities import device_parser +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel +from tests.helpers.runif import RunIf + +PRETEND_N_OF_GPUS = 16 + + +@RunIf(min_gpus=2) +@pytest.mark.parametrize("gpus", [1, [0], [1]]) +def test_single_gpu_model(tmpdir, gpus): + """Make sure single GPU works (DP mode).""" + trainer_options = dict( + default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=1, + limit_train_batches=0.1, + limit_val_batches=0.1, + gpus=gpus, + ) + + model = BoringModel() + tpipes.run_model_test(trainer_options, model) + + +@pytest.fixture +def mocked_device_count(monkeypatch): + def device_count(): + return PRETEND_N_OF_GPUS + + def is_available(): + return True + + monkeypatch.setattr(torch.cuda, "is_available", is_available) + monkeypatch.setattr(torch.cuda, "device_count", device_count) + + +@pytest.fixture +def mocked_device_count_0(monkeypatch): + def device_count(): + return 0 + + monkeypatch.setattr(torch.cuda, "device_count", device_count) + + +@pytest.mark.parametrize( + ["gpus", "expected_num_gpus", "distributed_backend"], + [ + pytest.param(None, 0, None, id="None - expect 0 gpu to use."), + pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."), + pytest.param(1, 1, None, id="1st gpu, expect 1 gpu to use."), + pytest.param(-1, PRETEND_N_OF_GPUS, "ddp", id="-1 - use all gpus"), + pytest.param("-1", PRETEND_N_OF_GPUS, "ddp", id="'-1' - use all gpus"), + pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"), + ], +) +def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, 
distributed_backend): + assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus + + +@pytest.mark.parametrize( + ["gpus", "expected_num_gpus", "distributed_backend"], + [ + pytest.param(None, 0, None, id="None - expect 0 gpu to use."), + pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."), + ], +) +def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, distributed_backend): + assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus + + +@pytest.mark.parametrize( + ["gpus", "expected_root_gpu", "distributed_backend"], + [ + pytest.param(None, None, "ddp", id="None is None"), + pytest.param(0, None, "ddp", id="O gpus, expect gpu root device to be None."), + pytest.param(1, 0, "ddp", id="1 gpu, expect gpu root device to be 0."), + pytest.param(-1, 0, "ddp", id="-1 - use all gpus, expect gpu root device to be 0."), + pytest.param("-1", 0, "ddp", id="'-1' - use all gpus, expect gpu root device to be 0."), + pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), + ], +) +def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, distributed_backend): + assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu + + +@pytest.mark.parametrize( + ["gpus", "expected_root_gpu", "distributed_backend"], + [ + pytest.param(None, None, None, id="None is None"), + pytest.param(None, None, "ddp", id="None is None"), + pytest.param(0, None, "ddp", id="None is None"), + ], +) +def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend): + assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu + + +# Asking for a gpu when non are available will result in a MisconfigurationException +@pytest.mark.parametrize( + ["gpus", "expected_root_gpu", "distributed_backend"], + [ + (1, None, "ddp"), + (3, None, "ddp"), + (3, None, "ddp"), + ([1, 2], 
None, "ddp"), + ([0, 1], None, "ddp"), + (-1, None, "ddp"), + ("-1", None, "ddp"), + ], +) +def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend): + with pytest.raises(MisconfigurationException): + Trainer(gpus=gpus, accelerator=distributed_backend) + + +@pytest.mark.parametrize( + ["gpus", "expected_root_gpu"], + [ + pytest.param(None, None, id="No gpus, expect gpu root device to be None"), + pytest.param([0], 0, id="Oth gpu, expect gpu root device to be 0."), + pytest.param([1], 1, id="1st gpu, expect gpu root device to be 1."), + pytest.param([3], 3, id="3rd gpu, expect gpu root device to be 3."), + pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."), + ], +) +def test_determine_root_gpu_device(gpus, expected_root_gpu): + assert device_parser.determine_root_gpu_device(gpus) == expected_root_gpu + + +@pytest.mark.parametrize( + ["gpus", "expected_gpu_ids"], + [ + (None, None), + (0, None), + (1, [0]), + (3, [0, 1, 2]), + pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"), + ([0], [0]), + ([1, 3], [1, 3]), + ((1, 3), [1, 3]), + ("0", None), + ("3", [0, 1, 2]), + ("1, 3", [1, 3]), + ("2,", [2]), + pytest.param("-1", list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), + ], +) +def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): + assert device_parser.parse_gpu_ids(gpus) == expected_gpu_ids + + +@pytest.mark.parametrize("gpus", [0.1, -2, False, [], [-1], [None], ["0"], [0, 0]]) +def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): + with pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids(gpus) + + +@pytest.mark.parametrize("gpus", [[1, 2, 19]]) +def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus): + with pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids(gpus) + + +def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): + with 
pytest.raises(MisconfigurationException): + device_parser.parse_gpu_ids([1, 2, 19]) + + +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0", + "LOCAL_RANK": "1", + "GROUP_RANK": "1", + "RANK": "3", + "WORLD_SIZE": "4", + "LOCAL_WORLD_SIZE": "2", + }, +) +@mock.patch("torch.cuda.device_count", return_value=1) +@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"]) +def test_torchelastic_gpu_parsing(mocked_device_count, gpus): + """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit + sanitizing the gpus as only one of the GPUs is visible.""" + trainer = Trainer(gpus=gpus) + assert isinstance(trainer.accelerator_connector.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) + assert trainer.gpus == gpus @pytest.mark.parametrize("gpus", [-1, "-1"]) From 955af45381bae3db2120d41ebda39a96b150904b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 15 Oct 2021 10:11:43 +0200 Subject: [PATCH 6/8] typo Co-authored-by: Aki Nitta --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 429809e306455..ecb6ffd5b7325 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -483,7 +483,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- Fixed parsing argument `gpus=-1`in CPU machines ([#8766](https://github.com/PyTorchLightning/pytorch-lightning/pull/8766)) +- Fixed parsing argument `gpus=-1` in CPU machines ([#8766](https://github.com/PyTorchLightning/pytorch-lightning/pull/8766)) - Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685)) From 9abc72ce9a4208ddee5aeaf411e46818e4320a5a Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 26 Oct 2021 23:31:37 +0200 Subject: [PATCH 7/8] flags --- tests/trainer/flags/test_gpus.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/trainer/flags/test_gpus.py b/tests/trainer/flags/test_gpus.py index 69676ab19036f..84110fff10629 100644 --- a/tests/trainer/flags/test_gpus.py +++ b/tests/trainer/flags/test_gpus.py @@ -66,7 +66,7 @@ def device_count(): @pytest.mark.parametrize( - ["gpus", "expected_num_gpus", "distributed_backend"], + ["gpus", "expected_num_gpus", "strategy"], [ pytest.param(None, 0, None, id="None - expect 0 gpu to use."), pytest.param(0, 0, None, id="Oth gpu, expect 1 gpu to use."), @@ -76,23 +76,23 @@ def device_count(): pytest.param(3, 3, "ddp", id="3rd gpu - 1 gpu to use (backend:ddp)"), ], ) -def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus +def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, strategy): + assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus @pytest.mark.parametrize( - ["gpus", "expected_num_gpus", "distributed_backend"], + ["gpus", "expected_num_gpus", "strategy"], [ pytest.param(None, 0, None, id="None - expect 0 gpu to use."), pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."), ], ) -def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, 
distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus +def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, strategy): + assert Trainer(gpus=gpus, strategy=strategy).num_gpus == expected_num_gpus @pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "distributed_backend"], + ["gpus", "expected_root_gpu", "strategy"], [ pytest.param(None, None, "ddp", id="None is None"), pytest.param(0, None, "ddp", id="O gpus, expect gpu root device to be None."), @@ -102,25 +102,25 @@ def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, distr pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0.(backend:ddp)"), ], ) -def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu +def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, strategy): + assert Trainer(gpus=gpus, strategy=strategy).root_gpu == expected_root_gpu @pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "distributed_backend"], + ["gpus", "expected_root_gpu", "strategy"], [ pytest.param(None, None, None, id="None is None"), pytest.param(None, None, "ddp", id="None is None"), pytest.param(0, None, "ddp", id="None is None"), ], ) -def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend): - assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu +def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, strategy): + assert Trainer(gpus=gpus, strategy=strategy).root_gpu == expected_root_gpu # Asking for a gpu when non are available will result in a MisconfigurationException @pytest.mark.parametrize( - ["gpus", "expected_root_gpu", "distributed_backend"], + ["gpus", "expected_root_gpu", "strategy"], [ (1, None, "ddp"), (3, None, "ddp"), @@ -131,9 
+131,9 @@ def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_ ("-1", None, "ddp"), ], ) -def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend): +def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, strategy): with pytest.raises(MisconfigurationException): - Trainer(gpus=gpus, accelerator=distributed_backend) + Trainer(gpus=gpus, strategy=strategy) @pytest.mark.parametrize( @@ -178,7 +178,7 @@ def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): device_parser.parse_gpu_ids(gpus) -@pytest.mark.parametrize("gpus", [[1, 2, 19]]) +@pytest.mark.parametrize("gpus", [[1, 2, 19], -1, "-1"]) def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus): with pytest.raises(MisconfigurationException): device_parser.parse_gpu_ids(gpus) @@ -206,8 +206,8 @@ def test_torchelastic_gpu_parsing(mocked_device_count, gpus): """Ensure when using torchelastic and nproc_per_node is set to the default of 1 per GPU device That we omit sanitizing the gpus as only one of the GPUs is visible.""" trainer = Trainer(gpus=gpus) - assert isinstance(trainer.accelerator_connector.cluster_environment, TorchElasticEnvironment) - assert trainer.accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) + assert isinstance(trainer._accelerator_connector.cluster_environment, TorchElasticEnvironment) + assert trainer._accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus) assert trainer.gpus == gpus From 3f90fd47f991ea98cbefcb49453a544cdb0a63b4 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 26 Oct 2021 23:34:33 +0200 Subject: [PATCH 8/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index 
d70a86f2e4fd5..f8d79890ff84b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -25,7 +25,6 @@ _PATH_DATASETS = os.path.join(_PROJECT_ROOT, "Datasets") _PATH_LEGACY = os.path.join(_PROJECT_ROOT, "legacy") -PL_VERSION_LT_1_5 = _compare_version("pytorch_lightning", operator.lt, "1.5") TENSORBOARD_VERSION_GE_2_6 = _compare_version("tensorboard", operator.ge, "2.6.0") # todo: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages