fix gpus -1 for CPU #8766

Closed · wants to merge 10 commits (showing changes from 6 commits)
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -483,6 +483,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

+- Fixed parsing of argument `gpus=-1` on CPU machines ([#8766](https://github.com/PyTorchLightning/pytorch-lightning/pull/8766))


- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685))

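As a quick illustration of the entry above, a minimal sketch of the fixed behavior, assuming a CPU-only machine (the asserted values follow from the diffs below):

```python
from pytorch_lightning import Trainer

# Assumption: torch.cuda.is_available() is False on this machine.
# Before this fix, gpus=-1 ("use all GPUs") raised a MisconfigurationException
# when no GPUs were present; now it parses to None and training runs on CPU.
trainer = Trainer(gpus=-1)
assert trainer.num_gpus == 0
assert trainer.root_gpu is None
```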
15 changes: 6 additions & 9 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -553,16 +553,14 @@ def is_distributed(self) -> bool:
        if hasattr(self.training_type_plugin, "is_distributed") and not self.use_tpu:
            return self.training_type_plugin.is_distributed
        is_distributed = self.use_ddp or self.use_ddp2 or self.use_horovod
-       if self.use_tpu:
+       if self.use_tpu and hasattr(self.training_type_plugin, "is_distributed"):
            is_distributed |= self.training_type_plugin.is_distributed
        return is_distributed
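The added `hasattr` guard protects against training-type plugins that do not expose `is_distributed`. A minimal sketch of the pattern with hypothetical stand-in classes (not the real plugin hierarchy):

```python
class SingleDevicePluginStub:
    pass  # exposes no `is_distributed` attribute


class DDPPluginStub:
    is_distributed = True


def resolve_is_distributed(plugin, base: bool = False) -> bool:
    # Consult the plugin only when it actually defines the attribute,
    # mirroring the guard added in the diff above.
    if hasattr(plugin, "is_distributed"):
        base |= plugin.is_distributed
    return base


assert resolve_is_distributed(SingleDevicePluginStub()) is False
assert resolve_is_distributed(DDPPluginStub()) is True
```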

    @property
    def num_gpus(self) -> int:
        gpus = self.parallel_device_ids
-       if gpus is None:
-           return 0
-       return len(gpus)
+       return 0 if gpus is None else len(gpus)

    @property
    def num_ipus(self) -> int:
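The `num_gpus` change is behavior-preserving; a quick equivalence check with the property body extracted into a free function:

```python
def num_gpus(parallel_device_ids):
    # The conditional expression matches the removed if/return pair.
    return 0 if parallel_device_ids is None else len(parallel_device_ids)


assert num_gpus(None) == 0  # no device ids parsed, e.g. CPU-only
assert num_gpus([0]) == 1
assert num_gpus([0, 1, 2]) == 3
```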
@@ -581,6 +579,8 @@ def parallel_devices(self) -> List[Union[torch.device, int]]:
            # https://github.com/PyTorchLightning/pytorch-lightning/issues/3169
            if isinstance(self.tpu_cores, int):
                devices = list(range(self.tpu_cores))
+           else:
+               raise MisconfigurationException(f"`tpu_cores` has to be int, but {self.tpu_cores} given.")
        elif self.use_ipu:
            devices = list(range(self.num_ipus))
        else:

Review thread on the added `else:` branch:

> Contributor: Are you there? `tpu_cores=[5]` is also a valid value.
>
> Author (Member): So far, from reading the code, no... cc: @kaushikb11?
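The thread above turns on which values `tpu_cores` accepts; the authoritative check lives in `pytorch_lightning.utilities.device_parser.parse_tpu_cores`. A hedged summary as a comment-only sketch:

```python
# Values the `tpu_cores` Trainer argument accepts in this era of the API,
# as hinted by the annotation Optional[Union[int, List[int]]] further down:
valid_tpu_cores = [
    1,    # train on a single, auto-assigned TPU core
    8,    # train on all eight cores of one TPU device
    [5],  # train on exactly core index 5 -- the reviewer's example
]
# If parse_tpu_cores really accepts [5], then self.tpu_cores is not always an
# int, and the added `else: raise` would reject a configuration the parser
# allows -- which is the reviewer's concern.
```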
@@ -589,11 +589,8 @@ def parallel_devices(self) -> List[Union[torch.device, int]]:

    @property
    def root_gpu(self) -> Optional[int]:
-       return (
-           self.accelerator.root_device.index
-           if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator))
-           else None
-       )
+       if not isinstance(self.accelerator, (IPUAccelerator, TPUAccelerator)):
+           return self.accelerator.root_device.index

    @staticmethod
    def _is_plugin_training_type(plugin: Union[str, TrainingTypePlugin]) -> bool:
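The rewritten `root_gpu` drops the explicit `else None`: for TPU/IPU accelerators, control falls off the end of the function and Python returns `None` implicitly. A minimal sketch of that control flow:

```python
from typing import Optional


def root_gpu_sketch(is_gpu_or_cpu_accelerator: bool, root_index: int = 0) -> Optional[int]:
    if is_gpu_or_cpu_accelerator:
        return root_index
    # No explicit return here: Python yields None for the TPU/IPU branch.


assert root_gpu_sketch(True) == 0
assert root_gpu_sketch(False) is None
```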
11 changes: 6 additions & 5 deletions pytorch_lightning/trainer/trainer.py
@@ -24,6 +24,7 @@
from weakref import proxy

import torch
+from torch.cuda.amp import GradScaler
from torch.optim import Optimizer

import pytorch_lightning as pl
@@ -1490,9 +1491,9 @@ def _on_exception(self):
        file_path = os.path.join(self.default_root_dir, ".pl_auto_save.ckpt")
        self.save_checkpoint(file_path)
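For context, the lines above are the crash auto-save this hunk touches. A hedged sketch of resuming from that checkpoint (`resume_from_checkpoint` is the 1.x-era Trainer argument; the root dir name is hypothetical):

```python
import os

from pytorch_lightning import Trainer

# Hypothetical default_root_dir; ".pl_auto_save.ckpt" is the filename used above.
ckpt_path = os.path.join("my_root_dir", ".pl_auto_save.ckpt")
trainer = Trainer(resume_from_checkpoint=ckpt_path)  # then trainer.fit(model)
```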

"""
Accelerator properties
"""
########################
carmocca marked this conversation as resolved.
Show resolved Hide resolved
# Accelerator properties
########################
Borda marked this conversation as resolved.
Show resolved Hide resolved

    @property
    def accelerator(self) -> Accelerator:
@@ -1555,7 +1556,7 @@ def root_gpu(self) -> Optional[int]:
        return self.accelerator_connector.root_gpu

    @property
-   def tpu_cores(self) -> int:
+   def tpu_cores(self) -> Optional[Union[int, List[int]]]:
        return self.accelerator_connector.tpu_cores

    @property

@@ -1616,7 +1617,7 @@ def precision(self) -> Union[str, int]:
        return self.accelerator.precision

    @property
-   def scaler(self):
+   def scaler(self) -> Optional[GradScaler]:
        return self.accelerator.scaler

    @property
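The new `Optional[GradScaler]` annotation encodes that a scaler exists only for native 16-bit runs. A hedged sketch of that invariant (the real decision involves the precision plugin; this is simplified):

```python
from typing import Optional

from torch.cuda.amp import GradScaler


def make_scaler(precision: int, native_amp_on_gpu: bool) -> Optional[GradScaler]:
    # Simplified stand-in for the accelerator's precision wiring.
    if precision == 16 and native_amp_on_gpu:
        return GradScaler()
    return None  # 32-bit, CPU, or non-native AMP: no scaler


assert make_scaler(32, native_amp_on_gpu=False) is None
```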
4 changes: 3 additions & 1 deletion pytorch_lightning/utilities/device_parser.py
@@ -70,7 +70,7 @@ def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]:
    _check_data_type(gpus)

    # Handle the case when no gpus are requested
-   if gpus is None or isinstance(gpus, int) and gpus == 0 or str(gpus).strip() == "0":
+   if gpus is None or str(gpus).strip() == "0" or (str(gpus).strip() == "-1" and not torch.cuda.is_available()):
        return None

    # We know user requested GPUs therefore if some of the
@@ -173,6 +173,8 @@ def _get_all_available_gpus() -> List[int]:
    Returns:
        a list of all available gpus
    """
+   if not torch.cuda.is_available():
+       return []
    return list(range(torch.cuda.device_count()))


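Taken together, the two hunks make the CPU-only path explicit. A sketch of the resulting behavior, mocking CUDA the same way the repository's tests do:

```python
from unittest import mock

from pytorch_lightning.utilities import device_parser

with mock.patch("torch.cuda.is_available", return_value=False), mock.patch(
    "torch.cuda.device_count", return_value=0
):
    # -1 means "all GPUs"; with zero GPUs visible it now parses to None (CPU)
    # instead of raising a MisconfigurationException.
    assert device_parser.parse_gpu_ids(-1) is None
    assert device_parser.parse_gpu_ids("-1") is None
    assert device_parser.parse_gpu_ids(None) is None  # unchanged
    assert device_parser.parse_gpu_ids(0) is None  # unchanged
```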
6 changes: 6 additions & 0 deletions tests/__init__.py
@@ -12,16 +12,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
+import operator
import os

import numpy as np

+from pytorch_lightning.utilities.imports import _compare_version
+
_TEST_ROOT = os.path.dirname(__file__)
_PROJECT_ROOT = os.path.dirname(_TEST_ROOT)
_TEMP_PATH = os.path.join(_PROJECT_ROOT, "test_temp")
_PATH_DATASETS = os.path.join(_PROJECT_ROOT, "Datasets")
_PATH_LEGACY = os.path.join(_PROJECT_ROOT, "legacy")

+PL_VERSION_LT_1_5 = _compare_version("pytorch_lightning", operator.lt, "1.5")
+TENSORBOARD_VERSION_GE_2_6 = _compare_version("tensorboard", operator.ge, "2.6.0")

# todo: this setting `PYTHONPATH` may not be used by other envs like Conda for import packages
if _PROJECT_ROOT not in os.getenv("PYTHONPATH", ""):
    splitter = ":" if os.environ.get("PYTHONPATH", "") else ""
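A sketch of how these shared constants are meant to be consumed by test modules; the TensorBoard marker appears verbatim in the next file, while the `PL_VERSION_LT_1_5` usage here is hypothetical:

```python
import pytest

from tests import PL_VERSION_LT_1_5, TENSORBOARD_VERSION_GE_2_6


@pytest.mark.skipif(TENSORBOARD_VERSION_GE_2_6, reason="cannot import EventAccumulator in >= 2.6.0")
def test_needs_event_accumulator(tmpdir):
    ...


@pytest.mark.skipif(PL_VERSION_LT_1_5, reason="hypothetical: behavior expected from 1.5 on")
def test_version_gated(tmpdir):
    ...
```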
7 changes: 2 additions & 5 deletions tests/loggers/test_tensorboard.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
-import operator
import os
from argparse import Namespace
from unittest import mock

@@ -25,13 +24,11 @@

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
-from pytorch_lightning.utilities.imports import _compare_version
+from tests import TENSORBOARD_VERSION_GE_2_6
from tests.helpers import BoringModel


-@pytest.mark.skipif(
-    _compare_version("tensorboard", operator.ge, "2.6.0"), reason="cannot import EventAccumulator in >= 2.6.0"
-)
+@pytest.mark.skipif(TENSORBOARD_VERSION_GE_2_6, reason="cannot import EventAccumulator in >= 2.6.0")
def test_tensorboard_hparams_reload(tmpdir):
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

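Worth noting alongside the skipif change: the `EventAccumulator` import stays inside the test body, as sketched below, because a module-scope import would break collection on tensorboard >= 2.6.0 before the skip marker could take effect.

```python
def test_tensorboard_hparams_reload_sketch(tmpdir):
    # Deferred import: only executed when the test is not skipped.
    from tensorboard.backend.event_processing.event_accumulator import EventAccumulator  # noqa: F401
    ...
```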
201 changes: 0 additions & 201 deletions tests/models/test_gpu.py
@@ -11,31 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import operator
import os
from collections import namedtuple
from unittest import mock
from unittest.mock import patch

import pytest
import torch

import tests.helpers.pipelines as tpipes
import tests.helpers.utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.plugins.environments import TorchElasticEnvironment
from pytorch_lightning.utilities import device_parser
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _compare_version
from tests.helpers import BoringModel
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.imports import Batch, Dataset, Example, Field, LabelField
from tests.helpers.runif import RunIf
from tests.helpers.simple_models import ClassificationModel

PL_VERSION_LT_1_5 = _compare_version("pytorch_lightning", operator.lt, "1.5")
PRETEND_N_OF_GPUS = 16


@RunIf(min_gpus=2)
def test_multi_gpu_none_backend(tmpdir):
@@ -55,195 +43,6 @@ def test_multi_gpu_none_backend(tmpdir):
    tpipes.run_model_test(trainer_options, model, dm)


@RunIf(min_gpus=2)
@pytest.mark.parametrize("gpus", [1, [0], [1]])
def test_single_gpu_model(tmpdir, gpus):
    """Make sure single GPU works (DP mode)."""
    trainer_options = dict(
        default_root_dir=tmpdir,
        enable_progress_bar=False,
        max_epochs=1,
        limit_train_batches=0.1,
        limit_val_batches=0.1,
        gpus=gpus,
    )

    model = BoringModel()
    tpipes.run_model_test(trainer_options, model)


@pytest.fixture
def mocked_device_count(monkeypatch):
    def device_count():
        return PRETEND_N_OF_GPUS

    def is_available():
        return True

    monkeypatch.setattr(torch.cuda, "is_available", is_available)
    monkeypatch.setattr(torch.cuda, "device_count", device_count)


@pytest.fixture
def mocked_device_count_0(monkeypatch):
    def device_count():
        return 0

    monkeypatch.setattr(torch.cuda, "device_count", device_count)
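Note that `mocked_device_count_0` patches only `device_count`, never `is_available`; with the new `torch.cuda.is_available()` check in `parse_gpu_ids`, that is presumably why the `-1`-with-zero-GPUs tests in this file stop holding and are deleted by this PR. A hypothetical fixture (not part of this diff) that pins both entry points:

```python
@pytest.fixture
def mocked_no_cuda(monkeypatch):
    # Pin both CUDA entry points the new code path consults.
    monkeypatch.setattr(torch.cuda, "is_available", lambda: False)
    monkeypatch.setattr(torch.cuda, "device_count", lambda: 0)
```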


@pytest.mark.parametrize(
    ["gpus", "expected_num_gpus", "distributed_backend"],
    [
        pytest.param(None, 0, None, id="None - expect 0 gpu to use."),
        pytest.param(0, 0, None, id="0th gpu, expect 0 gpu to use."),
        pytest.param(1, 1, None, id="1st gpu, expect 1 gpu to use."),
        pytest.param(-1, PRETEND_N_OF_GPUS, "ddp", id="-1 - use all gpus"),
        pytest.param("-1", PRETEND_N_OF_GPUS, "ddp", id="'-1' - use all gpus"),
        pytest.param(3, 3, "ddp", id="3 gpus, expect 3 gpus to use (backend:ddp)"),
    ],
)
def test_trainer_gpu_parse(mocked_device_count, gpus, expected_num_gpus, distributed_backend):
    assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus


@pytest.mark.parametrize(
    ["gpus", "expected_num_gpus", "distributed_backend"],
    [
        pytest.param(None, 0, None, id="None - expect 0 gpu to use."),
        pytest.param(None, 0, "ddp", id="None - expect 0 gpu to use."),
    ],
)
def test_trainer_num_gpu_0(mocked_device_count_0, gpus, expected_num_gpus, distributed_backend):
    assert Trainer(gpus=gpus, accelerator=distributed_backend).num_gpus == expected_num_gpus


@pytest.mark.parametrize(
    ["gpus", "expected_root_gpu", "distributed_backend"],
    [
        pytest.param(None, None, "ddp", id="None is None"),
        pytest.param(0, None, "ddp", id="0 gpus, expect gpu root device to be None."),
        pytest.param(1, 0, "ddp", id="1 gpu, expect gpu root device to be 0."),
        pytest.param(-1, 0, "ddp", id="-1 - use all gpus, expect gpu root device to be 0."),
        pytest.param("-1", 0, "ddp", id="'-1' - use all gpus, expect gpu root device to be 0."),
        pytest.param(3, 0, "ddp", id="3 gpus, expect gpu root device to be 0. (backend:ddp)"),
    ],
)
def test_root_gpu_property(mocked_device_count, gpus, expected_root_gpu, distributed_backend):
    assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu


@pytest.mark.parametrize(
    ["gpus", "expected_root_gpu", "distributed_backend"],
    [
        pytest.param(None, None, None, id="None is None"),
        pytest.param(None, None, "ddp", id="None is None"),
        pytest.param(0, None, "ddp", id="0 gpus, expect root gpu to be None"),
    ],
)
def test_root_gpu_property_0_passing(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend):
    assert Trainer(gpus=gpus, accelerator=distributed_backend).root_gpu == expected_root_gpu


# Asking for a gpu when none are available will result in a MisconfigurationException
@pytest.mark.parametrize(
    ["gpus", "expected_root_gpu", "distributed_backend"],
    [
        (1, None, "ddp"),
        (3, None, "ddp"),
        ([1, 2], None, "ddp"),
        ([0, 1], None, "ddp"),
        (-1, None, "ddp"),
        ("-1", None, "ddp"),
    ],
)
def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_gpu, distributed_backend):
    with pytest.raises(MisconfigurationException):
        Trainer(gpus=gpus, accelerator=distributed_backend)


@pytest.mark.parametrize(
    ["gpus", "expected_root_gpu"],
    [
        pytest.param(None, None, id="No gpus, expect gpu root device to be None"),
        pytest.param([0], 0, id="0th gpu, expect gpu root device to be 0."),
        pytest.param([1], 1, id="1st gpu, expect gpu root device to be 1."),
        pytest.param([3], 3, id="3rd gpu, expect gpu root device to be 3."),
        pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."),
    ],
)
def test_determine_root_gpu_device(gpus, expected_root_gpu):
    assert device_parser.determine_root_gpu_device(gpus) == expected_root_gpu


@pytest.mark.parametrize(
    ["gpus", "expected_gpu_ids"],
    [
        (None, None),
        (0, None),
        (1, [0]),
        (3, [0, 1, 2]),
        pytest.param(-1, list(range(PRETEND_N_OF_GPUS)), id="-1 - use all gpus"),
        ([0], [0]),
        ([1, 3], [1, 3]),
        ((1, 3), [1, 3]),
        ("0", None),
        ("3", [0, 1, 2]),
        ("1, 3", [1, 3]),
        ("2,", [2]),
        pytest.param("-1", list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"),
    ],
)
def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids):
    assert device_parser.parse_gpu_ids(gpus) == expected_gpu_ids


@pytest.mark.parametrize("gpus", [0.1, -2, False, [], [-1], [None], ["0"], [0, 0]])
def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus):
    with pytest.raises(MisconfigurationException):
        device_parser.parse_gpu_ids(gpus)


@pytest.mark.parametrize("gpus", [[1, 2, 19], -1, "-1"])
def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus):
    with pytest.raises(MisconfigurationException):
        device_parser.parse_gpu_ids(gpus)


def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count):
    with pytest.raises(MisconfigurationException):
        device_parser.parse_gpu_ids([1, 2, 19])


@pytest.mark.parametrize("gpus", [-1, "-1"])
def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, gpus):
    with pytest.raises(MisconfigurationException):
        device_parser.parse_gpu_ids(gpus)


@mock.patch.dict(
    os.environ,
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "LOCAL_RANK": "1",
        "GROUP_RANK": "1",
        "RANK": "3",
        "WORLD_SIZE": "4",
        "LOCAL_WORLD_SIZE": "2",
    },
)
@mock.patch("torch.cuda.device_count", return_value=1)
@pytest.mark.parametrize("gpus", [[0, 1, 2], 2, "0"])
def test_torchelastic_gpu_parsing(mocked_device_count, gpus):
    """Ensure that when torchelastic is used with the default nproc_per_node of 1 per GPU device,
    we omit sanitizing the gpus, as only one of the GPUs is visible."""
    trainer = Trainer(gpus=gpus)
    assert isinstance(trainer.accelerator_connector.cluster_environment, TorchElasticEnvironment)
    assert trainer.accelerator_connector.parallel_device_ids == device_parser.parse_gpu_ids(gpus)
    assert trainer.gpus == gpus


@RunIf(min_gpus=1)
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)