From ae1f2285df8fede35328e21d1722a9254a2f54e1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 13:20:51 -0400 Subject: [PATCH 1/9] ref: train loop refactors part 2: 1/n --- pytorch_lightning/trainer/trainer.py | 2 +- .../trainer/{training_loop_temp.py => training_loop.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename pytorch_lightning/trainer/{training_loop_temp.py => training_loop.py} (100%) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index f5a8f276594e8..d81c89327f79a 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -56,7 +56,7 @@ from pytorch_lightning.accelerators.accelerator_connector import AcceleratorConnector from pytorch_lightning.trainer.logger_connector import LoggerConnector from pytorch_lightning.trainer.lr_scheduler_connector import LRSchedulerConnector -from pytorch_lightning.trainer.training_loop_temp import TrainLoop +from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning import _logger as log from pytorch_lightning.trainer.tuning import Tuner from pytorch_lightning.utilities.model_utils import is_overridden diff --git a/pytorch_lightning/trainer/training_loop_temp.py b/pytorch_lightning/trainer/training_loop.py similarity index 100% rename from pytorch_lightning/trainer/training_loop_temp.py rename to pytorch_lightning/trainer/training_loop.py From 8b27b8f7c9df6d8c7fc06eb39efadabbb1bf8a20 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 14:01:16 -0400 Subject: [PATCH 2/9] ref: device parser --- pytorch_lightning/trainer/distrib_parts.py | 277 ------------------ pytorch_lightning/trainer/trainer.py | 14 +- pytorch_lightning/tuner/__init__.py | 0 pytorch_lightning/tuner/auto_gpu_select.py | 36 +++ .../{trainer => tuner}/batch_size_scaling.py | 0 .../{trainer => tuner}/tuning.py | 6 +- pytorch_lightning/utilities/device_parser.py | 199 +++++++++++++ tests/models/test_gpu.py | 14 +- 8 files changed, 254 insertions(+), 292 deletions(-) create mode 100644 pytorch_lightning/tuner/__init__.py create mode 100644 pytorch_lightning/tuner/auto_gpu_select.py rename pytorch_lightning/{trainer => tuner}/batch_size_scaling.py (100%) rename pytorch_lightning/{trainer => tuner}/tuning.py (83%) create mode 100644 pytorch_lightning/utilities/device_parser.py diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index c085c8acef375..1a98734a802d9 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -18,43 +18,16 @@ """ -from contextlib import ExitStack -import os from abc import ABC, abstractmethod -import time -import random import torch -from torch.optim.lr_scheduler import _LRScheduler from typing import Union, Callable, Any, List, Optional, Tuple, MutableSequence from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning import _logger as log from pytorch_lightning.overrides.data_parallel import ( LightningDistributedDataParallel, LightningDataParallel, ) from pytorch_lightning.utilities import move_data_to_device, AMPType -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.distributed import rank_zero_only - -try: - from apex import amp -except ImportError: - amp = None - -try: - import torch_xla.core.xla_model as xm -except ImportError: - XLA_AVAILABLE = False -else: - XLA_AVAILABLE = True - -try: - import horovod.torch as hvd 
-except (ModuleNotFoundError, ImportError): - HOROVOD_AVAILABLE = False -else: - HOROVOD_AVAILABLE = True class TrainerDPMixin(ABC): @@ -82,30 +55,10 @@ class TrainerDPMixin(ABC): logger: ... amp_backend: AMPType - @abstractmethod - def call_setup_hook(self, *args): - """Warning: this is just empty shell for code implemented in other class.""" - - @abstractmethod - def init_optimizers(self, *args) -> Tuple[List, List, List]: - """Warning: this is just empty shell for code implemented in other class.""" - @abstractmethod def get_model(self) -> LightningModule: """Warning: this is just empty shell for code implemented in other class.""" - @abstractmethod - def reinit_scheduler_properties(self, *args): - """Warning: this is just empty shell for code implemented in other class.""" - - @abstractmethod - def setup(self, *args) -> None: - """Warning: this is just empty shell for code implemented in other class.""" - - @abstractmethod - def is_function_implemented(self, *args) -> bool: - """Warning: this is just empty shell for code implemented in other class.""" - def copy_trainer_model_properties(self, model): if isinstance(model, LightningDataParallel): ref_model = model.module @@ -152,233 +105,3 @@ def __transfer_batch_to_device(self, batch: Any, device: torch.device): if model is not None: return model.transfer_batch_to_device(batch, device) return move_data_to_device(batch, device) - - -def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]: - if isinstance(s, str): - if s == '-1': - return -1 - else: - return [int(x.strip()) for x in s.split(',') if len(x) > 0] - else: - return s - - -def get_all_available_gpus() -> List[int]: - """ - Returns: - a list of all available gpus - """ - return list(range(torch.cuda.device_count())) - - -def _check_data_type(device_ids: Any) -> None: - """ - Checks that the device_ids argument is one of: None, Int, String or List. - Raises a MisconfigurationException otherwise. - - Args: - device_ids: gpus/tpu_cores parameter as passed to the Trainer - """ - if device_ids is not None and (not isinstance(device_ids, (int, str, MutableSequence)) or isinstance(device_ids, bool)): - raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") - - -def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int]]) -> Optional[List[int]]: - assert gpus is not None - if isinstance(gpus, MutableSequence): - return list(gpus) - - # must be an int - if not gpus: # gpus==0 - return None - if gpus == -1: - return get_all_available_gpus() - - return list(range(gpus)) - - -def sanitize_gpu_ids(gpus: List[int]) -> List[int]: - """ - Checks that each of the GPUs in the list is actually available. - Raises a MisconfigurationException if any of the GPUs is not available. 
- - Args: - gpus: list of ints corresponding to GPU indices - - Returns: - unmodified gpus variable - """ - all_available_gpus = get_all_available_gpus() - misconfig = False - for gpu in gpus: - if gpu not in all_available_gpus: - misconfig = True - - if misconfig: - # sometimes auto ddp might have different flags - # but this is not what the user intended - # correct for the user - if len(gpus) == len(all_available_gpus): - gpus = all_available_gpus - else: - raise MisconfigurationException(f""" - You requested GPUs: {gpus} - But your machine only has: {all_available_gpus} - """) - return gpus - - -def _parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]: - """ - Parses the GPU ids given in the format as accepted by the - :class:`~pytorch_lightning.trainer.Trainer`. - - Args: - gpus: An int -1 or string '-1' indicate that all available GPUs should be used. - A list of ints or a string containing list of comma separated integers - indicates specific GPUs to use. - An int 0 means that no GPUs should be used. - Any int N > 0 indicates that GPUs [0..N) should be used. - - Returns: - a list of gpus to be used or ``None`` if no GPUs were requested - - If no GPUs are available but the value of gpus variable indicates request for GPUs - then a MisconfigurationException is raised. - """ - - # nothing was passed into the GPUs argument - if callable(gpus): - return None - - # Check that gpus param is None, Int, String or List - _check_data_type(gpus) - - # Handle the case when no gpus are requested - if gpus is None or isinstance(gpus, int) and gpus == 0: - return None - - # We know user requested GPUs therefore if some of the - # requested GPUs are not available an exception is thrown. - - gpus = _normalize_parse_gpu_string_input(gpus) - gpus = _normalize_parse_gpu_input_to_list(gpus) - if not gpus: - raise MisconfigurationException("GPUs requested but none are available.") - gpus = sanitize_gpu_ids(gpus) - - return gpus - - -def determine_root_gpu_device(gpus: List[int]) -> Optional[int]: - """ - Args: - gpus: non-empty list of ints representing which gpus to use - - Returns: - designated root GPU device id - """ - if gpus is None: - return None - - assert isinstance(gpus, list), "gpus should be a list" - assert len(gpus) > 0, "gpus should be a non empty list" - - # set root gpu - root_gpu = gpus[0] - - return root_gpu - - -def retry_jittered_backoff(func: Callable, num_retries: int = 5, cap_delay: float = 1.0, base_delay: float = 0.01): - """Retry jittered backoff. - - Based on: - https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ - - Args: - func: tested function - num_retries: number of tries - cap_delay: max sleep time - base_delay: initial sleep time is 10ms - """ - sleep_delay = base_delay # initial sleep time is 10ms - - for i in range(num_retries): - try: - return func() - except RuntimeError as err: - if i == num_retries - 1: - raise err - else: - continue - time.sleep(sleep_delay) - sleep_delay = min(cap_delay, random.uniform(base_delay, sleep_delay * 3)) - - -def _parse_tpu_cores(tpu_cores: Union[int, str, List]) -> Optional[Union[List[int], int]]: - """ - Parses the tpu_cores given in the format as accepted by the - :class:`~pytorch_lightning.trainer.Trainer`. 
- - Args: - tpu_cores: An int 1 or string '1' indicate that 1 core with multi-processing should be used - An int 8 or string '8' indicate that all 8 cores with multi-processing should be used - A list of int or a string containing list of comma separated integer - indicates specific TPU core to use. - - Returns: - a list of tpu_cores to be used or ``None`` if no TPU cores were requested - """ - - if callable(tpu_cores): - return None - - _check_data_type(tpu_cores) - - if isinstance(tpu_cores, str): - tpu_cores = _parse_tpu_cores_str(tpu_cores.strip()) - - if not _tpu_cores_valid(tpu_cores): - raise MisconfigurationException("`tpu_cores` can only be 1, 8 or [<1-8>]") - - return tpu_cores - - -def _tpu_cores_valid(tpu_cores): - return tpu_cores in (1, 8, None) or ( - isinstance(tpu_cores, (list, tuple, set)) and - len(tpu_cores) == 1 and - tpu_cores[0] in range(1, 9) - ) - - -def _parse_tpu_cores_str(tpu_cores): - if tpu_cores in ('1', '8'): - tpu_cores = int(tpu_cores) - else: - tpu_cores = [int(x.strip()) for x in tpu_cores.split(',') if len(x) > 0] - return tpu_cores - - -def pick_single_gpu(exclude_gpus: list): - for i in range(torch.cuda.device_count()): - if i in exclude_gpus: - continue - # Try to allocate on device: - device = torch.device(f"cuda:{i}") - try: - torch.ones(1).to(device) - except RuntimeError: - continue - return i - raise RuntimeError("No GPUs available.") - - -def pick_multiple_gpus(nb): - picked = [] - for _ in range(nb): - picked.append(pick_single_gpu(exclude_gpus=picked)) - - return picked diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d81c89327f79a..63eb3a122d845 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -36,8 +36,8 @@ from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin from pytorch_lightning.trainer.deprecated_api import TrainerDeprecatedAPITillVer0_10 from pytorch_lightning.trainer.distrib_data_parallel import TrainerDDPMixin -from pytorch_lightning.trainer.distrib_parts import (TrainerDPMixin, _parse_gpu_ids, _parse_tpu_cores, - determine_root_gpu_device, pick_multiple_gpus) +from pytorch_lightning.utilities import device_parser +from pytorch_lightning.trainer.distrib_parts import (TrainerDPMixin) from pytorch_lightning.trainer.evaluation_loop import TrainerEvaluationLoopMixin from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.lr_finder import TrainerLRFinderMixin @@ -58,7 +58,7 @@ from pytorch_lightning.trainer.lr_scheduler_connector import LRSchedulerConnector from pytorch_lightning.trainer.training_loop import TrainLoop from pytorch_lightning import _logger as log -from pytorch_lightning.trainer.tuning import Tuner +from pytorch_lightning.tuner.tuning import Tuner from pytorch_lightning.utilities.model_utils import is_overridden # warnings to ignore in trainer @@ -449,7 +449,7 @@ def __init__( raise MisconfigurationException("track_grad_norm can be an int, a float or 'inf' (infinity norm).") self.track_grad_norm = float(track_grad_norm) - self.tpu_cores = _parse_tpu_cores(tpu_cores) + self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores) self.on_tpu = self.tpu_cores is not None self.tpu_id = self.tpu_cores[0] if isinstance(self.tpu_cores, list) else None @@ -507,12 +507,12 @@ def __init__( # for gpus allow int, string and gpu list if auto_select_gpus and isinstance(gpus, int): - self.gpus = pick_multiple_gpus(gpus) + self.gpus = self.tuner.pick_multiple_gpus(gpus) else: self.gpus = 
gpus - self.data_parallel_device_ids = _parse_gpu_ids(self.gpus) - self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids) + self.data_parallel_device_ids = device_parser.parse_gpu_ids(self.gpus) + self.root_gpu = device_parser.determine_root_gpu_device(self.data_parallel_device_ids) self.root_device = torch.device("cpu") self.on_gpu = True if (self.data_parallel_device_ids and torch.cuda.is_available()) else False diff --git a/pytorch_lightning/tuner/__init__.py b/pytorch_lightning/tuner/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/tuner/auto_gpu_select.py b/pytorch_lightning/tuner/auto_gpu_select.py new file mode 100644 index 0000000000000..f1b13a69745bc --- /dev/null +++ b/pytorch_lightning/tuner/auto_gpu_select.py @@ -0,0 +1,36 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + + +def pick_multiple_gpus(nb): + picked = [] + for _ in range(nb): + picked.append(pick_single_gpu(exclude_gpus=picked)) + + return picked + + +def pick_single_gpu(exclude_gpus: list): + for i in range(torch.cuda.device_count()): + if i in exclude_gpus: + continue + # Try to allocate on device: + device = torch.device(f"cuda:{i}") + try: + torch.ones(1).to(device) + except RuntimeError: + continue + return i + raise RuntimeError("No GPUs available.") diff --git a/pytorch_lightning/trainer/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py similarity index 100% rename from pytorch_lightning/trainer/batch_size_scaling.py rename to pytorch_lightning/tuner/batch_size_scaling.py diff --git a/pytorch_lightning/trainer/tuning.py b/pytorch_lightning/tuner/tuning.py similarity index 83% rename from pytorch_lightning/trainer/tuning.py rename to pytorch_lightning/tuner/tuning.py index 331805c9668ac..6601cb93fc62d 100644 --- a/pytorch_lightning/trainer/tuning.py +++ b/pytorch_lightning/tuner/tuning.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.trainer.batch_size_scaling import scale_batch_size +from pytorch_lightning.tuner.batch_size_scaling import scale_batch_size +from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus class Tuner: @@ -30,3 +31,6 @@ def scale_batch_size(self, return scale_batch_size( self.trainer, model, mode, steps_per_trial, init_val, max_trials, batch_arg_name, **fit_kwargs ) + + def pick_multiple_gpus(self, num_gpus: int): + return pick_multiple_gpus(num_gpus) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py new file mode 100644 index 0000000000000..35ae7aa9040dc --- /dev/null +++ b/pytorch_lightning/utilities/device_parser.py @@ -0,0 +1,199 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from typing import Union, Any, List, Optional, MutableSequence +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +def determine_root_gpu_device(gpus: List[int]) -> Optional[int]: + """ + Args: + gpus: non-empty list of ints representing which gpus to use + + Returns: + designated root GPU device id + """ + if gpus is None: + return None + + assert isinstance(gpus, list), "gpus should be a list" + assert len(gpus) > 0, "gpus should be a non empty list" + + # set root gpu + root_gpu = gpus[0] + + return root_gpu + + +def parse_gpu_ids(gpus: Optional[Union[int, str, List[int]]]) -> Optional[List[int]]: + """ + Parses the GPU ids given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer`. + + Args: + gpus: An int -1 or string '-1' indicate that all available GPUs should be used. + A list of ints or a string containing list of comma separated integers + indicates specific GPUs to use. + An int 0 means that no GPUs should be used. + Any int N > 0 indicates that GPUs [0..N) should be used. + + Returns: + a list of gpus to be used or ``None`` if no GPUs were requested + + If no GPUs are available but the value of gpus variable indicates request for GPUs + then a MisconfigurationException is raised. + """ + + # nothing was passed into the GPUs argument + if callable(gpus): + return None + + # Check that gpus param is None, Int, String or List + _check_data_type(gpus) + + # Handle the case when no gpus are requested + if gpus is None or isinstance(gpus, int) and gpus == 0: + return None + + # We know user requested GPUs therefore if some of the + # requested GPUs are not available an exception is thrown. + + gpus = _normalize_parse_gpu_string_input(gpus) + gpus = _normalize_parse_gpu_input_to_list(gpus) + if not gpus: + raise MisconfigurationException("GPUs requested but none are available.") + gpus = _sanitize_gpu_ids(gpus) + + return gpus + + +def parse_tpu_cores(tpu_cores: Union[int, str, List]) -> Optional[Union[List[int], int]]: + """ + Parses the tpu_cores given in the format as accepted by the + :class:`~pytorch_lightning.trainer.Trainer`. + + Args: + tpu_cores: An int 1 or string '1' indicate that 1 core with multi-processing should be used + An int 8 or string '8' indicate that all 8 cores with multi-processing should be used + A list of int or a string containing list of comma separated integer + indicates specific TPU core to use. 
+ + Returns: + a list of tpu_cores to be used or ``None`` if no TPU cores were requested + """ + + if callable(tpu_cores): + return None + + _check_data_type(tpu_cores) + + if isinstance(tpu_cores, str): + tpu_cores = _parse_tpu_cores_str(tpu_cores.strip()) + + if not _tpu_cores_valid(tpu_cores): + raise MisconfigurationException("`tpu_cores` can only be 1, 8 or [<1-8>]") + + return tpu_cores + + +def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]: + if isinstance(s, str): + if s == '-1': + return -1 + else: + return [int(x.strip()) for x in s.split(',') if len(x) > 0] + else: + return s + + +def _sanitize_gpu_ids(gpus: List[int]) -> List[int]: + """ + Checks that each of the GPUs in the list is actually available. + Raises a MisconfigurationException if any of the GPUs is not available. + + Args: + gpus: list of ints corresponding to GPU indices + + Returns: + unmodified gpus variable + """ + all_available_gpus = _get_all_available_gpus() + misconfig = False + for gpu in gpus: + if gpu not in all_available_gpus: + misconfig = True + + if misconfig: + # sometimes auto ddp might have different flags + # but this is not what the user intended + # correct for the user + if len(gpus) == len(all_available_gpus): + gpus = all_available_gpus + else: + raise MisconfigurationException(f""" + You requested GPUs: {gpus} + But your machine only has: {all_available_gpus} + """) + return gpus + + +def _normalize_parse_gpu_input_to_list(gpus: Union[int, List[int]]) -> Optional[List[int]]: + assert gpus is not None + if isinstance(gpus, MutableSequence): + return list(gpus) + + # must be an int + if not gpus: # gpus==0 + return None + if gpus == -1: + return _get_all_available_gpus() + + return list(range(gpus)) + + +def _get_all_available_gpus() -> List[int]: + """ + Returns: + a list of all available gpus + """ + return list(range(torch.cuda.device_count())) + + +def _check_data_type(device_ids: Any) -> None: + """ + Checks that the device_ids argument is one of: None, Int, String or List. + Raises a MisconfigurationException otherwise. 
+ + Args: + device_ids: gpus/tpu_cores parameter as passed to the Trainer + """ + if device_ids is not None and \ + (not isinstance(device_ids, (int, str, MutableSequence)) or isinstance(device_ids, bool)): + raise MisconfigurationException("Device ID's (GPU/TPU) must be int, string or sequence of ints or None.") + + +def _tpu_cores_valid(tpu_cores): + return tpu_cores in (1, 8, None) or ( + isinstance(tpu_cores, (list, tuple, set)) and + len(tpu_cores) == 1 and + tpu_cores[0] in range(1, 9) + ) + + +def _parse_tpu_cores_str(tpu_cores): + if tpu_cores in ('1', '8'): + tpu_cores = int(tpu_cores) + else: + tpu_cores = [int(x.strip()) for x in tpu_cores.split(',') if len(x) > 0] + return tpu_cores diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b6a2efbb8621b..fa13aaf4a7e93 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -14,7 +14,7 @@ import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.core import memory -from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device +from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate from tests.models.data.ddp import train_test_variations @@ -275,7 +275,7 @@ def test_root_gpu_property_0_raising(mocked_device_count_0, gpus, expected_root_ pytest.param([1, 2], 1, id="[1, 2] gpus, expect gpu root device to be 1."), ]) def test_determine_root_gpu_device(gpus, expected_root_gpu): - assert determine_root_gpu_device(gpus) == expected_root_gpu + assert device_parser.determine_root_gpu_device(gpus) == expected_root_gpu @pytest.mark.gpus_param_tests @@ -294,7 +294,7 @@ def test_determine_root_gpu_device(gpus, expected_root_gpu): pytest.param('-1', list(range(PRETEND_N_OF_GPUS)), id="'-1' - use all gpus"), ]) def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): - assert _parse_gpu_ids(gpus) == expected_gpu_ids + assert device_parser.parse_gpu_ids(gpus) == expected_gpu_ids @pytest.mark.gpus_param_tests @@ -310,27 +310,27 @@ def test_parse_gpu_ids(mocked_device_count, gpus, expected_gpu_ids): ]) def test_parse_gpu_fail_on_unsupported_inputs(mocked_device_count, gpus): with pytest.raises(MisconfigurationException): - _parse_gpu_ids(gpus) + device_parser.parse_gpu_ids(gpus) @pytest.mark.gpus_param_tests @pytest.mark.parametrize("gpus", [[1, 2, 19], -1, '-1']) def test_parse_gpu_fail_on_non_existent_id(mocked_device_count_0, gpus): with pytest.raises(MisconfigurationException): - _parse_gpu_ids(gpus) + device_parser.parse_gpu_ids(gpus) @pytest.mark.gpus_param_tests def test_parse_gpu_fail_on_non_existent_id_2(mocked_device_count): with pytest.raises(MisconfigurationException): - _parse_gpu_ids([1, 2, 19]) + device_parser.parse_gpu_ids([1, 2, 19]) @pytest.mark.gpus_param_tests @pytest.mark.parametrize("gpus", [-1, '-1']) def test_parse_gpu_returns_none_when_no_devices_are_available(mocked_device_count_0, gpus): with pytest.raises(MisconfigurationException): - _parse_gpu_ids(gpus) + device_parser.parse_gpu_ids(gpus) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") From 42508271ce10b93eddf3262fe4c2ca57aec2a018 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 14:07:51 -0400 Subject: [PATCH 3/9] ref: device parser --- pytorch_lightning/utilities/device_parser.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git 
a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 35ae7aa9040dc..ef1807b43f3ab 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -184,11 +184,17 @@ def _check_data_type(device_ids: Any) -> None: def _tpu_cores_valid(tpu_cores): - return tpu_cores in (1, 8, None) or ( - isinstance(tpu_cores, (list, tuple, set)) and - len(tpu_cores) == 1 and - tpu_cores[0] in range(1, 9) - ) + # allow 1 or 8 cores + if tpu_cores in (1, 8 , None): + return True + + # allow picking 1 of 8 indexes + is_tpu_list = isinstance(tpu_cores, (list, tuple, set)) + has_1_tpu_idx = len(tpu_cores) == 1 + is_valid_tpu_idx = tpu_cores[0] in range(1, 9) + + is_valid_tpu_core_choice = is_tpu_list and has_1_tpu_idx and is_valid_tpu_idx + return is_valid_tpu_core_choice def _parse_tpu_cores_str(tpu_cores): From 8c2c1d27475230d5817a217c7fd5fdf15c354597 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 14:09:53 -0400 Subject: [PATCH 4/9] ref: device parser --- pytorch_lightning/utilities/device_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index ef1807b43f3ab..619d722a35f25 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -185,7 +185,7 @@ def _check_data_type(device_ids: Any) -> None: def _tpu_cores_valid(tpu_cores): # allow 1 or 8 cores - if tpu_cores in (1, 8 , None): + if tpu_cores in (1, 8, None): return True # allow picking 1 of 8 indexes From a2c17ab70ccb057d26f51f72d800c22715d04d48 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 14:15:34 -0400 Subject: [PATCH 5/9] ref: device parser --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 721e8b763619a..d37498e9e9f5d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -140,3 +140,4 @@ Indices and tables api/pytorch_lightning.profiler api/pytorch_lightning.trainer api/pytorch_lightning.utilities + api/pytorch_lightning.tuner From a9b4e212485b0be948b94d1353d4a9932f23c75b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 14:29:13 -0400 Subject: [PATCH 6/9] ref: device parser --- pytorch_lightning/utilities/device_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 619d722a35f25..4f798918f16df 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -189,12 +189,12 @@ def _tpu_cores_valid(tpu_cores): return True # allow picking 1 of 8 indexes - is_tpu_list = isinstance(tpu_cores, (list, tuple, set)) - has_1_tpu_idx = len(tpu_cores) == 1 - is_valid_tpu_idx = tpu_cores[0] in range(1, 9) + if isinstance(tpu_cores, (list, tuple, set)): + has_1_tpu_idx = len(tpu_cores) == 1 + is_valid_tpu_idx = tpu_cores[0] in range(1, 9) - is_valid_tpu_core_choice = is_tpu_list and has_1_tpu_idx and is_valid_tpu_idx - return is_valid_tpu_core_choice + is_valid_tpu_core_choice = has_1_tpu_idx and is_valid_tpu_idx + return is_valid_tpu_core_choice def _parse_tpu_cores_str(tpu_cores): From 4955db537fdfeff96ccbb7280450a622ff0ccdbf Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 14:45:50 -0400 Subject: [PATCH 7/9] ref: device parser --- 
pytorch_lightning/utilities/device_parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/utilities/device_parser.py b/pytorch_lightning/utilities/device_parser.py index 4f798918f16df..f67b09eccf51d 100644 --- a/pytorch_lightning/utilities/device_parser.py +++ b/pytorch_lightning/utilities/device_parser.py @@ -196,6 +196,8 @@ def _tpu_cores_valid(tpu_cores): is_valid_tpu_core_choice = has_1_tpu_idx and is_valid_tpu_idx return is_valid_tpu_core_choice + return False + def _parse_tpu_cores_str(tpu_cores): if tpu_cores in ('1', '8'): From ba03a3de9f526f3a3c22be269a8d8a367d64d2ca Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 15:08:25 -0400 Subject: [PATCH 8/9] ref: device parser --- pytorch_lightning/trainer/distrib_parts.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 1a98734a802d9..95a0010238ad7 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -29,6 +29,13 @@ ) from pytorch_lightning.utilities import move_data_to_device, AMPType +try: + import horovod.torch as hvd +except (ModuleNotFoundError, ImportError): + HOROVOD_AVAILABLE = False +else: + HOROVOD_AVAILABLE = True + class TrainerDPMixin(ABC): From 35a7982d595c48c3d7dd969d0d23d20df6d1ff68 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 8 Sep 2020 18:07:04 -0400 Subject: [PATCH 9/9] ref: device parser --- pytorch_lightning/trainer/trainer.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 63eb3a122d845..b3b74575ea89d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -373,6 +373,20 @@ def __init__( if 'LOCAL_RANK' in os.environ: rank_zero_only.rank = int(os.environ['LOCAL_RANK']) + # tracks internal state for debugging + self.dev_debugger = InternalDebugger(self) + self.config_validator = ConfigValidator(self) + self.data_connector = DataConnector(self) + self.lr_scheduler_connector = LRSchedulerConnector(self) + self.accelerator_connector = AcceleratorConnector(self) + self.logger_connector = LoggerConnector(self) + self.tuner = Tuner(self) + self.accelerator_backend = None + + # loops + self.evaluation_loop = EvaluationLoop(self) + self.train_loop = TrainLoop(self) + # training bookeeping self.total_batch_idx = 0 self.running_loss = TensorRunningAccum(window_length=20) @@ -605,20 +619,6 @@ def __init__( self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') - # tracks internal state for debugging - self.dev_debugger = InternalDebugger(self) - self.config_validator = ConfigValidator(self) - self.data_connector = DataConnector(self) - self.lr_scheduler_connector = LRSchedulerConnector(self) - self.accelerator_connector = AcceleratorConnector(self) - self.logger_connector = LoggerConnector(self) - self.tuner = Tuner(self) - self.accelerator_backend = None - - # loops - self.evaluation_loop = EvaluationLoop(self) - self.train_loop = TrainLoop(self) - # Callback system self.on_init_end()
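
Reviewer note (outside the patch series): the sketch below is a hypothetical usage example of the helpers that patch 2/9 relocates into pytorch_lightning/utilities/device_parser.py. Import paths and function names come from the diffs above; the concrete return values assume a machine where the requested devices actually exist.

    from pytorch_lightning.utilities import device_parser

    # '-1' (or -1) expands to every visible CUDA device; a comma-separated
    # string selects specific ids. Requesting an unavailable id raises
    # MisconfigurationException.
    gpu_ids = device_parser.parse_gpu_ids("0,1")                  # e.g. [0, 1]
    root_gpu = device_parser.determine_root_gpu_device(gpu_ids)   # first id -> 0

    # tpu_cores accepts 1, 8, or a single core index in [1..8],
    # given as an int, a string, or a one-element list.
    device_parser.parse_tpu_cores("8")    # -> 8
    device_parser.parse_tpu_cores([5])    # -> [5]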
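
Similarly, a hypothetical sketch of the auto-GPU selection that patch 2/9 moves into pytorch_lightning/tuner/auto_gpu_select.py: pick_multiple_gpus greedily test-allocates a small tensor on each candidate device and skips any GPU that raises a RuntimeError, so the result depends on the state of the machine it runs on.

    from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus

    # On a healthy multi-GPU box this returns the first N allocatable device ids,
    # e.g. [0, 1]; with no usable GPUs it raises RuntimeError("No GPUs available.").
    picked = pick_multiple_gpus(2)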
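
Finally, a hypothetical end-user view: the Trainer flags themselves are unchanged by this refactor, only the code paths behind them move (gpus/tpu_cores now go through device_parser, and auto_select_gpus through the new Tuner). The example assumes a machine with at least two visible GPUs.

    from pytorch_lightning import Trainer

    # parsed via device_parser.parse_gpu_ids / determine_root_gpu_device
    trainer = Trainer(gpus="0,1")

    # an integer count with auto_select_gpus=True is resolved by
    # Tuner.pick_multiple_gpus instead of being range-expanded
    trainer = Trainer(gpus=2, auto_select_gpus=True)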