Refactor: Runif for TPU and Horovod 5/n #6301

Merged 8 commits on Mar 2, 2021

Changes from all commits
2 changes: 1 addition & 1 deletion tests/accelerators/test_accelerator_connector.py
@@ -84,7 +84,7 @@ def test_accelerator_choice_ddp_spawn(cuda_available_mock, device_count_mock):
assert isinstance(trainer.training_type_plugin.cluster_environment, TorchElasticEnvironment)


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@RunIf(min_gpus=2)
@mock.patch.dict(
os.environ, {
"CUDA_VISIBLE_DEVICES": "0,1",
10 changes: 5 additions & 5 deletions tests/accelerators/test_tpu_backend.py
@@ -18,8 +18,8 @@

from pytorch_lightning import Trainer
from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import _TPU_AVAILABLE
from tests.helpers.boring_model import BoringModel
from tests.helpers.runif import RunIf
from tests.helpers.utils import pl_multi_process_test


@@ -39,7 +39,7 @@ def forward(self, x):
return x


@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@RunIf(tpu=True)
@pl_multi_process_test
def test_resume_training_on_cpu(tmpdir):
""" Checks if training can be resumed from a saved checkpoint on CPU"""
@@ -70,7 +70,7 @@ def test_resume_training_on_cpu(tmpdir):
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"


@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@RunIf(tpu=True)
@pl_multi_process_test
def test_if_test_works_after_train(tmpdir):
""" Ensure that .test() works after .fit() """
@@ -82,7 +82,7 @@ def test_if_test_works_after_train(tmpdir):
assert len(trainer.test(model)) == 1


@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@RunIf(tpu=True)
@pl_multi_process_test
def test_weight_tying_warning(tmpdir, capsys=None):
"""
@@ -98,7 +98,7 @@ def test_weight_tying_warning(tmpdir, capsys=None):
assert result


@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine")
@RunIf(tpu=True)
@pl_multi_process_test
def test_if_weights_tied(tmpdir, capsys=None):
"""
7 changes: 2 additions & 5 deletions tests/checkpointing/test_model_checkpoint.py
@@ -15,7 +15,6 @@
import math
import os
import pickle
import platform
import re
from argparse import Namespace
from pathlib import Path
@@ -38,6 +37,7 @@
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.helpers import BoringModel
from tests.helpers.runif import RunIf


class LogInTwoMethods(BoringModel):
@@ -364,10 +364,7 @@ def on_train_end(self, trainer, pl_module):
assert torch.save.call_count == 0


@pytest.mark.skipif(
platform.system() == "Windows",
reason="Distributed training is not supported on Windows",
)
@RunIf(skip_windows=True)
def test_model_checkpoint_no_extraneous_invocations(tmpdir):
"""Test to ensure that the model callback saves the checkpoints only once in distributed mode."""
model = LogInTwoMethods()
47 changes: 43 additions & 4 deletions tests/helpers/runif.py
@@ -19,7 +19,21 @@
import torch
from pkg_resources import get_distribution

from pytorch_lightning.utilities import _APEX_AVAILABLE, _NATIVE_AMP_AVAILABLE, _TORCH_QUANTIZE_AVAILABLE
from pytorch_lightning.utilities import (
_APEX_AVAILABLE,
_HOROVOD_AVAILABLE,
_NATIVE_AMP_AVAILABLE,
_TORCH_QUANTIZE_AVAILABLE,
_TPU_AVAILABLE,
)

try:
from horovod.common.util import nccl_built
nccl_built()
except (ImportError, ModuleNotFoundError, AttributeError):
_HOROVOD_NCCL_AVAILABLE = False
finally:
_HOROVOD_NCCL_AVAILABLE = True


class RunIf:
@@ -37,9 +51,13 @@ def __new__(
*args,
min_gpus: int = 0,
min_torch: Optional[str] = None,
min_python: Optional[str] = None,
quantization: bool = False,
amp_apex: bool = False,
amp_native: bool = False,
tpu: bool = False,
horovod: bool = False,
horovod_nccl: bool = False,
skip_windows: bool = False,
**kwargs
):
@@ -48,9 +66,13 @@ def __new__(
args: native pytest.mark.skipif arguments
min_gpus: min number of gpus required to run test
min_torch: minimum pytorch version to run test
min_python: minimum python version required to run test
quantization: if `torch.quantization` package is required to run test
amp_apex: if NVIDIA Apex is installed
amp_native: if native PyTorch AMP is supported
tpu: if TPU is available
horovod: if Horovod is installed
horovod_nccl: if Horovod is installed with NCCL support
skip_windows: skip test for Windows platform (typically for some limited torch functionality)
kwargs: native pytest.mark.skipif keyword arguments
"""
@@ -66,23 +88,40 @@ def __new__(
conditions.append(torch_version < LooseVersion(min_torch))
reasons.append(f"torch>={min_torch}")

if min_python:
py_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
conditions.append(py_version < LooseVersion(min_python))
reasons.append(f"python>={min_python}")

if quantization:
_miss_default = 'fbgemm' not in torch.backends.quantized.supported_engines
conditions.append(not _TORCH_QUANTIZE_AVAILABLE or _miss_default)
reasons.append("missing PyTorch quantization")
reasons.append("PyTorch quantization")

if amp_native:
conditions.append(not _NATIVE_AMP_AVAILABLE)
reasons.append("missing native AMP")
reasons.append("native AMP")

if amp_apex:
conditions.append(not _APEX_AVAILABLE)
reasons.append("missing NVIDIA Apex")
reasons.append("NVIDIA Apex")

if skip_windows:
conditions.append(sys.platform == "win32")
reasons.append("unimplemented on Windows")

if tpu:
conditions.append(not _TPU_AVAILABLE)
reasons.append("TPU")

if horovod:
conditions.append(not _HOROVOD_AVAILABLE)
reasons.append("Horovod")

if horovod_nccl:
conditions.append(not _HOROVOD_NCCL_AVAILABLE)
reasons.append("Horovod with NCCL")

reasons = [rs for cond, rs in zip(conditions, reasons) if cond]
return pytest.mark.skipif(
*args,
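A note on the _HOROVOD_NCCL_AVAILABLE probe added to runif.py above: because a finally clause always executes, the flag ends up True even when the import or the nccl_built() call raises. The sketch below is an editorial illustration of the presumed intent, not code from this PR; it keeps the same probe but uses an else branch so a failed probe leaves the flag False:

# Sketch only (not part of this PR): same NCCL probe as above, but with an
# else branch instead of finally, so the flag stays False when the probe fails.
try:
    from horovod.common.util import nccl_built
    nccl_built()
except (ImportError, ModuleNotFoundError, AttributeError):
    _HOROVOD_NCCL_AVAILABLE = False
else:
    _HOROVOD_NCCL_AVAILABLE = True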
6 changes: 2 additions & 4 deletions tests/loggers/test_tensorboard.py
@@ -25,12 +25,10 @@
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from tests.helpers import BoringModel
from tests.helpers.runif import RunIf


@pytest.mark.skipif(
LooseVersion(torch.__version__) < LooseVersion("1.5.0"),
reason="Minimal PT version is set to 1.5",
)
@RunIf(min_torch="1.5.0")
def test_tensorboard_hparams_reload(tmpdir):

class CustomModel(BoringModel):
30 changes: 8 additions & 22 deletions tests/models/test_horovod.py
@@ -40,14 +40,6 @@
# This script will run the actual test model training in parallel
TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py')

try:
from horovod.common.util import nccl_built
nccl_built()
except (ImportError, ModuleNotFoundError, AttributeError):
_HOROVOD_NCCL_AVAILABLE = False
finally:
_HOROVOD_NCCL_AVAILABLE = True


def _run_horovod(trainer_options, on_gpu=False):
"""Execute the training script across multiple workers in parallel."""
@@ -99,8 +91,7 @@ def test_horovod_cpu_implicit(tmpdir):
_run_horovod(trainer_options)


@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
@RunIf(min_gpus=2, skip_windows=True)
@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
def test_horovod_multi_gpu(tmpdir):
"""Test Horovod with multi-GPU support."""
trainer_options = dict(
@@ -118,9 +109,8 @@ def test_horovod_multi_gpu(tmpdir):
_run_horovod(trainer_options, on_gpu=True)


@pytest.mark.skip(reason="Horovod has a problem with broadcast when using apex?")
@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
@RunIf(min_gpus=2, skip_windows=True, amp_apex=True)
@pytest.mark.skip(reason="Horovod has a problem with broadcast when using apex?") # todo
@RunIf(min_gpus=2, skip_windows=True, amp_apex=True, horovod_nccl=True)
def test_horovod_apex(tmpdir):
"""Test Horovod with multi-GPU support using apex amp."""
trainer_options = dict(
@@ -140,9 +130,8 @@ def test_horovod_apex(tmpdir):
_run_horovod(trainer_options, on_gpu=True)


@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp")
@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
@RunIf(min_gpus=2, skip_windows=True, amp_native=True)
@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") # todo
@RunIf(min_gpus=2, skip_windows=True, amp_native=True, horovod_nccl=True)
def test_horovod_amp(tmpdir):
"""Test Horovod with multi-GPU support using native amp."""
trainer_options = dict(
@@ -162,8 +151,7 @@ def test_horovod_amp(tmpdir):
_run_horovod(trainer_options, on_gpu=True)


@pytest.mark.skipif(not _HOROVOD_NCCL_AVAILABLE, reason="test requires Horovod with NCCL support")
@RunIf(min_gpus=1, skip_windows=True)
@RunIf(min_gpus=1, skip_windows=True, horovod_nccl=True)
def test_horovod_transfer_batch_to_gpu(tmpdir):

class TestTrainingStepModel(BoringModel):
@@ -225,8 +213,7 @@ def get_optimizer_params(optimizer):

# TODO: unclear Horovod failure...
@pytest.mark.skip(reason="unclear Horovod failure...")
@pytest.mark.skipif(not _HOROVOD_AVAILABLE, reason="Horovod is unavailable")
@RunIf(skip_windows=True)
@RunIf(skip_windows=True, horovod=True)
def test_result_reduce_horovod(tmpdir):
"""Make sure result logging works with Horovod.

@@ -276,8 +263,7 @@ def training_epoch_end(self, outputs) -> None:

# TODO: unclear Horovod failure...
@pytest.mark.skip(reason="unclear Horovod failure...")
@pytest.mark.skipif(not _HOROVOD_AVAILABLE, reason="Horovod is unavailable")
@RunIf(skip_windows=True)
@RunIf(skip_windows=True, horovod=True)
def test_accuracy_metric_horovod():
num_batches = 10
batch_size = 16
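For reference, the new flags let the Horovod tests above declare every requirement in a single decorator instead of stacking separate pytest.mark.skipif lines. A minimal, hypothetical usage sketch (the test name and body are illustrative and not part of this PR):

# Hypothetical example: RunIf turns each flag into one skip condition and
# emits a single pytest.mark.skipif, so the test runs only when all
# requirements are met.
from tests.helpers.runif import RunIf


@RunIf(min_gpus=2, skip_windows=True, horovod_nccl=True)
def test_example_horovod_gpu(tmpdir):
    # Skipped unless at least 2 GPUs are visible, the platform is not Windows,
    # and Horovod reports an NCCL build.
    ...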
6 changes: 1 addition & 5 deletions tests/models/test_torchscript.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from distutils.version import LooseVersion

import pytest
import torch
@@ -130,10 +129,7 @@ def test_torchscript_properties(tmpdir, modelclass):
ParityModuleRNN,
BasicGAN,
])
@pytest.mark.skipif(
LooseVersion(torch.__version__) < LooseVersion("1.5.0"),
reason="torch.save/load has bug loading script modules on torch <= 1.4",
)
@RunIf(min_torch="1.5.0")
def test_torchscript_save_load(tmpdir, modelclass):
""" Test that scripted LightningModule is correctly saved and can be loaded. """
model = modelclass()