diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py
new file mode 100644
index 0000000000000..1ac2e110dd599
--- /dev/null
+++ b/tests/models/data/ddp/train_test_variations.py
@@ -0,0 +1,65 @@
+"""
+Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus.
+"""
+from argparse import ArgumentParser
+
+from pytorch_lightning import Trainer, seed_everything
+from tests.base import EvalModelTemplate
+
+
+def variation_fit_test(trainer, model):
+    trainer.fit(model)
+    trainer.test(model)
+
+
+def variation_test_fit(trainer, model):
+    trainer.test(model)
+    trainer.fit(model)
+
+
+def variation_fit_fit(trainer, model):
+    trainer.fit(model)
+    trainer.fit(model)
+
+
+def variation_test_test(trainer, model):
+    trainer.test(model)
+    trainer.test(model)
+
+
+def variation_test_fit_test(trainer, model):
+    trainer.test(model)
+    trainer.fit(model)
+    trainer.test(model)
+
+
+def get_variations():
+    variations = [
+        "variation_fit_test",
+        "variation_test_fit",
+        "variation_fit_fit",
+        "variation_test_test",
+        "variation_test_fit_test",
+    ]
+    return variations
+
+
+def main():
+    seed_everything(1234)
+    parser = ArgumentParser(add_help=False)
+    parser = Trainer.add_argparse_args(parser)
+    parser.add_argument('--variation', default=variation_fit_test.__name__)
+    parser.set_defaults(gpus=2)
+    parser.set_defaults(distributed_backend="ddp")
+    args = parser.parse_args()
+
+    model = EvalModelTemplate()
+    trainer = Trainer.from_argparse_args(args)
+
+    # run the chosen variation
+    run_variation = globals()[args.variation]
+    run_variation(trainer, model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 7497a53083612..39137c9805437 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -1,4 +1,9 @@
+import os
+import subprocess
+import sys
 from collections import namedtuple
+from pathlib import Path
+from unittest import mock
 
 import pytest
 import torch
@@ -6,11 +11,13 @@
 
 import tests.base.develop_pipelines as tpipes
 import tests.base.develop_utils as tutils
+import pytorch_lightning
 from pytorch_lightning import Trainer
 from pytorch_lightning.core import memory
 from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import EvalModelTemplate
+from tests.models.data.ddp import train_test_variations
 
 PRETEND_N_OF_GPUS = 16
 
@@ -93,6 +100,35 @@ def test_multi_gpu_model_dp(tmpdir):
     memory.get_memory_profile('min_max')
 
 
+@pytest.mark.parametrize('cli_args', [
+    pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
+])
+@pytest.mark.parametrize('variation', train_test_variations.get_variations())
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
+    """Run one fit/test variation script in a subprocess and fail on a non-zero exit code."""
+    file = Path(train_test_variations.__file__).absolute()
+    cli_args = cli_args.split(' ') if cli_args else []
+    cli_args += ['--default_root_dir', str(tmpdir)]
+    command = [sys.executable, str(file), '--variation', variation] + cli_args
+    env = os.environ.copy()
+    # prepend the repo root (PYTHONPATH entries must be directories, not the package
+    # __init__.py file) so the child process can import `pytorch_lightning` and `tests`
+    env['PYTHONPATH'] = f'{Path(pytorch_lightning.__file__).parent.parent}:' + env.get('PYTHONPATH', '')
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
+
+    # communicate() may be called only once; a second call would see closed pipes
+    std, err = p.communicate(timeout=60)
+    std = std.decode('utf-8').strip()
+    err = err.decode('utf-8').strip()
+    assert std
+    if p.returncode:
+        print(std)
+        print(err)
+        print(command)
+        pytest.fail(err)
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_model_ddp_spawn(tmpdir):
     tutils.set_random_master_port()