From 52745c9aeea687324bac5f826132bcf082456566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:32:14 +0200 Subject: [PATCH 01/32] add ddp script variations --- tests/models/data/ddp/train_default_model.py | 54 ++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tests/models/data/ddp/train_default_model.py diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py new file mode 100644 index 0000000000000..2949750951099 --- /dev/null +++ b/tests/models/data/ddp/train_default_model.py @@ -0,0 +1,54 @@ +""" +Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus. +""" +from argparse import ArgumentParser + +from pytorch_lightning import Trainer, seed_everything +from tests.base import EvalModelTemplate + + +def variation_fit_test(trainer, model): + trainer.fit(model) + trainer.test(model) + + +def variation_test_fit(trainer, model): + trainer.test(model) + trainer.fit(model) + + +def variation_test_test(trainer, model): + trainer.test(model) + trainer.test(model) + + +def variation_test_fit_test(trainer, model): + trainer.test(model) + trainer.fit(model) + trainer.test(model) + + +def get_variations(): + variations = [v for v in globals() if v.startswith("variation")] + return variations + + +def main(): + seed_everything(1234) + parser = ArgumentParser(add_help=False) + parser = Trainer.add_argparse_args(parser) + parser.add_argument('--variation', default=variation_fit_test.__name__) + parser.set_defaults(gpus=2) + parser.set_defaults(distributed_backend="ddp") + args = parser.parse_args() + + model = EvalModelTemplate() + trainer = Trainer.from_argparse_args(args) + + # run the chosen variation + run_variation = globals()[args.variation] + run_variation(trainer, model) + + +if __name__ == '__main__': + main() From 891bb93ffb1429074edda8389cdf50929eeff2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 6 Aug 2020 23:34:01 +0200 Subject: [PATCH 02/32] add ddp test --- tests/models/test_gpu.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 509de4c07563a..0e308598b3691 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -1,5 +1,8 @@ +import subprocess +import sys from collections import namedtuple from unittest.mock import patch +from pathlib import Path import pytest import torch @@ -12,6 +15,7 @@ from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate +from tests.models.data.ddp import train_default_model PRETEND_N_OF_GPUS = 16 @@ -94,6 +98,26 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') +@pytest.mark.parametrize('cli_args', [ + pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), +]) +@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): + file = Path(train_default_model.__file__).absolute() + cli_args = cli_args.split(' ') if cli_args else [] + cli_args += ['--default_root_dir', str(tmpdir)] + command = [sys.executable, file, '--variation', variation] + cli_args + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + std, err = p.communicate(timeout=60) + #assert std and not err + if p.returncode: + print(std) + print(err) + print(command) + raise RuntimeError('error') + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): tutils.set_random_master_port() From 030ac29c4ff980b76522e3f4865c21be91fb49f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:22:07 +0200 Subject: [PATCH 03/32] rename --- .../{train_default_model.py => train_test_variations.py} | 0 tests/models/test_gpu.py | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) rename tests/models/data/ddp/{train_default_model.py => train_test_variations.py} (100%) diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_test_variations.py similarity index 100% rename from tests/models/data/ddp/train_default_model.py rename to tests/models/data/ddp/train_test_variations.py diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 0e308598b3691..2a4d335fa5918 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -15,7 +15,7 @@ from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -from tests.models.data.ddp import train_default_model +from tests.models.data.ddp import train_test_variations PRETEND_N_OF_GPUS = 16 @@ -101,14 +101,14 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('cli_args', [ pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'), ]) -@pytest.mark.parametrize('variation', train_default_model.get_variations()) +@pytest.mark.parametrize('variation', train_test_variations.get_variations()) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): - file = Path(train_default_model.__file__).absolute() + file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) std, err = p.communicate(timeout=60) #assert std and not err if p.returncode: From 5d7b95b35cf0e9e7be9f6acf80629559cfe3538d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:38:53 +0200 Subject: [PATCH 04/32] shell --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 2a4d335fa5918..35611a37b6690 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -108,7 +108,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate(timeout=60) #assert std and not err if p.returncode: From fa4474987a392d0df0ee2096022cff7f69e8c070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:43:22 +0200 Subject: [PATCH 05/32] test --- tests/models/test_gpu.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 35611a37b6690..7f70e7f37e185 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -110,12 +110,14 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): command = [sys.executable, file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) std, err = p.communicate(timeout=60) - #assert std and not err + std = std.decode('utf-8').strip() + err = err.decode('utf-8').strip() + assert std and not err if p.returncode: print(std) print(err) print(command) - raise RuntimeError('error') + pytest.fail(err) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From f18735232c90e6e770332e06cbc7a64cbbedb4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 02:55:54 +0200 Subject: [PATCH 06/32] test --- tests/models/test_gpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 7f70e7f37e185..65b5d1f55e275 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -112,7 +112,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() - assert std and not err + # assert std and not err if p.returncode: print(std) print(err) From ca6558b0bedc299d335d4d4845da5c40b7875c0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 03:39:02 +0200 Subject: [PATCH 07/32] try call --- tests/models/test_gpu.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 65b5d1f55e275..6d05fefdd418b 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -108,16 +108,17 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - std, err = p.communicate(timeout=60) - std = std.decode('utf-8').strip() - err = err.decode('utf-8').strip() + exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + raise SystemExit(exitcode) + # std, err = p.communicate(timeout=60) + # std = std.decode('utf-8').strip() + # err = err.decode('utf-8').strip() # assert std and not err - if p.returncode: - print(std) - print(err) - print(command) - pytest.fail(err) + # if p.returncode: + # print(std) + # print(err) + # print(command) + # pytest.fail(err) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From a89bfa95234fd917ccc1ca35f8b0e8b52a6d269f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 15:59:04 +0200 Subject: [PATCH 08/32] try without subprocess --- tests/models/test_gpu.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 6d05fefdd418b..84b67ebf17209 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -3,6 +3,7 @@ from collections import namedtuple from unittest.mock import patch from pathlib import Path +from unittest import mock import pytest import torch @@ -107,9 +108,9 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - command = [sys.executable, file, '--variation', variation] + cli_args - exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - raise SystemExit(exitcode) + # command = [sys.executable, file, '--variation', variation] + cli_args + # exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + # raise SystemExit(exitcode) # std, err = p.communicate(timeout=60) # std = std.decode('utf-8').strip() # err = err.decode('utf-8').strip() @@ -120,6 +121,11 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): # print(command) # pytest.fail(err) + cli_args += ['--variation', variation] + from tests.models.data.ddp.train_test_variations import main + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + main() + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): From 3e128968e31ef558f7cb4bf40a5da5993ac52181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 16:07:59 +0200 Subject: [PATCH 09/32] test --- tests/models/test_gpu.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 84b67ebf17209..6c727002e4049 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -108,9 +108,10 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - # command = [sys.executable, file, '--variation', variation] + cli_args - # exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - # raise SystemExit(exitcode) + command = [sys.executable, file, '--variation', variation] + cli_args + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + assert p.returncode == 0 # std, err = p.communicate(timeout=60) # std = std.decode('utf-8').strip() # err = err.decode('utf-8').strip() @@ -121,10 +122,10 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): # print(command) # pytest.fail(err) - cli_args += ['--variation', variation] - from tests.models.data.ddp.train_test_variations import main - with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - main() + # cli_args += ['--variation', variation] + # from tests.models.data.ddp.train_test_variations import main + # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + # main() @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") From dea0278bc4ece67d92005b68cf3d7af98ba7dbcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 7 Aug 2020 16:54:10 +0200 Subject: [PATCH 10/32] display the error --- tests/models/test_gpu.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 6c727002e4049..912e32659560a 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -111,16 +111,16 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): command = [sys.executable, file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.communicate() - assert p.returncode == 0 - # std, err = p.communicate(timeout=60) - # std = std.decode('utf-8').strip() - # err = err.decode('utf-8').strip() + # assert p.returncode == 0 + std, err = p.communicate(timeout=60) + std = std.decode('utf-8').strip() + err = err.decode('utf-8').strip() # assert std and not err - # if p.returncode: - # print(std) - # print(err) - # print(command) - # pytest.fail(err) + if p.returncode: + print(std) + print(err) + print(command) + pytest.fail(err) # cli_args += ['--variation', variation] # from tests.models.data.ddp.train_test_variations import main From be6ba0610a53b89b560b388bfbeeaf85e64181ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sat, 8 Aug 2020 05:38:14 +0200 Subject: [PATCH 11/32] list all variations --- tests/models/data/ddp/train_test_variations.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py index 2949750951099..1ac2e110dd599 100644 --- a/tests/models/data/ddp/train_test_variations.py +++ b/tests/models/data/ddp/train_test_variations.py @@ -17,6 +17,11 @@ def variation_test_fit(trainer, model): trainer.fit(model) +def variation_fit_fit(trainer, model): + trainer.fit(model) + trainer.fit(model) + + def variation_test_test(trainer, model): trainer.test(model) trainer.test(model) @@ -29,7 +34,13 @@ def variation_test_fit_test(trainer, model): def get_variations(): - variations = [v for v in globals() if v.startswith("variation")] + variations = [ + "variation_fit_test", + "variation_test_fit", + "variation_fit_fit", + "variation_test_test", + "variation_test_fit_test", + ] return variations From c4408674e93387de9470cc53abd94c63f75b973c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 05:12:50 +0200 Subject: [PATCH 12/32] try string --- tests/models/test_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 912e32659560a..b2c117f505638 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -108,7 +108,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - command = [sys.executable, file, '--variation', variation] + cli_args + # command = [sys.executable, file, '--variation', variation] + cli_args + command = ['python', file, '--variation', variation] + cli_args p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.communicate() # assert p.returncode == 0 From 31cae82c4a91755d0a25548b867a50ed50f23027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 05:48:37 +0200 Subject: [PATCH 13/32] try copy env --- tests/models/test_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index b2c117f505638..fff38d4343fe9 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -1,3 +1,4 @@ +import os import subprocess import sys from collections import namedtuple @@ -110,7 +111,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args += ['--default_root_dir', str(tmpdir)] # command = [sys.executable, file, '--variation', variation] + cli_args command = ['python', file, '--variation', variation] + cli_args - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) p.communicate() # assert p.returncode == 0 std, err = p.communicate(timeout=60) From e75a35ccb70b176e21dcda845caf630d1ffb9df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 06:27:04 +0200 Subject: [PATCH 14/32] debug --- tests/models/test_gpu.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index fff38d4343fe9..d897c3a1632ae 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -111,6 +111,14 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args += ['--default_root_dir', str(tmpdir)] # command = [sys.executable, file, '--variation', variation] + cli_args command = ['python', file, '--variation', variation] + cli_args + + # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL + p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + std, err = p.communicate() + std = std.decode('utf-8') + print(std) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) p.communicate() # assert p.returncode == 0 From 6b45e7df2e00de92ae1ab61a0a2817281961f8c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 06:44:08 +0200 Subject: [PATCH 15/32] pythonpath --- tests/models/test_gpu.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index d897c3a1632ae..751ac9127baf4 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -12,6 +12,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils +import pytorch_lightning from pytorch_lightning import Trainer from pytorch_lightning.core import memory from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device @@ -113,13 +114,17 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): command = ['python', file, '--variation', variation] + cli_args # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL + p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) p.communicate() std, err = p.communicate() std = std.decode('utf-8') print(std) - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy()) + env = os.environ.copy() + env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env['PYTHONPATH'] + + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() # assert p.returncode == 0 std, err = p.communicate(timeout=60) From bf5826a33db3774ee7458270a0335fc8ee2d9d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 06:55:42 +0200 Subject: [PATCH 16/32] path --- tests/models/test_gpu.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 751ac9127baf4..eeefc0eb80899 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -110,8 +110,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - # command = [sys.executable, file, '--variation', variation] + cli_args - command = ['python', file, '--variation', variation] + cli_args + command = [sys.executable, str(file), '--variation', variation] + cli_args # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL @@ -122,8 +121,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): print(std) env = os.environ.copy() - env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env['PYTHONPATH'] - + env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') + print('python path', env['PYTHONPATH']) p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() # assert p.returncode == 0 From 379009994083b6f1e09fb6005b23bd55a55af59a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 07:07:52 +0200 Subject: [PATCH 17/32] update test --- tests/models/test_gpu.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index eeefc0eb80899..a1b22709dbdee 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -111,36 +111,20 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] command = [sys.executable, str(file), '--variation', variation] + cli_args - - # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL - - p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - p.communicate() - std, err = p.communicate() - std = std.decode('utf-8') - print(std) - env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') - print('python path', env['PYTHONPATH']) p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() - # assert p.returncode == 0 std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() - # assert std and not err + assert std if p.returncode: print(std) print(err) print(command) pytest.fail(err) - # cli_args += ['--variation', variation] - # from tests.models.data.ddp.train_test_variations import main - # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): - # main() - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp_spawn(tmpdir): From bb40a2eebebf063e07db1878b339ab1e033a20bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 9 Aug 2020 07:17:20 +0200 Subject: [PATCH 18/32] change --- tests/models/test_gpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index a1b22709dbdee..db26edece1fbf 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -115,6 +115,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() + std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() From 1cd63aa83ccd448b55358f2faab62d41d879ecb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 06:11:01 +0200 Subject: [PATCH 19/32] simple ddp test --- .../models/data/ddp/train_test_variations.py | 33 ++++--------------- tests/models/test_gpu.py | 7 ++-- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py index 1ac2e110dd599..f37bd27e8a005 100644 --- a/tests/models/data/ddp/train_test_variations.py +++ b/tests/models/data/ddp/train_test_variations.py @@ -1,5 +1,5 @@ """ -Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus. +Runs either `.fit()` or `.test()` on a single node across multiple gpus. """ from argparse import ArgumentParser @@ -7,39 +7,18 @@ from tests.base import EvalModelTemplate -def variation_fit_test(trainer, model): +def variation_fit(trainer, model): trainer.fit(model) - trainer.test(model) - - -def variation_test_fit(trainer, model): - trainer.test(model) - trainer.fit(model) - -def variation_fit_fit(trainer, model): - trainer.fit(model) - trainer.fit(model) - -def variation_test_test(trainer, model): - trainer.test(model) - trainer.test(model) - - -def variation_test_fit_test(trainer, model): - trainer.test(model) - trainer.fit(model) +def variation_test(trainer, model): trainer.test(model) def get_variations(): variations = [ - "variation_fit_test", - "variation_test_fit", - "variation_fit_fit", - "variation_test_test", - "variation_test_fit_test", + "variation_fit", + "variation_test", ] return variations @@ -48,7 +27,7 @@ def main(): seed_everything(1234) parser = ArgumentParser(add_help=False) parser = Trainer.add_argparse_args(parser) - parser.add_argument('--variation', default=variation_fit_test.__name__) + parser.add_argument('--variation', default=variation_fit.__name__) parser.set_defaults(gpus=2) parser.set_defaults(distributed_backend="ddp") args = parser.parse_args() diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index db26edece1fbf..245ba798c3c55 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -110,7 +110,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] - command = [sys.executable, str(file), '--variation', variation] + cli_args + cli_args += ['--variation', variation] + command = [sys.executable, str(file)] + cli_args env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) @@ -119,8 +120,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() - assert std - if p.returncode: + # assert std, f"{variation} produced no output" + if p.returncode > 0: print(std) print(err) print(command) From febf1147c55eb8849d560e3d1c5277750257e33f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 06:32:17 +0200 Subject: [PATCH 20/32] replace --- .../trainer/distrib_data_parallel.py | 36 +++++++++---------- pytorch_lightning/utilities/distributed.py | 10 ++++++ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 5555c26172b11..9566525cfd250 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -163,9 +163,9 @@ def train_fx(trial_hparams, cluster_manager, _): else: XLA_AVAILABLE = True -PID = os.getpid() -RNG1 = np.random.RandomState(PID) -RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) +# PID = os.getpid() +# RNG1 = np.random.RandomState(PID) +# RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) class TrainerDDPMixin(ABC): @@ -389,21 +389,21 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') - def set_random_port(self, force=False): - """ - When running DDP NOT managed by SLURM, the ports might collide - """ - # pick a random port first - assert self.num_nodes == 1, 'random port can only be called from single node training' - global RANDOM_PORTS - default_port = RANDOM_PORTS[-1] - RANDOM_PORTS = RANDOM_PORTS[:-1] - - # when not forced, use the user port - if not force: - default_port = os.environ.get('MASTER_PORT', default_port) - - os.environ['MASTER_PORT'] = str(default_port) + # def set_random_port(self, force=False): + # """ + # When running DDP NOT managed by SLURM, the ports might collide + # """ + # # pick a random port first + # assert self.num_nodes == 1, 'random port can only be called from single node training' + # global RANDOM_PORTS + # default_port = RANDOM_PORTS[-1] + # RANDOM_PORTS = RANDOM_PORTS[:-1] + # + # # when not forced, use the user port + # if not force: + # default_port = os.environ.get('MASTER_PORT', default_port) + # + # os.environ['MASTER_PORT'] = str(default_port) def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']: diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index cd0621496fe42..a44a6c0ec0749 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -34,3 +34,13 @@ def _debug(*args, **kwargs): rank_zero_debug = rank_zero_only(_debug) rank_zero_info = rank_zero_only(_info) rank_zero_warn = rank_zero_only(_warn) + + +def find_free_network_port(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port From 78c05945e8c818b255ee059fd76a18959633d1a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 06:49:37 +0200 Subject: [PATCH 21/32] remove random port --- pytorch_lightning/accelerators/ddp_backend.py | 16 ++++++---------- .../accelerators/ddp_spawn_backend.py | 5 +++-- pytorch_lightning/core/lightning.py | 4 ---- pytorch_lightning/trainer/trainer.py | 2 -- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index ca4ade2d11f6b..d0ea497548a3e 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -24,7 +24,7 @@ from pytorch_lightning import _logger as log from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, find_free_network_port try: from hydra.utils import to_absolute_path, get_original_cwd @@ -56,19 +56,15 @@ def train(self, model): self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model) def spawn_ddp_children(self, model): - port = os.environ['MASTER_PORT'] + assert self.trainer.global_rank == 0 - master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR'] - os.environ['MASTER_PORT'] = f'{port}' - os.environ['MASTER_ADDR'] = f'{master_address}' + os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1') + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port()) # allow the user to pass the node rank node_rank = '0' - if 'NODE_RANK' in os.environ: - node_rank = os.environ['NODE_RANK'] - if 'GROUP_RANK' in os.environ: - node_rank = os.environ['GROUP_RANK'] - + node_rank = os.environ.get('NODE_RANK', node_rank) + node_rank = os.environ.get('GROUP_RANK', node_rank) os.environ['NODE_RANK'] = node_rank os.environ['LOCAL_RANK'] = '0' diff --git a/pytorch_lightning/accelerators/ddp_spawn_backend.py b/pytorch_lightning/accelerators/ddp_spawn_backend.py index 6fce4cb7a530e..b9d203f74583f 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerators/ddp_spawn_backend.py @@ -11,13 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License +import os import torch import torch.multiprocessing as mp from pytorch_lightning import _logger as log from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, find_free_network_port try: from apex import amp @@ -32,7 +33,7 @@ def __init__(self, trainer): self.mp_queue = None def setup(self): - self.trainer.set_random_port() + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port()) # pass in a state q smp = mp.get_context('spawn') diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 66d067a5146b6..ba00e702f26c8 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -893,10 +893,6 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) - """ - configure_sync_batchnorm - ^^^^^^^^^^^^^^^^^^^^^^^^ - """ def configure_sync_batchnorm(self, model: 'LightningModule') -> 'LightningModule': """ Add global batchnorm for a model spread across multiple GPUs and nodes. diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8c2eb4bfe4c32..17021f3f1225a 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1377,7 +1377,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders): # run tests self.tested_ckpt_path = ckpt_path - self.set_random_port(force=True) self.testing = True os.environ['PL_TESTING_MODE'] = '1' self.model = model @@ -1400,7 +1399,6 @@ def __test_given_model(self, model, test_dataloaders): # run test # sets up testing so we short circuit to eval - self.set_random_port(force=True) self.testing = True self.model = model results = self.fit(model) From d2efd396de96dbd83ca2166502995e86c3ab54b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 06:51:15 +0200 Subject: [PATCH 22/32] random port --- pytorch_lightning/trainer/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 17021f3f1225a..98d93ad6357fa 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1039,7 +1039,6 @@ def fit( # ddp elif self.distributed_backend == 'ddp': - self.set_random_port() self.accelerator_backend = DDPBackend(self) results = self.accelerator_backend.spawn_ddp_children(model) From b519feeebfd2d07b866e6fd61a9095490aacd433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 06:52:13 +0200 Subject: [PATCH 23/32] str --- pytorch_lightning/accelerators/ddp_backend.py | 2 +- pytorch_lightning/accelerators/ddp_spawn_backend.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index d0ea497548a3e..24365c673e931 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -59,7 +59,7 @@ def spawn_ddp_children(self, model): assert self.trainer.global_rank == 0 os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1') - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port()) + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) # allow the user to pass the node rank node_rank = '0' diff --git a/pytorch_lightning/accelerators/ddp_spawn_backend.py b/pytorch_lightning/accelerators/ddp_spawn_backend.py index b9d203f74583f..ee4954057964d 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerators/ddp_spawn_backend.py @@ -33,7 +33,7 @@ def __init__(self, trainer): self.mp_queue = None def setup(self): - os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port()) + os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) # pass in a state q smp = mp.get_context('spawn') From e46862af7cd350574945883e04d0972902e3f7fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 06:56:32 +0200 Subject: [PATCH 24/32] clean up --- pytorch_lightning/core/saving.py | 1 + tests/base/model_valid_epoch_ends.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 16a9467c71554..1835251b2de54 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -353,6 +353,7 @@ def save_hparams_to_yaml(config_yaml, hparams: Union[dict, Namespace]) -> None: with open(config_yaml, 'w', newline='') as fp: yaml.dump(hparams, fp) + def convert(val: str) -> Union[int, float, bool, str]: try: return ast.literal_eval(val) diff --git a/tests/base/model_valid_epoch_ends.py b/tests/base/model_valid_epoch_ends.py index 8974224409624..943227e738cae 100644 --- a/tests/base/model_valid_epoch_ends.py +++ b/tests/base/model_valid_epoch_ends.py @@ -21,7 +21,6 @@ def _mean(res, key): # recursive mean for multilevel dicts return torch.stack([x[key] if isinstance(x, dict) else _mean(x, key) for x in res]).mean() - print('in validation epoch end') val_loss_mean = _mean(outputs, 'val_loss') val_acc_mean = _mean(outputs, 'val_acc') From fed39041a1104fdbf484a3b01326daf14a8091b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 07:08:11 +0200 Subject: [PATCH 25/32] check run spawn --- pytorch_lightning/accelerators/ddp_backend.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py index 24365c673e931..6866a66543e4a 100644 --- a/pytorch_lightning/accelerators/ddp_backend.py +++ b/pytorch_lightning/accelerators/ddp_backend.py @@ -45,6 +45,7 @@ class DDPBackend(object): def __init__(self, trainer): self.trainer = trainer self.task_idx = None + self._has_spawned_children = False def slurm_setup(self): self.task_idx = int(os.environ['SLURM_LOCALID']) @@ -57,6 +58,8 @@ def train(self, model): def spawn_ddp_children(self, model): assert self.trainer.global_rank == 0 + self._check_can_spawn_children() + self._has_spawned_children = True os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1') os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port())) @@ -231,3 +234,10 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results + + def _check_can_spawn_children(self): + if self._has_spawned_children: + raise RuntimeError( + "You tried to run `.fit` or `.test` multiple times in the same script." + " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead." + ) From a5ae50809a2d4ccc30f57008316682e9efecd094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 07:19:19 +0200 Subject: [PATCH 26/32] clean up --- .../trainer/distrib_data_parallel.py | 21 ------------------- tests/models/test_gpu.py | 13 ++++++------ 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 9566525cfd250..438e4be359441 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -132,7 +132,6 @@ def train_fx(trial_hparams, cluster_manager, _): from abc import ABC, abstractmethod from typing import Union, List, Optional, Tuple -import numpy as np import torch from pytorch_lightning import _logger as log @@ -163,10 +162,6 @@ def train_fx(trial_hparams, cluster_manager, _): else: XLA_AVAILABLE = True -# PID = os.getpid() -# RNG1 = np.random.RandomState(PID) -# RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) - class TrainerDDPMixin(ABC): @@ -389,22 +384,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids): # don't make this debug... this is good UX rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]') - # def set_random_port(self, force=False): - # """ - # When running DDP NOT managed by SLURM, the ports might collide - # """ - # # pick a random port first - # assert self.num_nodes == 1, 'random port can only be called from single node training' - # global RANDOM_PORTS - # default_port = RANDOM_PORTS[-1] - # RANDOM_PORTS = RANDOM_PORTS[:-1] - # - # # when not forced, use the user port - # if not force: - # default_port = os.environ.get('MASTER_PORT', default_port) - # - # os.environ['MASTER_PORT'] = str(default_port) - def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']: return diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 245ba798c3c55..929b887a29105 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -2,17 +2,16 @@ import subprocess import sys from collections import namedtuple -from unittest.mock import patch from pathlib import Path -from unittest import mock +from unittest.mock import patch import pytest import torch from torchtext.data import Batch, Dataset, Example, Field, LabelField +import pytorch_lightning import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils -import pytorch_lightning from pytorch_lightning import Trainer from pytorch_lightning.core import memory from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device @@ -107,24 +106,24 @@ def test_multi_gpu_model_dp(tmpdir): @pytest.mark.parametrize('variation', train_test_variations.get_variations()) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): + """ Runs a basic training and test run with distributed_backend=ddp. """ file = Path(train_test_variations.__file__).absolute() cli_args = cli_args.split(' ') if cli_args else [] cli_args += ['--default_root_dir', str(tmpdir)] cli_args += ['--variation', variation] command = [sys.executable, str(file)] + cli_args + # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') + # for running in ddp mode, we need to lauch it's own process or pytest will get stuck p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() err = err.decode('utf-8').strip() - # assert std, f"{variation} produced no output" + assert std, f"{variation} produced no output" if p.returncode > 0: - print(std) - print(err) - print(command) pytest.fail(err) From fed97a9a4ad13f1113cdeb3bf85d298aec351a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 07:57:02 +0200 Subject: [PATCH 27/32] docs --- docs/source/multi_gpu.rst | 11 ++++++++--- pytorch_lightning/utilities/distributed.py | 7 ++++++- tests/models/test_gpu.py | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 706a977290f35..020044bf1b7a0 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -286,8 +286,6 @@ variables: MASTER_ADDR=localhost MASTER_PORT=random() WORLD_SIZE=3 NODE_RANK=1 LOCAL_RANK=0 python my_file.py --gpus 3 --etc MASTER_ADDR=localhost MASTER_PORT=random() WORLD_SIZE=3 NODE_RANK=2 LOCAL_RANK=0 python my_file.py --gpus 3 --etc -If your code does not support this (ie: jupyter notebook, colab, or a nested script without a root package), -use `dp` or `ddp_spawn`. We use DDP this way because `ddp_spawn` has a few limitations (due to Python and PyTorch): 1. Since `.spawn()` trains the model in subprocesses, the model on the main process does not get updated. @@ -296,7 +294,14 @@ We use DDP this way because `ddp_spawn` has a few limitations (due to Python and 3. Forces everything to be picklable. -However, if you don't mind these limitations, you can use `ddp_spawn`. +There are cases in which it is not possible to use DDP. Examples are: + +- Jupyter Notebook +- Google COLAB, Kaggle, etc. +- You have a nested script without a root package +- Your script needs to invoke `.fit` or `.test` multiple times + +In these situations you should use `dp` or `ddp_spawn` instead. Distributed Data Parallel 2 ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index a44a6c0ec0749..5375bfd0fa6ce 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -36,7 +36,12 @@ def _debug(*args, **kwargs): rank_zero_warn = rank_zero_only(_warn) -def find_free_network_port(): +def find_free_network_port() -> int: + """ + Finds a free port on localhost. + It is useful in single-node training when we don't want to connect to a real master node but + have to set the `MASTER_PORT` environment variable. + """ import socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(("", 0)) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index 929b887a29105..e60bf70644aa7 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -112,9 +112,11 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): cli_args += ['--default_root_dir', str(tmpdir)] cli_args += ['--variation', variation] command = [sys.executable, str(file)] + cli_args + # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment env = os.environ.copy() env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '') + # for running in ddp mode, we need to lauch it's own process or pytest will get stuck p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) p.communicate() From 1b212d3384e32b9a5a0d600457c74e3a819e79f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 08:00:26 +0200 Subject: [PATCH 28/32] docs --- docs/source/multi_gpu.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 020044bf1b7a0..c93d5634fa1c2 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -289,9 +289,7 @@ variables: We use DDP this way because `ddp_spawn` has a few limitations (due to Python and PyTorch): 1. Since `.spawn()` trains the model in subprocesses, the model on the main process does not get updated. - 2. Dataloader(num_workers=N), where N is large, bottlenecks training with DDP... ie: it will be VERY slow or won't work at all. This is a PyTorch limitation. - 3. Forces everything to be picklable. There are cases in which it is not possible to use DDP. Examples are: From 9652305615be166bf236443984f35f299572f411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 08:01:44 +0200 Subject: [PATCH 29/32] update test --- tests/models/test_gpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index e60bf70644aa7..b6a2efbb8621b 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -119,7 +119,6 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation): # for running in ddp mode, we need to lauch it's own process or pytest will get stuck p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - p.communicate() std, err = p.communicate(timeout=60) std = std.decode('utf-8').strip() From d1ed49a9442b2297d9139fa4631dd47dfc7ca54c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 08:02:44 +0200 Subject: [PATCH 30/32] docs --- docs/source/multi_gpu.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index c93d5634fa1c2..57b3e0813f54f 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -294,8 +294,7 @@ We use DDP this way because `ddp_spawn` has a few limitations (due to Python and There are cases in which it is not possible to use DDP. Examples are: -- Jupyter Notebook -- Google COLAB, Kaggle, etc. +- Jupyter Notebook, Google COLAB, Kaggle, etc. - You have a nested script without a root package - Your script needs to invoke `.fit` or `.test` multiple times From 0fbc3751502d388baaf90e7b9d181674f1c8eabb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 08:05:34 +0200 Subject: [PATCH 31/32] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e639a5c53f6f..1a79aa8026fc9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -138,6 +138,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed checkpointing to remote file paths ([#2925](https://github.com/PyTorchLightning/pytorch-lightning/pull/2925)) +- Fixed an issue when running `Trainer.test()` in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997)) + ## [0.8.5] - 2020-07-09 ### Added From 92a1ea37a76a40efbf3d84e7b86c324ada3d5083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 16 Aug 2020 08:06:43 +0200 Subject: [PATCH 32/32] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a79aa8026fc9..c5879503a2303 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -138,7 +138,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed checkpointing to remote file paths ([#2925](https://github.com/PyTorchLightning/pytorch-lightning/pull/2925)) -- Fixed an issue when running `Trainer.test()` in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997)) +- Fixed an issue that caused `Trainer.test()` to stall in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997)) ## [0.8.5] - 2020-07-09