From 52745c9aeea687324bac5f826132bcf082456566 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Thu, 6 Aug 2020 23:32:14 +0200
Subject: [PATCH 01/32] add ddp script variations

---
 tests/models/data/ddp/train_default_model.py | 54 ++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 tests/models/data/ddp/train_default_model.py

diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_default_model.py
new file mode 100644
index 0000000000000..2949750951099
--- /dev/null
+++ b/tests/models/data/ddp/train_default_model.py
@@ -0,0 +1,54 @@
+"""
+Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus.
+"""
+from argparse import ArgumentParser
+
+from pytorch_lightning import Trainer, seed_everything
+from tests.base import EvalModelTemplate
+
+
+def variation_fit_test(trainer, model):
+    trainer.fit(model)
+    trainer.test(model)
+
+
+def variation_test_fit(trainer, model):
+    trainer.test(model)
+    trainer.fit(model)
+
+
+def variation_test_test(trainer, model):
+    trainer.test(model)
+    trainer.test(model)
+
+
+def variation_test_fit_test(trainer, model):
+    trainer.test(model)
+    trainer.fit(model)
+    trainer.test(model)
+
+
+def get_variations():
+    variations = [v for v in globals() if v.startswith("variation")]
+    return variations
+
+
+def main():
+    seed_everything(1234)
+    parser = ArgumentParser(add_help=False)
+    parser = Trainer.add_argparse_args(parser)
+    parser.add_argument('--variation', default=variation_fit_test.__name__)
+    parser.set_defaults(gpus=2)
+    parser.set_defaults(distributed_backend="ddp")
+    args = parser.parse_args()
+
+    model = EvalModelTemplate()
+    trainer = Trainer.from_argparse_args(args)
+
+    # run the chosen variation
+    run_variation = globals()[args.variation]
+    run_variation(trainer, model)
+
+
+if __name__ == '__main__':
+    main()

From 891bb93ffb1429074edda8389cdf50929eeff2ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Thu, 6 Aug 2020 23:34:01 +0200
Subject: [PATCH 02/32] add ddp test

---
 tests/models/test_gpu.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 509de4c07563a..0e308598b3691 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -1,5 +1,8 @@
+import subprocess
+import sys
 from collections import namedtuple
 from unittest.mock import patch
+from pathlib import Path
 
 import pytest
 import torch
@@ -12,6 +15,7 @@
 from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import EvalModelTemplate
+from tests.models.data.ddp import train_default_model
 
 PRETEND_N_OF_GPUS = 16
 
@@ -94,6 +98,26 @@ def test_multi_gpu_model_dp(tmpdir):
     memory.get_memory_profile('min_max')
 
 
+@pytest.mark.parametrize('cli_args', [
+    pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
+])
+@pytest.mark.parametrize('variation', train_default_model.get_variations())
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
+    file = Path(train_default_model.__file__).absolute()
+    cli_args = cli_args.split(' ') if cli_args else []
+    cli_args += ['--default_root_dir', str(tmpdir)]
+    command = [sys.executable, file, '--variation', variation] + cli_args
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    std, err = p.communicate(timeout=60)
+    #assert std and not err
+    if p.returncode:
+        print(std)
+        print(err)
+        print(command)
+        raise RuntimeError('error')
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_model_ddp_spawn(tmpdir):
     tutils.set_random_master_port()

From 030ac29c4ff980b76522e3f4865c21be91fb49f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 02:22:07 +0200
Subject: [PATCH 03/32] rename

---
 .../{train_default_model.py => train_test_variations.py}  | 0
 tests/models/test_gpu.py                                  | 8 ++++----
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename tests/models/data/ddp/{train_default_model.py => train_test_variations.py} (100%)

diff --git a/tests/models/data/ddp/train_default_model.py b/tests/models/data/ddp/train_test_variations.py
similarity index 100%
rename from tests/models/data/ddp/train_default_model.py
rename to tests/models/data/ddp/train_test_variations.py
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 0e308598b3691..2a4d335fa5918 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -15,7 +15,7 @@
 from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.base import EvalModelTemplate
-from tests.models.data.ddp import train_default_model
+from tests.models.data.ddp import train_test_variations
 
 PRETEND_N_OF_GPUS = 16
 
@@ -101,14 +101,14 @@ def test_multi_gpu_model_dp(tmpdir):
 @pytest.mark.parametrize('cli_args', [
     pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
 ])
-@pytest.mark.parametrize('variation', train_default_model.get_variations())
+@pytest.mark.parametrize('variation', train_test_variations.get_variations())
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
-    file = Path(train_default_model.__file__).absolute()
+    file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
     command = [sys.executable, file, '--variation', variation] + cli_args
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
     std, err = p.communicate(timeout=60)
     #assert std and not err
     if p.returncode:

From 5d7b95b35cf0e9e7be9f6acf80629559cfe3538d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 02:38:53 +0200
Subject: [PATCH 04/32] shell

---
 tests/models/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 2a4d335fa5918..35611a37b6690 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -108,7 +108,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
     command = [sys.executable, file, '--variation', variation] + cli_args
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     std, err = p.communicate(timeout=60)
     #assert std and not err
     if p.returncode:

From fa4474987a392d0df0ee2096022cff7f69e8c070 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 02:43:22 +0200
Subject: [PATCH 05/32] test

---
 tests/models/test_gpu.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 35611a37b6690..7f70e7f37e185 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -110,12 +110,14 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     command = [sys.executable, file, '--variation', variation] + cli_args
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     std, err = p.communicate(timeout=60)
-    #assert std and not err
+    std = std.decode('utf-8').strip()
+    err = err.decode('utf-8').strip()
+    assert std and not err
     if p.returncode:
         print(std)
         print(err)
         print(command)
-        raise RuntimeError('error')
+        pytest.fail(err)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")

From f18735232c90e6e770332e06cbc7a64cbbedb4fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 02:55:54 +0200
Subject: [PATCH 06/32] test

---
 tests/models/test_gpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 7f70e7f37e185..65b5d1f55e275 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -112,7 +112,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     std, err = p.communicate(timeout=60)
     std = std.decode('utf-8').strip()
     err = err.decode('utf-8').strip()
-    assert std and not err
+    # assert std and not err
     if p.returncode:
         print(std)
         print(err)

From ca6558b0bedc299d335d4d4845da5c40b7875c0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 03:39:02 +0200
Subject: [PATCH 07/32] try call

---
 tests/models/test_gpu.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 65b5d1f55e275..6d05fefdd418b 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -108,16 +108,17 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
     command = [sys.executable, file, '--variation', variation] + cli_args
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    std, err = p.communicate(timeout=60)
-    std = std.decode('utf-8').strip()
-    err = err.decode('utf-8').strip()
+    exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    raise SystemExit(exitcode)
+    # std, err = p.communicate(timeout=60)
+    # std = std.decode('utf-8').strip()
+    # err = err.decode('utf-8').strip()
     # assert std and not err
-    if p.returncode:
-        print(std)
-        print(err)
-        print(command)
-        pytest.fail(err)
+    # if p.returncode:
+    #     print(std)
+    #     print(err)
+    #     print(command)
+    #     pytest.fail(err)
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")

From a89bfa95234fd917ccc1ca35f8b0e8b52a6d269f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 15:59:04 +0200
Subject: [PATCH 08/32] try without subprocess

---
 tests/models/test_gpu.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 6d05fefdd418b..84b67ebf17209 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -3,6 +3,7 @@
 from collections import namedtuple
 from unittest.mock import patch
 from pathlib import Path
+from unittest import mock
 
 import pytest
 import torch
@@ -107,9 +108,9 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
-    command = [sys.executable, file, '--variation', variation] + cli_args
-    exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    raise SystemExit(exitcode)
+    # command = [sys.executable, file, '--variation', variation] + cli_args
+    # exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    # raise SystemExit(exitcode)
     # std, err = p.communicate(timeout=60)
     # std = std.decode('utf-8').strip()
     # err = err.decode('utf-8').strip()
@@ -120,6 +121,11 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     #     print(command)
     #     pytest.fail(err)
 
+    cli_args += ['--variation', variation]
+    from tests.models.data.ddp.train_test_variations import main
+    with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
+        main()
+
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_model_ddp_spawn(tmpdir):

From 3e128968e31ef558f7cb4bf40a5da5993ac52181 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 16:07:59 +0200
Subject: [PATCH 09/32] test

---
 tests/models/test_gpu.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 84b67ebf17209..6c727002e4049 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -108,9 +108,10 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
-    # command = [sys.executable, file, '--variation', variation] + cli_args
-    # exitcode = subprocess.call(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    # raise SystemExit(exitcode)
+    command = [sys.executable, file, '--variation', variation] + cli_args
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    p.communicate()
+    assert p.returncode == 0
     # std, err = p.communicate(timeout=60)
     # std = std.decode('utf-8').strip()
     # err = err.decode('utf-8').strip()
@@ -121,10 +122,10 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     #     print(command)
     #     pytest.fail(err)
 
-    cli_args += ['--variation', variation]
-    from tests.models.data.ddp.train_test_variations import main
-    with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-        main()
+    # cli_args += ['--variation', variation]
+    # from tests.models.data.ddp.train_test_variations import main
+    # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
+    #     main()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")

From dea0278bc4ece67d92005b68cf3d7af98ba7dbcc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Fri, 7 Aug 2020 16:54:10 +0200
Subject: [PATCH 10/32] display the error

---
 tests/models/test_gpu.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 6c727002e4049..912e32659560a 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -111,16 +111,16 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     command = [sys.executable, file, '--variation', variation] + cli_args
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     p.communicate()
-    assert p.returncode == 0
-    # std, err = p.communicate(timeout=60)
-    # std = std.decode('utf-8').strip()
-    # err = err.decode('utf-8').strip()
+    # assert p.returncode == 0
+    std, err = p.communicate(timeout=60)
+    std = std.decode('utf-8').strip()
+    err = err.decode('utf-8').strip()
     # assert std and not err
-    # if p.returncode:
-    #     print(std)
-    #     print(err)
-    #     print(command)
-    #     pytest.fail(err)
+    if p.returncode:
+        print(std)
+        print(err)
+        print(command)
+        pytest.fail(err)
 
     # cli_args += ['--variation', variation]
     # from tests.models.data.ddp.train_test_variations import main

From be6ba0610a53b89b560b388bfbeeaf85e64181ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 8 Aug 2020 05:38:14 +0200
Subject: [PATCH 11/32] list all variations

---
 tests/models/data/ddp/train_test_variations.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py
index 2949750951099..1ac2e110dd599 100644
--- a/tests/models/data/ddp/train_test_variations.py
+++ b/tests/models/data/ddp/train_test_variations.py
@@ -17,6 +17,11 @@ def variation_test_fit(trainer, model):
     trainer.fit(model)
 
 
+def variation_fit_fit(trainer, model):
+    trainer.fit(model)
+    trainer.fit(model)
+
+
 def variation_test_test(trainer, model):
     trainer.test(model)
     trainer.test(model)
@@ -29,7 +34,13 @@ def variation_test_fit_test(trainer, model):
 
 
 def get_variations():
-    variations = [v for v in globals() if v.startswith("variation")]
+    variations = [
+        "variation_fit_test",
+        "variation_test_fit",
+        "variation_fit_fit",
+        "variation_test_test",
+        "variation_test_fit_test",
+    ]
     return variations
 
 

From c4408674e93387de9470cc53abd94c63f75b973c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 9 Aug 2020 05:12:50 +0200
Subject: [PATCH 12/32] try string

---
 tests/models/test_gpu.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 912e32659560a..b2c117f505638 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -108,7 +108,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
-    command = [sys.executable, file, '--variation', variation] + cli_args
+    # command = [sys.executable, file, '--variation', variation] + cli_args
+    command = ['python', file, '--variation', variation] + cli_args
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     p.communicate()
     # assert p.returncode == 0

From 31cae82c4a91755d0a25548b867a50ed50f23027 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 9 Aug 2020 05:48:37 +0200
Subject: [PATCH 13/32] try copy env

---
 tests/models/test_gpu.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index b2c117f505638..fff38d4343fe9 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -1,3 +1,4 @@
+import os
 import subprocess
 import sys
 from collections import namedtuple
@@ -110,7 +111,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     cli_args += ['--default_root_dir', str(tmpdir)]
     # command = [sys.executable, file, '--variation', variation] + cli_args
     command = ['python', file, '--variation', variation] + cli_args
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
     p.communicate()
     # assert p.returncode == 0
     std, err = p.communicate(timeout=60)

From e75a35ccb70b176e21dcda845caf630d1ffb9df9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 9 Aug 2020 06:27:04 +0200
Subject: [PATCH 14/32] debug

---
 tests/models/test_gpu.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index fff38d4343fe9..d897c3a1632ae 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -111,6 +111,14 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     cli_args += ['--default_root_dir', str(tmpdir)]
     # command = [sys.executable, file, '--variation', variation] + cli_args
     command = ['python', file, '--variation', variation] + cli_args
+
+    # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL
+    p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    p.communicate()
+    std, err = p.communicate()
+    std = std.decode('utf-8')
+    print(std)
+
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
     p.communicate()
     # assert p.returncode == 0

From 6b45e7df2e00de92ae1ab61a0a2817281961f8c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 9 Aug 2020 06:44:08 +0200
Subject: [PATCH 15/32] pythonpath

---
 tests/models/test_gpu.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index d897c3a1632ae..751ac9127baf4 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -12,6 +12,7 @@
 
 import tests.base.develop_pipelines as tpipes
 import tests.base.develop_utils as tutils
+import pytorch_lightning
 from pytorch_lightning import Trainer
 from pytorch_lightning.core import memory
 from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device
@@ -113,13 +114,17 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     command = ['python', file, '--variation', variation] + cli_args
 
     # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL
+
     p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     p.communicate()
     std, err = p.communicate()
     std = std.decode('utf-8')
     print(std)
 
-    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=os.environ.copy())
+    env = os.environ.copy()
+    env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env['PYTHONPATH']
+
+    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
     p.communicate()
     # assert p.returncode == 0
     std, err = p.communicate(timeout=60)

From bf5826a33db3774ee7458270a0335fc8ee2d9d2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 9 Aug 2020 06:55:42 +0200
Subject: [PATCH 16/32] path

---
 tests/models/test_gpu.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 751ac9127baf4..eeefc0eb80899 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -110,8 +110,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
-    # command = [sys.executable, file, '--variation', variation] + cli_args
-    command = ['python', file, '--variation', variation] + cli_args
+    command = [sys.executable, str(file), '--variation', variation] + cli_args
 
     # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL
 
@@ -122,8 +121,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     print(std)
 
     env = os.environ.copy()
-    env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env['PYTHONPATH']
-
+    env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')
+    print('python path', env['PYTHONPATH'])
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
     p.communicate()
     # assert p.returncode == 0

From 379009994083b6f1e09fb6005b23bd55a55af59a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 9 Aug 2020 07:07:52 +0200
Subject: [PATCH 17/32] update test

---
 tests/models/test_gpu.py | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index eeefc0eb80899..a1b22709dbdee 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -111,36 +111,20 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
     command = [sys.executable, str(file), '--variation', variation] + cli_args
-
-    # debugging WHY SUBPROCESS PYTHON CANNOT IMPORT PL
-
-    p = subprocess.Popen(['pip', 'freeze'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    p.communicate()
-    std, err = p.communicate()
-    std = std.decode('utf-8')
-    print(std)
-
     env = os.environ.copy()
     env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')
-    print('python path', env['PYTHONPATH'])
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
     p.communicate()
-    # assert p.returncode == 0
     std, err = p.communicate(timeout=60)
     std = std.decode('utf-8').strip()
     err = err.decode('utf-8').strip()
-    # assert std and not err
+    assert std
     if p.returncode:
         print(std)
         print(err)
         print(command)
         pytest.fail(err)
 
-    # cli_args += ['--variation', variation]
-    # from tests.models.data.ddp.train_test_variations import main
-    # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args):
-    #     main()
-
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_model_ddp_spawn(tmpdir):

From bb40a2eebebf063e07db1878b339ab1e033a20bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 9 Aug 2020 07:17:20 +0200
Subject: [PATCH 18/32] change

---
 tests/models/test_gpu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index a1b22709dbdee..db26edece1fbf 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -115,6 +115,7 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
     p.communicate()
+
     std, err = p.communicate(timeout=60)
     std = std.decode('utf-8').strip()
     err = err.decode('utf-8').strip()

From 1cd63aa83ccd448b55358f2faab62d41d879ecb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 06:11:01 +0200
Subject: [PATCH 19/32] simple ddp test

---
 .../models/data/ddp/train_test_variations.py  | 33 ++++---------------
 tests/models/test_gpu.py                      |  7 ++--
 2 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/tests/models/data/ddp/train_test_variations.py b/tests/models/data/ddp/train_test_variations.py
index 1ac2e110dd599..f37bd27e8a005 100644
--- a/tests/models/data/ddp/train_test_variations.py
+++ b/tests/models/data/ddp/train_test_variations.py
@@ -1,5 +1,5 @@
 """
-Runs several combinations of `.fit()` and `.test()` on a single node across multiple gpus.
+Runs either `.fit()` or `.test()` on a single node across multiple gpus.
 """
 from argparse import ArgumentParser
 
@@ -7,39 +7,18 @@
 from tests.base import EvalModelTemplate
 
 
-def variation_fit_test(trainer, model):
+def variation_fit(trainer, model):
     trainer.fit(model)
-    trainer.test(model)
-
-
-def variation_test_fit(trainer, model):
-    trainer.test(model)
-    trainer.fit(model)
-
 
-def variation_fit_fit(trainer, model):
-    trainer.fit(model)
-    trainer.fit(model)
 
-
-def variation_test_test(trainer, model):
-    trainer.test(model)
-    trainer.test(model)
-
-
-def variation_test_fit_test(trainer, model):
-    trainer.test(model)
-    trainer.fit(model)
+def variation_test(trainer, model):
     trainer.test(model)
 
 
 def get_variations():
     variations = [
-        "variation_fit_test",
-        "variation_test_fit",
-        "variation_fit_fit",
-        "variation_test_test",
-        "variation_test_fit_test",
+        "variation_fit",
+        "variation_test",
     ]
     return variations
 
@@ -48,7 +27,7 @@ def main():
     seed_everything(1234)
     parser = ArgumentParser(add_help=False)
     parser = Trainer.add_argparse_args(parser)
-    parser.add_argument('--variation', default=variation_fit_test.__name__)
+    parser.add_argument('--variation', default=variation_fit.__name__)
     parser.set_defaults(gpus=2)
     parser.set_defaults(distributed_backend="ddp")
     args = parser.parse_args()
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index db26edece1fbf..245ba798c3c55 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -110,7 +110,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
-    command = [sys.executable, str(file), '--variation', variation] + cli_args
+    cli_args += ['--variation', variation]
+    command = [sys.executable, str(file)] + cli_args
     env = os.environ.copy()
     env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
@@ -119,8 +120,8 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     std, err = p.communicate(timeout=60)
     std = std.decode('utf-8').strip()
     err = err.decode('utf-8').strip()
-    assert std
-    if p.returncode:
+    # assert std, f"{variation} produced no output"
+    if p.returncode > 0:
         print(std)
         print(err)
         print(command)

From febf1147c55eb8849d560e3d1c5277750257e33f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 06:32:17 +0200
Subject: [PATCH 20/32] replace

---
 .../trainer/distrib_data_parallel.py          | 36 +++++++++----------
 pytorch_lightning/utilities/distributed.py    | 10 ++++++
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 5555c26172b11..9566525cfd250 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -163,9 +163,9 @@ def train_fx(trial_hparams, cluster_manager, _):
 else:
     XLA_AVAILABLE = True
 
-PID = os.getpid()
-RNG1 = np.random.RandomState(PID)
-RANDOM_PORTS = RNG1.randint(10000, 19999, 1000)
+# PID = os.getpid()
+# RNG1 = np.random.RandomState(PID)
+# RANDOM_PORTS = RNG1.randint(10000, 19999, 1000)
 
 
 class TrainerDDPMixin(ABC):
@@ -389,21 +389,21 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
         # don't make this debug... this is good UX
         rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
 
-    def set_random_port(self, force=False):
-        """
-        When running DDP NOT managed by SLURM, the ports might collide
-        """
-        # pick a random port first
-        assert self.num_nodes == 1, 'random port can only be called from single node training'
-        global RANDOM_PORTS
-        default_port = RANDOM_PORTS[-1]
-        RANDOM_PORTS = RANDOM_PORTS[:-1]
-
-        # when not forced, use the user port
-        if not force:
-            default_port = os.environ.get('MASTER_PORT', default_port)
-
-        os.environ['MASTER_PORT'] = str(default_port)
+    # def set_random_port(self, force=False):
+    #     """
+    #     When running DDP NOT managed by SLURM, the ports might collide
+    #     """
+    #     # pick a random port first
+    #     assert self.num_nodes == 1, 'random port can only be called from single node training'
+    #     global RANDOM_PORTS
+    #     default_port = RANDOM_PORTS[-1]
+    #     RANDOM_PORTS = RANDOM_PORTS[:-1]
+    #
+    #     # when not forced, use the user port
+    #     if not force:
+    #         default_port = os.environ.get('MASTER_PORT', default_port)
+    #
+    #     os.environ['MASTER_PORT'] = str(default_port)
 
     def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results):
         if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']:
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py
index cd0621496fe42..a44a6c0ec0749 100644
--- a/pytorch_lightning/utilities/distributed.py
+++ b/pytorch_lightning/utilities/distributed.py
@@ -34,3 +34,13 @@ def _debug(*args, **kwargs):
 rank_zero_debug = rank_zero_only(_debug)
 rank_zero_info = rank_zero_only(_info)
 rank_zero_warn = rank_zero_only(_warn)
+
+
+def find_free_network_port():
+    import socket
+    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    s.bind(("", 0))
+    s.listen(1)
+    port = s.getsockname()[1]
+    s.close()
+    return port

From 78c05945e8c818b255ee059fd76a18959633d1a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 06:49:37 +0200
Subject: [PATCH 21/32] remove random port

---
 pytorch_lightning/accelerators/ddp_backend.py    | 16 ++++++----------
 .../accelerators/ddp_spawn_backend.py            |  5 +++--
 pytorch_lightning/core/lightning.py              |  4 ----
 pytorch_lightning/trainer/trainer.py             |  2 --
 4 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py
index ca4ade2d11f6b..d0ea497548a3e 100644
--- a/pytorch_lightning/accelerators/ddp_backend.py
+++ b/pytorch_lightning/accelerators/ddp_backend.py
@@ -24,7 +24,7 @@
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities import AMPType
-from pytorch_lightning.utilities.distributed import rank_zero_only
+from pytorch_lightning.utilities.distributed import rank_zero_only, find_free_network_port
 
 try:
     from hydra.utils import to_absolute_path, get_original_cwd
@@ -56,19 +56,15 @@ def train(self, model):
         self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model)
 
     def spawn_ddp_children(self, model):
-        port = os.environ['MASTER_PORT']
+        assert self.trainer.global_rank == 0
 
-        master_address = '127.0.0.1' if 'MASTER_ADDR' not in os.environ else os.environ['MASTER_ADDR']
-        os.environ['MASTER_PORT'] = f'{port}'
-        os.environ['MASTER_ADDR'] = f'{master_address}'
+        os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
+        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port())
 
         # allow the user to pass the node rank
         node_rank = '0'
-        if 'NODE_RANK' in os.environ:
-            node_rank = os.environ['NODE_RANK']
-        if 'GROUP_RANK' in os.environ:
-            node_rank = os.environ['GROUP_RANK']
-
+        node_rank = os.environ.get('NODE_RANK', node_rank)
+        node_rank = os.environ.get('GROUP_RANK', node_rank)
         os.environ['NODE_RANK'] = node_rank
         os.environ['LOCAL_RANK'] = '0'
 
diff --git a/pytorch_lightning/accelerators/ddp_spawn_backend.py b/pytorch_lightning/accelerators/ddp_spawn_backend.py
index 6fce4cb7a530e..b9d203f74583f 100644
--- a/pytorch_lightning/accelerators/ddp_spawn_backend.py
+++ b/pytorch_lightning/accelerators/ddp_spawn_backend.py
@@ -11,13 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
+import os
 
 import torch
 import torch.multiprocessing as mp
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.utilities import AMPType
-from pytorch_lightning.utilities.distributed import rank_zero_only
+from pytorch_lightning.utilities.distributed import rank_zero_only, find_free_network_port
 
 try:
     from apex import amp
@@ -32,7 +33,7 @@ def __init__(self, trainer):
         self.mp_queue = None
 
     def setup(self):
-        self.trainer.set_random_port()
+        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port())
 
         # pass in a state q
         smp = mp.get_context('spawn')
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index 66d067a5146b6..ba00e702f26c8 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -893,10 +893,6 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi
         log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}")
         torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size)
 
-    """
-    configure_sync_batchnorm
-    ^^^^^^^^^^^^^^^^^^^^^^^^
-    """
     def configure_sync_batchnorm(self, model: 'LightningModule') -> 'LightningModule':
         """
         Add global batchnorm for a model spread across multiple GPUs and nodes.
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 8c2eb4bfe4c32..17021f3f1225a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1377,7 +1377,6 @@ def __test_using_best_weights(self, ckpt_path, test_dataloaders):
 
         # run tests
         self.tested_ckpt_path = ckpt_path
-        self.set_random_port(force=True)
         self.testing = True
         os.environ['PL_TESTING_MODE'] = '1'
         self.model = model
@@ -1400,7 +1399,6 @@ def __test_given_model(self, model, test_dataloaders):
 
         # run test
         # sets up testing so we short circuit to eval
-        self.set_random_port(force=True)
         self.testing = True
         self.model = model
         results = self.fit(model)

From d2efd396de96dbd83ca2166502995e86c3ab54b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 06:51:15 +0200
Subject: [PATCH 22/32] random port

---
 pytorch_lightning/trainer/trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 17021f3f1225a..98d93ad6357fa 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1039,7 +1039,6 @@ def fit(
 
         # ddp
         elif self.distributed_backend == 'ddp':
-            self.set_random_port()
             self.accelerator_backend = DDPBackend(self)
             results = self.accelerator_backend.spawn_ddp_children(model)
 

From b519feeebfd2d07b866e6fd61a9095490aacd433 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 06:52:13 +0200
Subject: [PATCH 23/32] str

---
 pytorch_lightning/accelerators/ddp_backend.py       | 2 +-
 pytorch_lightning/accelerators/ddp_spawn_backend.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py
index d0ea497548a3e..24365c673e931 100644
--- a/pytorch_lightning/accelerators/ddp_backend.py
+++ b/pytorch_lightning/accelerators/ddp_backend.py
@@ -59,7 +59,7 @@ def spawn_ddp_children(self, model):
         assert self.trainer.global_rank == 0
 
         os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
-        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port())
+        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))
 
         # allow the user to pass the node rank
         node_rank = '0'
diff --git a/pytorch_lightning/accelerators/ddp_spawn_backend.py b/pytorch_lightning/accelerators/ddp_spawn_backend.py
index b9d203f74583f..ee4954057964d 100644
--- a/pytorch_lightning/accelerators/ddp_spawn_backend.py
+++ b/pytorch_lightning/accelerators/ddp_spawn_backend.py
@@ -33,7 +33,7 @@ def __init__(self, trainer):
         self.mp_queue = None
 
     def setup(self):
-        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', find_free_network_port())
+        os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))
 
         # pass in a state q
         smp = mp.get_context('spawn')

From e46862af7cd350574945883e04d0972902e3f7fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 06:56:32 +0200
Subject: [PATCH 24/32] clean up

---
 pytorch_lightning/core/saving.py     | 1 +
 tests/base/model_valid_epoch_ends.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py
index 16a9467c71554..1835251b2de54 100644
--- a/pytorch_lightning/core/saving.py
+++ b/pytorch_lightning/core/saving.py
@@ -353,6 +353,7 @@ def save_hparams_to_yaml(config_yaml, hparams: Union[dict, Namespace]) -> None:
     with open(config_yaml, 'w', newline='') as fp:
         yaml.dump(hparams, fp)
 
+
 def convert(val: str) -> Union[int, float, bool, str]:
     try:
         return ast.literal_eval(val)
diff --git a/tests/base/model_valid_epoch_ends.py b/tests/base/model_valid_epoch_ends.py
index 8974224409624..943227e738cae 100644
--- a/tests/base/model_valid_epoch_ends.py
+++ b/tests/base/model_valid_epoch_ends.py
@@ -21,7 +21,6 @@ def _mean(res, key):
             # recursive mean for multilevel dicts
             return torch.stack([x[key] if isinstance(x, dict) else _mean(x, key) for x in res]).mean()
 
-        print('in validation epoch end')
         val_loss_mean = _mean(outputs, 'val_loss')
         val_acc_mean = _mean(outputs, 'val_acc')
 

From fed39041a1104fdbf484a3b01326daf14a8091b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 07:08:11 +0200
Subject: [PATCH 25/32] check run spawn

---
 pytorch_lightning/accelerators/ddp_backend.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pytorch_lightning/accelerators/ddp_backend.py b/pytorch_lightning/accelerators/ddp_backend.py
index 24365c673e931..6866a66543e4a 100644
--- a/pytorch_lightning/accelerators/ddp_backend.py
+++ b/pytorch_lightning/accelerators/ddp_backend.py
@@ -45,6 +45,7 @@ class DDPBackend(object):
     def __init__(self, trainer):
         self.trainer = trainer
         self.task_idx = None
+        self._has_spawned_children = False
 
     def slurm_setup(self):
         self.task_idx = int(os.environ['SLURM_LOCALID'])
@@ -57,6 +58,8 @@ def train(self, model):
 
     def spawn_ddp_children(self, model):
         assert self.trainer.global_rank == 0
+        self._check_can_spawn_children()
+        self._has_spawned_children = True
 
         os.environ['MASTER_ADDR'] = os.environ.get('MASTER_ADDR', '127.0.0.1')
         os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(find_free_network_port()))
@@ -231,3 +234,10 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0
 
         if self.trainer.global_rank == 0 and self.trainer.distributed_backend not in ['ddp_spawn', 'ddp_cpu']:
             return results
+
+    def _check_can_spawn_children(self):
+        if self._has_spawned_children:
+            raise RuntimeError(
+                "You tried to run `.fit` or `.test` multiple times in the same script."
+                " This is not supported in DDP mode, switch to `distributed_backend='ddp_spawn'` instead."
+            )

From a5ae50809a2d4ccc30f57008316682e9efecd094 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 07:19:19 +0200
Subject: [PATCH 26/32] clean up

---
 .../trainer/distrib_data_parallel.py          | 21 -------------------
 tests/models/test_gpu.py                      | 13 ++++++------
 2 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 9566525cfd250..438e4be359441 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -132,7 +132,6 @@ def train_fx(trial_hparams, cluster_manager, _):
 from abc import ABC, abstractmethod
 from typing import Union, List, Optional, Tuple
 
-import numpy as np
 import torch
 
 from pytorch_lightning import _logger as log
@@ -163,10 +162,6 @@ def train_fx(trial_hparams, cluster_manager, _):
 else:
     XLA_AVAILABLE = True
 
-# PID = os.getpid()
-# RNG1 = np.random.RandomState(PID)
-# RANDOM_PORTS = RNG1.randint(10000, 19999, 1000)
-
 
 class TrainerDDPMixin(ABC):
 
@@ -389,22 +384,6 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
         # don't make this debug... this is good UX
         rank_zero_info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
 
-    # def set_random_port(self, force=False):
-    #     """
-    #     When running DDP NOT managed by SLURM, the ports might collide
-    #     """
-    #     # pick a random port first
-    #     assert self.num_nodes == 1, 'random port can only be called from single node training'
-    #     global RANDOM_PORTS
-    #     default_port = RANDOM_PORTS[-1]
-    #     RANDOM_PORTS = RANDOM_PORTS[:-1]
-    #
-    #     # when not forced, use the user port
-    #     if not force:
-    #         default_port = os.environ.get('MASTER_PORT', default_port)
-    #
-    #     os.environ['MASTER_PORT'] = str(default_port)
-
     def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results):
         if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']:
             return
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 245ba798c3c55..929b887a29105 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -2,17 +2,16 @@
 import subprocess
 import sys
 from collections import namedtuple
-from unittest.mock import patch
 from pathlib import Path
-from unittest import mock
+from unittest.mock import patch
 
 import pytest
 import torch
 from torchtext.data import Batch, Dataset, Example, Field, LabelField
 
+import pytorch_lightning
 import tests.base.develop_pipelines as tpipes
 import tests.base.develop_utils as tutils
-import pytorch_lightning
 from pytorch_lightning import Trainer
 from pytorch_lightning.core import memory
 from pytorch_lightning.trainer.distrib_parts import _parse_gpu_ids, determine_root_gpu_device
@@ -107,24 +106,24 @@ def test_multi_gpu_model_dp(tmpdir):
 @pytest.mark.parametrize('variation', train_test_variations.get_variations())
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
 def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
+    """ Runs a basic training and test run with distributed_backend=ddp. """
     file = Path(train_test_variations.__file__).absolute()
     cli_args = cli_args.split(' ') if cli_args else []
     cli_args += ['--default_root_dir', str(tmpdir)]
     cli_args += ['--variation', variation]
     command = [sys.executable, str(file)] + cli_args
+    # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment
     env = os.environ.copy()
     env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')
+    # for running in ddp mode, we need to lauch it's own process or pytest will get stuck
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
     p.communicate()
 
     std, err = p.communicate(timeout=60)
     std = std.decode('utf-8').strip()
     err = err.decode('utf-8').strip()
-    # assert std, f"{variation} produced no output"
+    assert std, f"{variation} produced no output"
     if p.returncode > 0:
-        print(std)
-        print(err)
-        print(command)
         pytest.fail(err)
 
 

From fed97a9a4ad13f1113cdeb3bf85d298aec351a14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 07:57:02 +0200
Subject: [PATCH 27/32] docs

---
 docs/source/multi_gpu.rst                  | 11 ++++++++---
 pytorch_lightning/utilities/distributed.py |  7 ++++++-
 tests/models/test_gpu.py                   |  2 ++
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 706a977290f35..020044bf1b7a0 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -286,8 +286,6 @@ variables:
     MASTER_ADDR=localhost MASTER_PORT=random() WORLD_SIZE=3 NODE_RANK=1 LOCAL_RANK=0 python my_file.py --gpus 3 --etc
     MASTER_ADDR=localhost MASTER_PORT=random() WORLD_SIZE=3 NODE_RANK=2 LOCAL_RANK=0 python my_file.py --gpus 3 --etc
 
-If your code does not support this (ie: jupyter notebook, colab, or a nested script without a root package),
-use `dp` or `ddp_spawn`.
 We use DDP this way because `ddp_spawn` has a few limitations (due to Python and PyTorch):
 
 1. Since `.spawn()` trains the model in subprocesses, the model on the main process does not get updated.
@@ -296,7 +294,14 @@ We use DDP this way because `ddp_spawn` has a few limitations (due to Python and
 
 3. Forces everything to be picklable.
 
-However, if you don't mind these limitations, you can use `ddp_spawn`.
+There are cases in which it is not possible to use DDP. Examples are:
+
+- Jupyter Notebook
+- Google COLAB, Kaggle, etc.
+- You have a nested script without a root package
+- Your script needs to invoke `.fit` or `.test` multiple times
+
+In these situations you should use `dp` or `ddp_spawn` instead.
 
 Distributed Data Parallel 2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py
index a44a6c0ec0749..5375bfd0fa6ce 100644
--- a/pytorch_lightning/utilities/distributed.py
+++ b/pytorch_lightning/utilities/distributed.py
@@ -36,7 +36,12 @@ def _debug(*args, **kwargs):
 rank_zero_warn = rank_zero_only(_warn)
 
 
-def find_free_network_port():
+def find_free_network_port() -> int:
+    """
+    Finds a free port on localhost.
+    It is useful in single-node training when we don't want to connect to a real master node but
+    have to set the `MASTER_PORT` environment variable.
+    """
     import socket
     s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     s.bind(("", 0))
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 929b887a29105..e60bf70644aa7 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -112,9 +112,11 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
     cli_args += ['--default_root_dir', str(tmpdir)]
     cli_args += ['--variation', variation]
     command = [sys.executable, str(file)] + cli_args
+
     # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment
     env = os.environ.copy()
     env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')
+
     # for running in ddp mode, we need to lauch it's own process or pytest will get stuck
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
     p.communicate()

From 1b212d3384e32b9a5a0d600457c74e3a819e79f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 08:00:26 +0200
Subject: [PATCH 28/32] docs

---
 docs/source/multi_gpu.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 020044bf1b7a0..c93d5634fa1c2 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -289,9 +289,7 @@ variables:
 We use DDP this way because `ddp_spawn` has a few limitations (due to Python and PyTorch):
 
 1. Since `.spawn()` trains the model in subprocesses, the model on the main process does not get updated.
-
 2. Dataloader(num_workers=N), where N is large, bottlenecks training with DDP... ie: it will be VERY slow or won't work at all. This is a PyTorch limitation.
-
 3. Forces everything to be picklable.
 
 There are cases in which it is not possible to use DDP. Examples are:

From 9652305615be166bf236443984f35f299572f411 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 08:01:44 +0200
Subject: [PATCH 29/32] update test

---
 tests/models/test_gpu.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index e60bf70644aa7..b6a2efbb8621b 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -119,7 +119,6 @@ def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
 
     # for running in ddp mode, we need to lauch it's own process or pytest will get stuck
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
-    p.communicate()
 
     std, err = p.communicate(timeout=60)
     std = std.decode('utf-8').strip()

From d1ed49a9442b2297d9139fa4631dd47dfc7ca54c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 08:02:44 +0200
Subject: [PATCH 30/32] docs

---
 docs/source/multi_gpu.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index c93d5634fa1c2..57b3e0813f54f 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -294,8 +294,7 @@ We use DDP this way because `ddp_spawn` has a few limitations (due to Python and
 
 There are cases in which it is not possible to use DDP. Examples are:
 
-- Jupyter Notebook
-- Google COLAB, Kaggle, etc.
+- Jupyter Notebook, Google COLAB, Kaggle, etc.
 - You have a nested script without a root package
 - Your script needs to invoke `.fit` or `.test` multiple times
 

From 0fbc3751502d388baaf90e7b9d181674f1c8eabb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 08:05:34 +0200
Subject: [PATCH 31/32] changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e639a5c53f6f..1a79aa8026fc9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -138,6 +138,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed checkpointing to remote file paths ([#2925](https://github.com/PyTorchLightning/pytorch-lightning/pull/2925))
 
+- Fixed an issue when running `Trainer.test()` in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997))
+
 ## [0.8.5] - 2020-07-09
 
 ### Added

From 92a1ea37a76a40efbf3d84e7b86c324ada3d5083 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 16 Aug 2020 08:06:43 +0200
Subject: [PATCH 32/32] changelog

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1a79aa8026fc9..c5879503a2303 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -138,7 +138,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed checkpointing to remote file paths ([#2925](https://github.com/PyTorchLightning/pytorch-lightning/pull/2925))
 
-- Fixed an issue when running `Trainer.test()` in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997))
+- Fixed an issue that caused `Trainer.test()` to stall in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997))
 
 ## [0.8.5] - 2020-07-09