From 245adf05bd05c4866c6f31819b674d6d077d7740 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sat, 6 Mar 2021 19:28:49 +0100
Subject: [PATCH 1/7] fix

---
 pytorch_lightning/trainer/trainer.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index e509f22bc8b0e..fc2d4d907a972 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -381,21 +381,6 @@ def __init__(
         # Callback system
         self.on_init_end()
 
-    def setup_trainer(self, model: LightningModule):
-        """
-        Sanity check a few things before starting actual training or testing.
-
-        Args:
-            model: The model to run sanity test on.
-        """
-
-        # log hyper-parameters
-        if self.logger is not None:
-            # save exp to get started (this is where the first experiment logs are written)
-            self.logger.log_hyperparams(model.hparams_initial)
-            self.logger.log_graph(model)
-            self.logger.save()
-
     def fit(
         self,
         model: LightningModule,
@@ -444,7 +429,6 @@ def fit(
         self.call_setup_hook(model)
         self.call_hook("on_before_accelerator_backend_setup", model)
         self.accelerator.setup(self, model)
-        self.setup_trainer(model)
 
         # ----------------------------
         # INSPECT THE CORE LOOPS
@@ -509,6 +493,13 @@ def fit(
     def pre_dispatch(self):
         self.accelerator.pre_dispatch()
 
+        # log hyper-parameters
+        if self.logger is not None:
+            # save exp to get started (this is where the first experiment logs are written)
+            self.logger.log_hyperparams(self.lightning_module.hparams_initial)
+            self.logger.log_graph(self.lightning_module)
+            self.logger.save()
+
     def post_dispatch(self):
         self.accelerator.post_dispatch()
         self.accelerator.teardown()

From d5b2bc956e7e865a3f2769888a1fb8f373031c6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 7 Mar 2021 03:29:28 +0100
Subject: [PATCH 2/7] add simple test

---
 .../logging_/test_distributed_logging.py | 41 ++++++++++++++++++-
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py
index b8d68693fc393..8c28208063e59 100644
--- a/tests/trainer/logging_/test_distributed_logging.py
+++ b/tests/trainer/logging_/test_distributed_logging.py
@@ -11,10 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import os
+from pathlib import Path
 from unittest import mock
+from unittest.mock import Mock, MagicMock, call
 
-from pytorch_lightning import Trainer
+from pytorch_lightning import Trainer, Callback
+from pytorch_lightning.loggers import TensorBoardLogger
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -66,3 +69,37 @@ def test_global_zero_only_logging_ddp_spawn(tmpdir):
         weights_summary=None,
     )
     trainer.fit(model)
+
+
+class LoggerCallsObserver(Callback):
+
+    def on_before_accelerator_backend_setup(self, trainer, pl_module):
+        assert not trainer.logger.method_calls
+        assert not os.listdir(trainer.logger.save_dir)
+
+    def on_train_start(self, trainer, pl_module):
+        assert trainer.logger.method_call
+        trainer.logger.log_hyperparams.assert_called_once()
+        trainer.logger.log_graph.assert_called_once()
+
+
+def test_first_logger_call_in_subprocess(tmpdir):
+    """
+    Test that the Trainer does not call the logger too early. Only when the worker processes are initialized
+    do we have access to the rank and know which one is the main process.
+    """
+    logger = Mock()
+    logger.version = "0"
+    logger.name = "name"
+    logger.save_dir = tmpdir
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=1,
+        max_epochs=1,
+        logger=logger,
+        callbacks=[LoggerCallsObserver()]
+    )
+    trainer.fit(model)

From 78903501ffe1f8dadc9c876a7b2eb1b5b19cee2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 7 Mar 2021 03:37:34 +0100
Subject: [PATCH 3/7] fix imports

---
 tests/trainer/logging_/test_distributed_logging.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py
index 8c28208063e59..474fdfbbb7520 100644
--- a/tests/trainer/logging_/test_distributed_logging.py
+++ b/tests/trainer/logging_/test_distributed_logging.py
@@ -12,12 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from pathlib import Path
 from unittest import mock
-from unittest.mock import Mock, MagicMock, call
+from unittest.mock import Mock
 
-from pytorch_lightning import Trainer, Callback
-from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning import Callback, Trainer
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 

From 18a28414cf310983256d6dd627418b9a3c6c7b67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 7 Mar 2021 04:10:47 +0100
Subject: [PATCH 4/7] add changelog

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41ec984da3c88..8e4697b572448 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -92,7 +92,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed `ModelPruning(make_pruning_permanent=True)` pruning buffers getting removed when saved during training ([#6073](https://github.com/PyTorchLightning/pytorch-lightning/pull/6073))
 
-- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272)) 
+- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272))
 
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
@@ -107,7 +107,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/PyTorchLightning/pytorch-lightning/pull/6260))
 
-- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272))
+- Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380))
 
 ## [1.2.2] - 2021-03-02
 

From 286001ec3a6f5a69a8fed2d88ffbac34be42cd5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 7 Mar 2021 15:27:39 +0100
Subject: [PATCH 5/7] tighter test with on_fit_start hook closer to the dispatch call

---
 tests/trainer/logging_/test_distributed_logging.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py
index 474fdfbbb7520..15bdff21037fe 100644
--- a/tests/trainer/logging_/test_distributed_logging.py
+++ b/tests/trainer/logging_/test_distributed_logging.py
@@ -71,7 +71,7 @@ def test_global_zero_only_logging_ddp_spawn(tmpdir):
 
 class LoggerCallsObserver(Callback):
 
-    def on_before_accelerator_backend_setup(self, trainer, pl_module):
+    def on_fit_start(self, trainer, pl_module):
         assert not trainer.logger.method_calls
         assert not os.listdir(trainer.logger.save_dir)
 

From 8e210733aff55ef44e6ae2022b03f5124ba6a233 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Sun, 7 Mar 2021 15:28:25 +0100
Subject: [PATCH 6/7] move class inside test function

---
 .../logging_/test_distributed_logging.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py
index 15bdff21037fe..d58f53a6493c9 100644
--- a/tests/trainer/logging_/test_distributed_logging.py
+++ b/tests/trainer/logging_/test_distributed_logging.py
@@ -69,23 +69,23 @@ def test_global_zero_only_logging_ddp_spawn(tmpdir):
     trainer.fit(model)
 
 
-class LoggerCallsObserver(Callback):
-
-    def on_fit_start(self, trainer, pl_module):
-        assert not trainer.logger.method_calls
-        assert not os.listdir(trainer.logger.save_dir)
-
-    def on_train_start(self, trainer, pl_module):
-        assert trainer.logger.method_call
-        trainer.logger.log_hyperparams.assert_called_once()
-        trainer.logger.log_graph.assert_called_once()
-
-
 def test_first_logger_call_in_subprocess(tmpdir):
     """
     Test that the Trainer does not call the logger too early. Only when the worker processes are initialized
     do we have access to the rank and know which one is the main process.
""" + + class LoggerCallsObserver(Callback): + + def on_fit_start(self, trainer, pl_module): + assert not trainer.logger.method_calls + assert not os.listdir(trainer.logger.save_dir) + + def on_train_start(self, trainer, pl_module): + assert trainer.logger.method_call + trainer.logger.log_hyperparams.assert_called_once() + trainer.logger.log_graph.assert_called_once() + logger = Mock() logger.version = "0" logger.name = "name" From e1f98fcd4cc6004b5b886aaa9f463afd26716044 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 7 Mar 2021 15:32:44 +0100 Subject: [PATCH 7/7] add a comment --- tests/trainer/logging_/test_distributed_logging.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/trainer/logging_/test_distributed_logging.py b/tests/trainer/logging_/test_distributed_logging.py index d58f53a6493c9..5832f387cc63d 100644 --- a/tests/trainer/logging_/test_distributed_logging.py +++ b/tests/trainer/logging_/test_distributed_logging.py @@ -78,6 +78,8 @@ def test_first_logger_call_in_subprocess(tmpdir): class LoggerCallsObserver(Callback): def on_fit_start(self, trainer, pl_module): + # this hook is executed directly before Trainer.pre_dispatch + # logger should not write any logs until this point assert not trainer.logger.method_calls assert not os.listdir(trainer.logger.save_dir)