From f6d40871bd52ac755a146958513a0a330b813b52 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 3 Sep 2021 17:26:26 +0100 Subject: [PATCH] Prevent loss to be moved to the cpu before backward call. (#9308) --- CHANGELOG.md | 3 +++ pytorch_lightning/loops/utilities.py | 8 +++++--- .../trainer/logging_/test_train_loop_logging.py | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 364d3880faea2..bd7bd5338de9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -286,6 +286,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/PyTorchLightning/pytorch-lightning/pull/9267)) +- Fixed `move_metrics_to_cpu` moving the loss to cpu while training on device ([#9308](https://github.com/PyTorchLightning/pytorch-lightning/pull/9308)) + + ## [1.4.5] - 2021-08-31 - Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142)) diff --git a/pytorch_lightning/loops/utilities.py b/pytorch_lightning/loops/utilities.py index bafe7df882462..f90bc392e72d8 100644 --- a/pytorch_lightning/loops/utilities.py +++ b/pytorch_lightning/loops/utilities.py @@ -99,14 +99,16 @@ def _process_training_step_output( elif isinstance(training_step_output, torch.Tensor): loss = training_step_output - # map to results under the hood - results.minimize = loss - if trainer.terminate_on_nan: check_finite_loss(loss) + # the loss shouldn't be moved to cpu. 
if trainer.move_metrics_to_cpu: results.cpu() + + # map to results under the hood + results.minimize = loss + return results, hiddens diff --git a/tests/trainer/logging_/test_train_loop_logging.py b/tests/trainer/logging_/test_train_loop_logging.py index 385870fedd890..67df980aa05d6 100644 --- a/tests/trainer/logging_/test_train_loop_logging.py +++ b/tests/trainer/logging_/test_train_loop_logging.py @@ -701,3 +701,20 @@ def test_log_gpu_memory_without_logging_on_step(tmpdir, log_gpu_memory): assert "max_gpu_mem" in trainer.logged_metrics else: assert "gpu_id: 1/memory.used (MB)" in trainer.logged_metrics + + +@RunIf(min_gpus=1) +def test_move_metrics_to_cpu(tmpdir): + class TestModel(BoringModel): + def on_before_backward(self, loss: torch.Tensor) -> None: + assert loss.device.type == "cuda" + + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + amp_backend="native", + precision=16, + move_metrics_to_cpu=True, + gpus=1, + ) + trainer.fit(TestModel())