Prevent loss from being moved to the CPU before the backward call. (#9308)
tchaton authored Sep 3, 2021
1 parent 9d0caa6 commit f6d4087
Showing 3 changed files with 25 additions and 3 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -286,6 +286,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/PyTorchLightning/pytorch-lightning/pull/9267))


+- Fixed `move_metrics_to_cpu` moving the loss to CPU while training on device ([#9308](https://github.com/PyTorchLightning/pytorch-lightning/pull/9308))
+
+
 ## [1.4.5] - 2021-08-31

 - Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))
8 changes: 5 additions & 3 deletions pytorch_lightning/loops/utilities.py
@@ -99,14 +99,16 @@ def _process_training_step_output(
     elif isinstance(training_step_output, torch.Tensor):
         loss = training_step_output

-    # map to results under the hood
-    results.minimize = loss
-
     if trainer.terminate_on_nan:
         check_finite_loss(loss)

+    # the loss shouldn't be moved to cpu.
     if trainer.move_metrics_to_cpu:
         results.cpu()

+    # map to results under the hood
+    results.minimize = loss
+
     return results, hiddens
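The reordering above matters because `results.minimize = loss` stores a reference inside the result collection, and `results.cpu()` replaces every stored tensor with a CPU copy. Storing the loss before the move means the backward pass would run on the CPU copy rather than on the device-resident tensor. A minimal, self-contained sketch of this ordering bug (the `Results` class here is a hypothetical stand-in for Lightning's internal result collection, not the real API):

```python
import torch

# Hypothetical stand-in for Lightning's result collection (illustrative only):
# it stores the loss plus logged metrics and can move everything it holds to CPU.
class Results:
    def __init__(self):
        self.minimize = None  # the loss tensor later used for backward()
        self.metrics = {}

    def cpu(self):
        # what `move_metrics_to_cpu` triggers: every stored tensor,
        # including `minimize` if already set, is replaced by a CPU copy
        if self.minimize is not None:
            self.minimize = self.minimize.cpu()
        self.metrics = {k: v.cpu() for k, v in self.metrics.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss = (torch.randn(4, device=device, requires_grad=True) ** 2).mean()

# Buggy ordering (before this commit): the stored loss is swapped for a
# CPU copy, so on an accelerator backward() would run on the wrong device.
buggy = Results()
buggy.minimize = loss
buggy.cpu()

# Fixed ordering (this commit): move metrics first, store the loss after,
# so `minimize` keeps pointing at the device-resident tensor.
fixed = Results()
fixed.metrics["acc"] = torch.tensor(0.9, device=device)
fixed.cpu()
fixed.minimize = loss

assert fixed.minimize is loss           # still the original device tensor
assert fixed.metrics["acc"].device.type == "cpu"  # metrics moved as requested
```

The fix is purely an ordering change: metrics still land on CPU when `move_metrics_to_cpu=True`, but the loss assigned afterwards stays where the model is.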
17 changes: 17 additions & 0 deletions tests/trainer/logging_/test_train_loop_logging.py
@@ -701,3 +701,20 @@ def test_log_gpu_memory_without_logging_on_step(tmpdir, log_gpu_memory):
         assert "max_gpu_mem" in trainer.logged_metrics
     else:
         assert "gpu_id: 1/memory.used (MB)" in trainer.logged_metrics
+
+
+@RunIf(min_gpus=1)
+def test_move_metrics_to_cpu(tmpdir):
+    class TestModel(BoringModel):
+        def on_before_backward(self, loss: torch.Tensor) -> None:
+            assert loss.device.type == "cuda"
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        amp_backend="native",
+        precision=16,
+        move_metrics_to_cpu=True,
+        gpus=1,
+    )
+    trainer.fit(TestModel())
