[Fix] Ensure we set the eval/train flag correctly on accelerator model (#6877)

* Ensure we move the model to eval mode before running evaluation
* Ensure we set the flag appropriately across all stages
* Add test, move hooks logic
* Apply same fix to the validate loop
* Update pytorch_lightning/trainer/trainer.py
* Fix function name
* Fix order, add predict
* Shorten the name
* Fix input dm, drop duplicate on predict start hook call, as it's called in the setup function
* Use hook, remove double call

(cherry picked from commit 742c48e)
Lightning-AI · Apr 14, 2021 · e999d96 · e999d96
1 parent 123dcb7
commit e999d96
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 8 deletions.
diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py
@@ -150,13 +150,13 @@ def on_validation_model_eval(self) -> None:
         """
         Sets the model to eval during the val loop
         """
-        self.eval()
+        self.trainer.model.eval()
 
     def on_validation_model_train(self) -> None:
         """
         Sets the model to train during the val loop
         """
-        self.train()
+        self.trainer.model.train()
 
     def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
         """
@@ -208,19 +208,19 @@ def on_test_model_train(self) -> None:
         """
         Sets the model to train during the test loop
         """
-        self.train()
+        self.trainer.model.train()
 
     def on_test_model_eval(self) -> None:
         """
         Sets the model to eval during the test loop
         """
-        self.eval()
+        self.trainer.model.eval()
 
     def on_predict_model_eval(self) -> None:
         """
         Sets the model to eval during the predict loop
         """
-        self.eval()
+        self.trainer.model.eval()
 
     def on_epoch_start(self) -> None:
         """

diff --git a/pytorch_lightning/trainer/predict_loop.py b/pytorch_lightning/trainer/predict_loop.py
@@ -44,6 +44,7 @@ def on_predict_model_eval(self, *_, **__):
         model_ref.on_predict_model_eval()
 
     def setup(self, model, max_batches, dataloaders):
+
         # copy properties for forward overrides
         self.trainer.model_connector.copy_trainer_model_properties(model)
 
@@ -95,3 +96,7 @@ def _convert_to_numpy(v):
             return results[0]
 
         return results
+
+    def on_predict_start(self):
+        # hook
+        self.trainer.call_hook("on_predict_start")
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
@@ -612,11 +612,11 @@ def run_train(self):
         self.checkpoint_connector.has_trained = False
 
         # enable train mode
-        model = self.lightning_module
-        model.train()
+        self.model.train()
         torch.set_grad_enabled(True)
 
         # reload data when needed
+        model = self.lightning_module
         self.train_loop.reset_train_val_dataloaders(model)
 
         # hook
@@ -814,6 +814,9 @@ def run_predict(self):
         model.zero_grad()
         torch.set_grad_enabled(False)
 
+        # call hook
+        self.predict_loop.on_predict_start()
+
         # set up the eval loop
         self.predict_loop.setup(model, max_batches, dataloaders)
 

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
@@ -1381,7 +1381,14 @@ def setup(self, model, stage):
 )
 @patch("pytorch_lightning.loggers.tensorboard.TensorBoardLogger.log_metrics")
 def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, log_interval):
-    model = EvalModelTemplate()
+
+    class TestModel(BoringModel):
+
+        def training_step(self, *args, **kwargs):
+            self.log("foo", -1)
+            return super().training_step(*args, **kwargs)
+
+    model = TestModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
         log_every_n_steps=log_interval,
@@ -1932,3 +1939,32 @@ def forward(self, x):
 
     with pytest.raises(MisconfigurationException, match="Dataloader not found for `Trainer.predict`"):
         trainer.predict(model)
+
+
+class TrainerStagesModel(BoringModel):
+
+    def on_train_start(self) -> None:
+        assert self.trainer.model.training
+        assert self.training
+
+    def on_validation_start(self) -> None:
+        assert not self.trainer.model.training
+        assert not self.training
+
+    def on_test_start(self) -> None:
+        assert not self.trainer.model.training
+        assert not self.training
+
+    def on_predict_start(self) -> None:
+        assert not self.trainer.model.training
+        assert not self.training
+
+
+@pytest.mark.parametrize(['accelerator', 'num_processes'],
+                         [(None, 1), pytest.param('ddp', 2, marks=RunIf(skip_windows=True))])
+def test_model_in_correct_mode_during_stages(tmpdir, accelerator, num_processes):
+    model = TrainerStagesModel()
+    trainer = Trainer(default_root_dir=tmpdir, accelerator=accelerator, num_processes=num_processes, fast_dev_run=True)
+    trainer.fit(model)
+    trainer.test(model)
+    trainer.predict(model, model.val_dataloader())