From 1c851b89e1758f7c3b96b71b7b6619af130e81cf Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Feb 2021 23:08:46 +0100 Subject: [PATCH] fixing miss-leading tested acc values (#5876) * fixing tested values * . * tests * yapf * softmax * hvd * rename * lr * duplicate * drop * classif * rm EvalModel * Revert "rm EvalModel" This reverts commit 6c3fb39ebe0c4bfb52357bccfd050438f2c0f31c. * update tests * fix * azure * azure * self * cpu * Apply suggestions from code review Co-authored-by: rohitgr7 --- tests/accelerators/ddp_model.py | 37 +++++---- tests/accelerators/test_ddp.py | 25 +++--- tests/accelerators/test_ddp_spawn.py | 15 ++-- tests/accelerators/test_dp.py | 60 +++++++++++---- tests/base/model_template.py | 2 +- tests/core/test_datamodules.py | 3 +- tests/helpers/pipelines.py | 76 ++++++------------- .../models/data/horovod/test_train_script.py | 30 ++++++++ .../data/horovod/train_default_model.py | 15 ++-- tests/models/test_gpu.py | 7 +- tests/models/test_restore.py | 61 ++++++++------- tests/models/test_tpu.py | 13 +++- tests/trainer/test_dataloaders.py | 11 ++- tests/trainer/test_lr_finder.py | 11 ++- tests/utilities/test_parsing.py | 8 +- 15 files changed, 207 insertions(+), 167 deletions(-) create mode 100644 tests/models/data/horovod/test_train_script.py diff --git a/tests/accelerators/ddp_model.py b/tests/accelerators/ddp_model.py index aa286d2118c13..78d1306665c59 100644 --- a/tests/accelerators/ddp_model.py +++ b/tests/accelerators/ddp_model.py @@ -20,7 +20,8 @@ import torch from pytorch_lightning import seed_everything, Trainer -from tests.base import EvalModelTemplate +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel def main(): @@ -35,24 +36,28 @@ def main(): parser.set_defaults(accelerator="ddp") args = parser.parse_args() - model = EvalModelTemplate() + dm = ClassifDataModule() + model = ClassificationModel() trainer = Trainer.from_argparse_args(args) - result = {} if args.trainer_method == 'fit': - trainer.fit(model) - result = {'status': 'complete', 'method': args.trainer_method, 'result': None} - if args.trainer_method == 'test': - result = trainer.test(model) - result = {'status': 'complete', 'method': args.trainer_method, 'result': result} - if args.trainer_method == 'fit_test': - trainer.fit(model) - result = trainer.test(model) - result = {'status': 'complete', 'method': args.trainer_method, 'result': result} - - if len(result) > 0: - file_path = os.path.join(args.tmpdir, 'ddp.result') - torch.save(result, file_path) + trainer.fit(model, datamodule=dm) + result = None + elif args.trainer_method == 'test': + result = trainer.test(model, datamodule=dm) + elif args.trainer_method == 'fit_test': + trainer.fit(model, datamodule=dm) + result = trainer.test(model, datamodule=dm) + else: + raise ValueError(f'Unsupported: {args.trainer_method}') + + result_ext = { + 'status': 'complete', + 'method': args.trainer_method, + 'result': result, + } + file_path = os.path.join(args.tmpdir, 'ddp.result') + torch.save(result_ext, file_path) if __name__ == '__main__': diff --git a/tests/accelerators/test_ddp.py b/tests/accelerators/test_ddp.py index b582532cd710e..4de9664fffb7e 100644 --- a/tests/accelerators/test_ddp.py +++ b/tests/accelerators/test_ddp.py @@ -23,14 +23,13 @@ from tests.helpers.boring_model import BoringModel from tests.utilities.distributed import call_training_script +CLI_ARGS = '--max_epochs 1 --gpus 2 --accelerator ddp' + -@pytest.mark.parametrize('cli_args', [ - 
pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'), -]) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args): +def test_multi_gpu_model_ddp_fit_only(tmpdir): # call the script - std, err = call_training_script(ddp_model, cli_args, 'fit', tmpdir, timeout=120) + call_training_script(ddp_model, CLI_ARGS, 'fit', tmpdir, timeout=120) # load the results of the script result_path = os.path.join(tmpdir, 'ddp.result') @@ -40,13 +39,10 @@ def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args): assert result['status'] == 'complete' -@pytest.mark.parametrize('cli_args', [ - pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'), -]) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args): +def test_multi_gpu_model_ddp_test_only(tmpdir): # call the script - call_training_script(ddp_model, cli_args, 'test', tmpdir) + call_training_script(ddp_model, CLI_ARGS, 'test', tmpdir) # load the results of the script result_path = os.path.join(tmpdir, 'ddp.result') @@ -56,13 +52,10 @@ def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args): assert result['status'] == 'complete' -@pytest.mark.parametrize('cli_args', [ - pytest.param('--max_epochs 1 --gpus 2 --accelerator ddp'), -]) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") -def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args): +def test_multi_gpu_model_ddp_fit_test(tmpdir): # call the script - call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20) + call_training_script(ddp_model, CLI_ARGS, 'fit_test', tmpdir, timeout=20) # load the results of the script result_path = os.path.join(tmpdir, 'ddp.result') @@ -73,7 +66,7 @@ def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args): model_outs = result['result'] for out in model_outs: - assert out['test_acc'] > 0.90 + assert out['test_acc'] > 0.7 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") diff --git a/tests/accelerators/test_ddp_spawn.py b/tests/accelerators/test_ddp_spawn.py index 1e17947fe6eb9..3ec391d8130c1 100644 --- a/tests/accelerators/test_ddp_spawn.py +++ b/tests/accelerators/test_ddp_spawn.py @@ -20,7 +20,9 @@ from pytorch_lightning.core import memory from pytorch_lightning.trainer import Trainer from pytorch_lightning.trainer.states import TrainerState -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -29,7 +31,7 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir): trainer_options = dict( default_root_dir=tmpdir, - callbacks=[EarlyStopping()], + callbacks=[EarlyStopping(monitor='train_acc')], max_epochs=50, limit_train_batches=10, limit_val_batches=10, @@ -37,8 +39,9 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir): accelerator='ddp_spawn', ) - model = EvalModelTemplate() - tpipes.run_model_test(trainer_options, model) + dm = ClassifDataModule() + model = ClassificationModel() + tpipes.run_model_test(trainer_options, model, dm) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -55,7 +58,7 @@ def test_multi_gpu_model_ddp_spawn(tmpdir): progress_bar_refresh_rate=0, ) - model = EvalModelTemplate() + model = 
BoringModel() tpipes.run_model_test(trainer_options, model) @@ -68,7 +71,7 @@ def test_ddp_all_dataloaders_passed_to_fit(tmpdir): """Make sure DDP works with dataloaders passed to fit()""" tutils.set_random_master_port() - model = EvalModelTemplate() + model = BoringModel() fit_options = dict(train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader()) trainer = Trainer( diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 6e826719b5b98..7da18f0e81f7c 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -11,27 +11,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +from unittest import mock + import pytest import torch +import torch.nn.functional as F import pytorch_lightning as pl import tests.helpers.pipelines as tpipes import tests.helpers.utils as tutils from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.core import memory -from tests.base import EvalModelTemplate +from tests.helpers import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel PRETEND_N_OF_GPUS = 16 +class CustomClassificationModelDP(ClassificationModel): + + def _step(self, batch, batch_idx): + x, y = batch + logits = self(x) + return {'logits': logits, 'y': y} + + def training_step(self, batch, batch_idx): + out = self._step(batch, batch_idx) + loss = F.cross_entropy(out['logits'], out['y']) + return loss + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def validation_step_end(self, outputs): + self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y'])) + + def test_step_end(self, outputs): + self.log('test_acc', self.test_acc(outputs['logits'], outputs['y'])) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_multi_gpu_early_stop_dp(tmpdir): """Make sure DDP works. 
with early stopping""" tutils.set_random_master_port() + dm = ClassifDataModule() + model = CustomClassificationModelDP() + trainer_options = dict( default_root_dir=tmpdir, - callbacks=[EarlyStopping()], + callbacks=[EarlyStopping(monitor='val_acc')], max_epochs=50, limit_train_batches=10, limit_val_batches=10, @@ -39,8 +73,7 @@ def test_multi_gpu_early_stop_dp(tmpdir): accelerator='dp', ) - model = EvalModelTemplate() - tpipes.run_model_test(trainer_options, model) + tpipes.run_model_test(trainer_options, model, dm) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -57,7 +90,7 @@ def test_multi_gpu_model_dp(tmpdir): progress_bar_refresh_rate=0, ) - model = EvalModelTemplate() + model = BoringModel() tpipes.run_model_test(trainer_options, model) @@ -65,14 +98,13 @@ def test_multi_gpu_model_dp(tmpdir): memory.get_memory_profile('min_max') +@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_dp_test(tmpdir): tutils.set_random_master_port() - import os - os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' - - model = EvalModelTemplate() + dm = ClassifDataModule() + model = CustomClassificationModelDP() trainer = pl.Trainer( default_root_dir=tmpdir, max_epochs=2, @@ -81,17 +113,17 @@ def test_dp_test(tmpdir): gpus=[0, 1], accelerator='dp', ) - trainer.fit(model) + trainer.fit(model, datamodule=dm) assert 'ckpt' in trainer.checkpoint_callback.best_model_path - results = trainer.test() + results = trainer.test(datamodule=dm) assert 'test_acc' in results[0] - old_weights = model.c_d1.weight.clone().detach().cpu() + old_weights = model.layer_0.weight.clone().detach().cpu() - results = trainer.test(model) + results = trainer.test(model, datamodule=dm) assert 'test_acc' in results[0] # make sure weights didn't change - new_weights = model.c_d1.weight.clone().detach().cpu() + new_weights = model.layer_0.weight.clone().detach().cpu() assert torch.all(torch.eq(old_weights, new_weights)) diff --git a/tests/base/model_template.py b/tests/base/model_template.py index 1d36df8f5ef50..1ec2df7865caa 100644 --- a/tests/base/model_template.py +++ b/tests/base/model_template.py @@ -111,7 +111,7 @@ def forward(self, x): x = self.c_d1_drop(x) x = self.c_d2(x) - logits = F.log_softmax(x, dim=1) + logits = F.softmax(x, dim=1) return logits diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py index aa50405f87cd9..50195fef02426 100644 --- a/tests/core/test_datamodules.py +++ b/tests/core/test_datamodules.py @@ -385,9 +385,8 @@ def _step(self, batch, batch_idx): return {'logits': logits, 'y': y} def training_step(self, batch, batch_idx): - _, y = batch out = self._step(batch, batch_idx) - loss = F.cross_entropy(out['logits'], y) + loss = F.cross_entropy(out['logits'], out['y']) return loss def validation_step(self, batch, batch_idx): diff --git a/tests/helpers/pipelines.py b/tests/helpers/pipelines.py index ec1e81fc2cecb..403bcdfee8c1d 100644 --- a/tests/helpers/pipelines.py +++ b/tests/helpers/pipelines.py @@ -13,39 +13,41 @@ # limitations under the License. 
import torch -from pytorch_lightning import LightningDataModule, Trainer +from pytorch_lightning import LightningDataModule, LightningModule, Trainer +from pytorch_lightning.metrics.functional import accuracy from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import DistributedType from tests.helpers import BoringModel from tests.helpers.utils import get_default_logger, load_model_from_checkpoint, reset_seed -def run_model_test_without_loggers(trainer_options, model, min_acc: float = 0.50): +def run_model_test_without_loggers( + trainer_options: dict, model: LightningModule, data: LightningDataModule = None, min_acc: float = 0.50 +): reset_seed() # fit model trainer = Trainer(**trainer_options) - trainer.fit(model) + trainer.fit(model, datamodule=data) # correct result and ok accuracy assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" - pretrained_model = load_model_from_checkpoint( - trainer.logger, trainer.checkpoint_callback.best_model_path, type(model) - ) + model2 = load_model_from_checkpoint(trainer.logger, trainer.checkpoint_callback.best_model_path, type(model)) # test new model accuracy - test_loaders = model.test_dataloader() + test_loaders = model2.test_dataloader() if not data else data.test_dataloader() if not isinstance(test_loaders, list): test_loaders = [test_loaders] - for dataloader in test_loaders: - run_prediction(pretrained_model, dataloader, min_acc=min_acc) + if not isinstance(model2, BoringModel): + for dataloader in test_loaders: + run_prediction_eval_model_template(model2, dataloader, min_acc=min_acc) def run_model_test( trainer_options, - model, + model: LightningModule, data: LightningDataModule = None, on_gpu: bool = True, version=None, @@ -76,8 +78,9 @@ def run_model_test( if not isinstance(test_loaders, list): test_loaders = [test_loaders] - for dataloader in test_loaders: - run_prediction(pretrained_model, dataloader, min_acc=min_acc) + if not isinstance(model, BoringModel): + for dataloader in test_loaders: + run_prediction_eval_model_template(model, dataloader, min_acc=min_acc) if with_hpc: if trainer._distrib_type in (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2): @@ -92,50 +95,17 @@ def run_model_test( trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) -def run_prediction(trained_model, dataloader, dp=False, min_acc=0.25): - if isinstance(trained_model, BoringModel): - return _boring_model_run_prediction(trained_model, dataloader, min_acc) - else: - return _eval_model_template_run_prediction(trained_model, dataloader, dp, min_acc=min_acc) - - -def _eval_model_template_run_prediction(trained_model, dataloader, dp=False, min_acc=0.50): +@torch.no_grad() +def run_prediction_eval_model_template(trained_model, dataloader, min_acc=0.50): # run prediction on 1 batch + trained_model.cpu() + trained_model.eval() + batch = next(iter(dataloader)) x, y = batch - x = x.view(x.size(0), -1) - - if dp: - with torch.no_grad(): - output = trained_model(batch, 0) - acc = output['val_acc'] - acc = torch.mean(acc).item() - - else: - with torch.no_grad(): - y_hat = trained_model(x) - y_hat = y_hat.cpu() + x = x.flatten(1) - # acc - labels_hat = torch.argmax(y_hat, dim=1) - - y = y.cpu() - acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - acc = torch.tensor(acc) - acc = acc.item() + y_hat = trained_model(x) + acc = accuracy(y_hat.cpu(), y.cpu(), top_k=2).item() assert acc >= min_acc, f"This model is expected to get > {min_acc} in test set (it got 
{acc})" - - -# TODO: This test compares a loss value with a min accuracy - complete non-sense! -# create BoringModels that make actual predictions! -def _boring_model_run_prediction(trained_model, dataloader, min_acc=0.25): - # run prediction on 1 batch - trained_model.cpu() - batch = next(iter(dataloader)) - - with torch.no_grad(): - output = trained_model(batch) - - acc = trained_model.loss(batch, output) - assert acc >= min_acc, f"This model is expected to get, {min_acc} in test set but got {acc}" diff --git a/tests/models/data/horovod/test_train_script.py b/tests/models/data/horovod/test_train_script.py new file mode 100644 index 0000000000000..ee77efeeb8675 --- /dev/null +++ b/tests/models/data/horovod/test_train_script.py @@ -0,0 +1,30 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tests.models.data.horovod.train_default_model import run_test_from_config + + +def test_horovod_model_script(tmpdir): + """This just for testing/debugging horovod script without horovod...""" + trainer_options = dict( + default_root_dir=str(tmpdir), + weights_save_path=str(tmpdir), + gradient_clip_val=1.0, + progress_bar_refresh_rate=0, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.2, + deterministic=True, + ) + run_test_from_config(trainer_options, check_size=False, on_gpu=False) diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index 93a637dda1071..d3868cfd979e6 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -37,7 +37,6 @@ print('You requested to import Horovod which is missing or not supported for your OS.') from tests.helpers import BoringModel # noqa: E402 -from tests.helpers.pipelines import run_prediction # noqa: E402 from tests.helpers.utils import reset_seed, set_random_master_port # noqa: E402 parser = argparse.ArgumentParser() @@ -45,7 +44,7 @@ parser.add_argument('--on-gpu', action='store_true', default=False) -def run_test_from_config(trainer_options): +def run_test_from_config(trainer_options, on_gpu, check_size=True): """Trains the default model with the given config.""" set_random_master_port() reset_seed() @@ -60,7 +59,8 @@ def run_test_from_config(trainer_options): assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" # Horovod should be initialized following training. If not, this will raise an exception. 
- assert hvd.size() == 2 + if check_size: + assert hvd.size() == 2 if trainer.global_rank > 0: return @@ -74,15 +74,16 @@ def run_test_from_config(trainer_options): test_loaders = [test_loaders] for dataloader in test_loaders: - run_prediction(pretrained_model, dataloader) + batch = next(iter(dataloader)) + pretrained_model(batch) # test HPC saving trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger) # test HPC loading checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path) - trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=args.on_gpu) + trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) - if args.on_gpu: + if on_gpu: trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1) # Test the root_gpu property assert trainer.root_gpu == hvd.local_rank() @@ -90,4 +91,4 @@ def run_test_from_config(trainer_options): if __name__ == "__main__": args = parser.parse_args() - run_test_from_config(json.loads(args.trainer_options)) + run_test_from_config(json.loads(args.trainer_options), args.on_gpu) diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py index f30f12009450e..ec13ed9112ef0 100644 --- a/tests/models/test_gpu.py +++ b/tests/models/test_gpu.py @@ -24,6 +24,8 @@ from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel +from tests.helpers.datamodules import ClassifDataModule +from tests.helpers.simple_models import ClassificationModel PRETEND_N_OF_GPUS = 16 @@ -41,8 +43,9 @@ def test_multi_gpu_none_backend(tmpdir): gpus=2, ) - model = BoringModel() - tpipes.run_model_test(trainer_options, model, min_acc=0.20) + dm = ClassifDataModule() + model = ClassificationModel() + tpipes.run_model_test(trainer_options, model, dm) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index a3f88e37bb09a..7d6c104abbd57 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -85,6 +85,28 @@ class GenericValTestLossBoringModel(GenericParentValTestLossBoringModel[int]): pass +class CustomClassificationModelDP(ClassificationModel): + + def _step(self, batch, batch_idx): + x, y = batch + logits = self(x) + return {'logits': logits, 'y': y} + + def training_step(self, batch, batch_idx): + out = self._step(batch, batch_idx) + loss = F.cross_entropy(out['logits'], out['y']) + return loss + + def validation_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def test_step(self, batch, batch_idx): + return self._step(batch, batch_idx) + + def validation_step_end(self, outputs): + self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y'])) + + def test_model_properties_resume_from_checkpoint(tmpdir): """ Test that properties like `current_epoch` and `global_step` @@ -198,28 +220,6 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir): tutils.set_random_master_port() - class CustomClassificationModelDP(ClassificationModel): - - def _step(self, batch, batch_idx): - x, y = batch - logits = self(x) - return {'logits': logits, 'y': y} - - def training_step(self, batch, batch_idx): - _, y = batch - out = self._step(batch, batch_idx) - loss = F.cross_entropy(out['logits'], y) - return loss - - def validation_step(self, batch, batch_idx): - return self._step(batch, batch_idx) - - def test_step(self, batch, batch_idx): - return self._step(batch, batch_idx) - - 
def validation_step_end(self, outputs): - self.log('val_acc', self.valid_acc(outputs['logits'], outputs['y'])) - dm = ClassifDataModule() model = CustomClassificationModelDP(lr=0.1) @@ -259,7 +259,7 @@ def validation_step_end(self, outputs): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(pretrained_model, dataloader) + tpipes.run_prediction_eval_model_template(pretrained_model, dataloader) @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -307,7 +307,7 @@ def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir): dataloaders = [dataloaders] for dataloader in dataloaders: - tpipes.run_prediction(pretrained_model, dataloader, min_acc=0.1) + tpipes.run_prediction_eval_model_template(pretrained_model, dataloader, min_acc=0.1) def test_running_test_pretrained_model_cpu(tmpdir): @@ -398,7 +398,8 @@ def test_load_model_from_checkpoint(tmpdir, model_template): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_dp_resume(tmpdir): """Make sure DP continues training correctly.""" - model = BoringModel() + model = CustomClassificationModelDP(lr=0.1) + dm = ClassifDataModule() trainer_options = dict(max_epochs=1, gpus=2, accelerator='dp', default_root_dir=tmpdir) @@ -416,7 +417,7 @@ def test_dp_resume(tmpdir): # fit model trainer = Trainer(**trainer_options) trainer.is_slurm_managing_tasks = True - trainer.fit(model) + trainer.fit(model, datamodule=dm) # track epoch before saving. Increment since we finished the current epoch, don't want to rerun real_global_epoch = trainer.current_epoch + 1 @@ -439,7 +440,7 @@ def test_dp_resume(tmpdir): trainer_options['max_epochs'] = 1 new_trainer = Trainer(**trainer_options) - class CustomModel(BoringModel): + class CustomModel(CustomClassificationModelDP): def __init__(self): super().__init__() @@ -451,19 +452,17 @@ def on_train_start(self): # if model and state loaded correctly, predictions will be good even though we # haven't trained with the new loaded model - dp_model = new_trainer.model - dp_model.eval() new_trainer._running_stage = RunningStage.EVALUATING dataloader = self.train_dataloader() - tpipes.run_prediction(self.trainer.lightning_module, dataloader) + tpipes.run_prediction_eval_model_template(self.trainer.lightning_module, dataloader=dataloader) self.on_train_start_called = True # new model model = CustomModel() # fit new model which should load hpc weights - new_trainer.fit(model) + new_trainer.fit(model, datamodule=dm) assert model.on_train_start_called # test freeze on gpu diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index bfa8f2432e3a2..6a4605b3e2b36 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -223,12 +223,19 @@ def test_tpu_grad_norm(tmpdir): @pl_multi_process_test def test_dataloaders_passed_to_fit(tmpdir): """Test if dataloaders passed to trainer works on TPU""" - tutils.reset_seed() model = BoringModel() - trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, tpu_cores=8) - trainer.fit(model, train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader()) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + tpu_cores=8, + ) + trainer.fit( + model, + train_dataloader=model.train_dataloader(), + val_dataloaders=model.val_dataloader(), + ) assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}" diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 
bca8e5dcc531b..fe07e41d20b4c 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -130,7 +130,7 @@ def test_multiple_val_dataloader(tmpdir): # make sure predictions are good for each val set for dataloader in trainer.val_dataloaders: - tpipes.run_prediction(trained_model=model, dataloader=dataloader) + tpipes.run_prediction_eval_model_template(trained_model=model, dataloader=dataloader) @pytest.mark.parametrize('ckpt_path', [None, 'best', 'specific']) @@ -153,8 +153,8 @@ def test_step(self, batch, batch_idx, *args, **kwargs): trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, - limit_val_batches=0.1, - limit_train_batches=0.2, + limit_val_batches=10, + limit_train_batches=100, ) trainer.fit(model) if ckpt_path == 'specific': @@ -162,12 +162,11 @@ def test_step(self, batch, batch_idx, *args, **kwargs): trainer.test(ckpt_path=ckpt_path) # verify there are 2 test loaders - assert len(trainer.test_dataloaders) == 2, \ - 'Multiple test_dataloaders not initiated properly' + assert len(trainer.test_dataloaders) == 2, 'Multiple test_dataloaders not initiated properly' # make sure predictions are good for each test set for dataloader in trainer.test_dataloaders: - tpipes.run_prediction(trainer.model, dataloader) + tpipes.run_prediction_eval_model_template(trainer.model, dataloader) # run the test method trainer.test(ckpt_path=ckpt_path) diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index 750b989a7d513..e85c43361976d 100644 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -229,8 +229,8 @@ def test_accumulation_and_early_stopping(tmpdir): def test_suggestion_parameters_work(tmpdir): """ Test that default skipping does not alter results in basic case """ - hparams = EvalModelTemplate.get_default_hparams() - model = EvalModelTemplate(**hparams) + dm = ClassifDataModule() + model = ClassificationModel() # logger file to get meta trainer = Trainer( @@ -238,12 +238,11 @@ def test_suggestion_parameters_work(tmpdir): max_epochs=3, ) - lrfinder = trainer.tuner.lr_find(model) + lrfinder = trainer.tuner.lr_find(model, datamodule=dm) lr1 = lrfinder.suggestion(skip_begin=10) # default - lr2 = lrfinder.suggestion(skip_begin=80) # way too high, should have an impact + lr2 = lrfinder.suggestion(skip_begin=150) # way too high, should have an impact - assert lr1 != lr2, \ - 'Skipping parameter did not influence learning rate' + assert lr1 != lr2, 'Skipping parameter did not influence learning rate' def test_suggestion_with_non_finite_values(tmpdir): diff --git a/tests/utilities/test_parsing.py b/tests/utilities/test_parsing.py index 42edb8e48f336..f6f802615f003 100644 --- a/tests/utilities/test_parsing.py +++ b/tests/utilities/test_parsing.py @@ -113,8 +113,8 @@ def test_lightning_getattr(tmpdir): for m in models: with pytest.raises( - AttributeError, - match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule." + AttributeError, + match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule." ): lightning_getattr(m, "this_attr_not_exist") @@ -140,7 +140,7 @@ def test_lightning_setattr(tmpdir): for m in models: with pytest.raises( - AttributeError, - match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule." + AttributeError, + match="is neither stored in the model namespace nor the `hparams` namespace/dict, nor the datamodule." ): lightning_setattr(m, "this_attr_not_exist", None)
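
A minimal usage sketch, not part of the patch above: it illustrates how the reworked helpers this diff introduces (ClassifDataModule, ClassificationModel, and run_prediction_eval_model_template from tests/helpers/) are meant to fit together in a test. The test name, the Trainer arguments, the min_acc value, and the assumption that the datamodule exposes a usable test dataloader after fit() are illustrative assumptions, not taken from the diff.

from pytorch_lightning import Trainer
from tests.helpers.datamodules import ClassifDataModule
from tests.helpers.pipelines import run_prediction_eval_model_template
from tests.helpers.simple_models import ClassificationModel


def test_classification_smoke(tmpdir):  # hypothetical test, for illustration only
    dm = ClassifDataModule()
    model = ClassificationModel()

    # trainer settings are assumptions, chosen to mirror the small budgets used elsewhere in the diff
    trainer = Trainer(
        default_root_dir=tmpdir,
        max_epochs=1,
        limit_train_batches=10,
        limit_val_batches=10,
    )
    trainer.fit(model, datamodule=dm)

    # the helper (defined in tests/helpers/pipelines.py in this diff) moves the model to CPU,
    # flattens one batch, and asserts top-2 accuracy via pytorch_lightning.metrics.functional.accuracy;
    # this call assumes the datamodule provides a test dataloader once fit() has run
    run_prediction_eval_model_template(model, dm.test_dataloader(), min_acc=0.5)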