diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 86a5a4a933268..d91f656efb492 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -31,6 +31,7 @@ jobs: python ./requirements/adjust_versions.py requirements/extra.txt python ./requirements/adjust_versions.py requirements/examples.txt pip install --requirement requirements/devel.txt --find-links https://download.pytorch.org/whl/nightly/torch_nightly.html + pip install pytest-random-order pip list - name: Pull checkpoints from S3 @@ -44,7 +45,7 @@ jobs: - name: Tests run: | # NOTE: run coverage on tests does not propagate failure status for Win, https://github.com/nedbat/coveragepy/issues/1003 - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml + coverage run --source pytorch_lightning -m pytest --random-order-seed=1 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml shell: bash -l {0} - name: Upload pytest results diff --git a/CHANGELOG.md b/CHANGELOG.md index 409c7bb341f17..1eb73dd3950a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,6 +100,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Marked several methods in `PredictionLoop` as protected: `on_predict_start`, `on_predict_epoch_end`, `on_predict_end`, `on_predict_model_eval` ([#9516](https://github.com/PyTorchLightning/pytorch-lightning/pull/9516)) * Marked several methods in `EvaluationLoop` as protected: `get_max_batches`, `on_evaluation_model_eval`, `on_evaluation_model_train`, `on_evaluation_start`, `on_evaluation_epoch_start`, `on_evaluation_epoch_end`, `on_evaluation_end`, `reload_evaluation_dataloaders` ([#9516](https://github.com/PyTorchLightning/pytorch-lightning/pull/9516)) * Marked several methods in `EvaluationEpochLoop` as protected: `on_evaluation_batch_start`, `evaluation_step`, `evaluation_step_end` ([#9516](https://github.com/PyTorchLightning/pytorch-lightning/pull/9516)) + * Added `yielding_training_step` example ([#9983](https://github.com/PyTorchLightning/pytorch-lightning/pull/9983)) - Added support for saving and loading state of multiple callbacks of the same type ([#7187](https://github.com/PyTorchLightning/pytorch-lightning/pull/7187)) @@ -213,10 +214,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
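(Editorial aside on the CI change at the top of this diff: the conda workflow now installs ``pytest-random-order`` and pins the shuffle seed with ``--random-order-seed=1``. A hedged sketch of reproducing that ordering locally, assuming the plugin is installed; the snippet itself is not part of the PR.)

```python
# Run the test suite with the same deterministic random ordering as CI.
import pytest

# Equivalent to the CI invocation: pytest --random-order-seed=1 tests -v
exit_code = pytest.main(["--random-order-seed=1", "tests", "-v"])
print(f"pytest exited with code {exit_code}")
```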
- LightningLite: * Added `PrecisionPlugin.forward_context`, making it the default implementation for all `{train,val,test,predict}_step_context()` methods ([#9988](https://github.com/PyTorchLightning/pytorch-lightning/pull/9988)) * Added `DDPSpawnPlugin.spawn()` for spawning new processes of a given function ([#10018](https://github.com/PyTorchLightning/pytorch-lightning/pull/10018), [#10022](https://github.com/PyTorchLightning/pytorch-lightning/pull/10022)) - * Added `TrainingTypePlugin.{_setup_model, _setup_optimizer}` methods ([#9994](https://github.com/PyTorchLightning/pytorch-lightning/pull/9994)) + * Added `TrainingTypePlugin.{_setup_model, _setup_optimizer}` methods ([#9994](https://github.com/PyTorchLightning/pytorch-lightning/pull/9994), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064)) * Implemented `DataParallelPlugin._setup_model` ([#10010](https://github.com/PyTorchLightning/pytorch-lightning/pull/10010)) - * Implemented `DeepSpeedPlugin._setup_models_and_optimizers` ([#10009](https://github.com/PyTorchLightning/pytorch-lightning/pull/10009)) - * Implemented `{DDPShardedPlugin,DDPShardedSpawnPlugin}._setup_models_and_optimizers` ([#10028](https://github.com/PyTorchLightning/pytorch-lightning/pull/10028)) + * Implemented `DeepSpeedPlugin._setup_model_and_optimizers` ([#10009](https://github.com/PyTorchLightning/pytorch-lightning/pull/10009), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064)) + * Implemented `{DDPShardedPlugin,DDPShardedSpawnPlugin}._setup_model_and_optimizers` ([#10028](https://github.com/PyTorchLightning/pytorch-lightning/pull/10028), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064)) * Added optional `model` argument to the `optimizer_step` methods in accelerators and plugins ([#10023](https://github.com/PyTorchLightning/pytorch-lightning/pull/10023)) @@ -327,13 +328,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - `pytorch_lightning.utilities.grads.grad_norm` now raises an exception if parameter `norm_type <= 0` ([#9765](https://github.com/PyTorchLightning/pytorch-lightning/pull/9765)) - - Updated error message for interactive incompatible plugins ([#9896](https://github.com/PyTorchLightning/pytorch-lightning/pull/9896)) - Updated several places in the loops and trainer to access `training_type_plugin` directly instead of `accelerator` ([#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901)) +- Disable quantization aware training observers by default during validating/testing/predicting stages ([#8540](https://github.com/PyTorchLightning/pytorch-lightning/pull/8540)) + ### Deprecated @@ -617,7 +619,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `LearningRateMonitor` logging with multiple param groups optimizer with no scheduler ([#10044](https://github.com/PyTorchLightning/pytorch-lightning/pull/10044)) - - Fixed undesired side effects being caused by `Trainer` patching dataloader methods on the `LightningModule` ([#9764](https://github.com/PyTorchLightning/pytorch-lightning/pull/9764)) diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst index ef04d27136eef..ba89a06e15d0a 100644 --- a/docs/source/common/trainer.rst +++ b/docs/source/common/trainer.rst @@ -516,7 +516,9 @@ Example:: checkpoint_callback ^^^^^^^^^^^^^^^^^^^ -Deprecated: This has been deprecated in v1.5 and will be removed in v1.7. Please use ``enable_checkpointing`` instead. +.. 
warning:: `checkpoint_callback` has been deprecated in v1.5 and will be removed in v1.7. + To disable checkpointing, pass ``enable_checkpointing = False`` to the Trainer instead. + default_root_dir ^^^^^^^^^^^^^^^^ diff --git a/docs/source/extensions/callbacks.rst b/docs/source/extensions/callbacks.rst index f8088d1d4153e..a8e7010cfb37e 100644 --- a/docs/source/extensions/callbacks.rst +++ b/docs/source/extensions/callbacks.rst @@ -72,10 +72,10 @@ Examples -------- You can do pretty much anything with callbacks. -- `Add a MLP to fine-tune self-supervised networks `_. -- `Find how to modify an image input to trick the classification result `_. -- `Interpolate the latent space of any variational model `_. -- `Log images to Tensorboard for any model `_. +- `Add a MLP to fine-tune self-supervised networks `_. +- `Find how to modify an image input to trick the classification result `_. +- `Interpolate the latent space of any variational model `_. +- `Log images to Tensorboard for any model `_. -------------- diff --git a/pl_examples/loop_examples/yielding_training_step.py b/pl_examples/loop_examples/yielding_training_step.py new file mode 100644 index 0000000000000..4d870f002e247 --- /dev/null +++ b/pl_examples/loop_examples/yielding_training_step.py @@ -0,0 +1,168 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +from functools import partial +from typing import Generator + +import torch + +from pl_examples.domain_templates.generative_adversarial_net import GAN as GANTemplate +from pl_examples.domain_templates.generative_adversarial_net import MNISTDataModule +from pytorch_lightning import Trainer +from pytorch_lightning.loops import OptimizerLoop +from pytorch_lightning.loops.optimization.optimizer_loop import ClosureResult +from pytorch_lightning.loops.utilities import _build_training_step_kwargs +from pytorch_lightning.utilities.exceptions import MisconfigurationException + +############################################################################################# +# Yield Loop # +# # +# This example shows an implementation of a custom loop that changes how the # +# `LightningModule.training_step` behaves. In particular, this custom "Yield" loop will # +# enable the `training_step` to yield like a Python generator, retaining the values # +# of local variables for subsequent calls. This can result in much cleaner and elegant # +# code when dealing with multiple optimizers (automatic optimization). # +# # +# Learn more about the loop structure from the documentation: # +# https://pytorch-lightning.readthedocs.io/en/latest/extensions/loops.html # +############################################################################################# + + +############################################################################################# +# Step 1 / 3: Implement a custom OptimizerLoop # +# # +# The `training_step` gets called in the # +# `pytorch_lightning.loops.optimization.OptimizerLoop`. 
To make it into a Python generator, #
+# we need to override the place where it gets called.                                      #
+#############################################################################################
+
+
+class YieldLoop(OptimizerLoop):
+    def __init__(self):
+        super().__init__()
+        self._generator = None
+
+    def connect(self, **kwargs):
+        raise NotImplementedError(f"{self.__class__.__name__} does not connect any child loops.")
+
+    def on_run_start(self, batch, optimizers, batch_idx):
+        super().on_run_start(batch, optimizers, batch_idx)
+        if not inspect.isgeneratorfunction(self.trainer.lightning_module.training_step):
+            raise MisconfigurationException("The LightningModule does not yield anything in the `training_step`.")
+        assert self.trainer.lightning_module.automatic_optimization
+
+        # We request the generator once and save it for later
+        # so we can call next() on it.
+        self._generator = self._get_generator(batch, batch_idx, opt_idx=0)
+
+    def _make_step_fn(self, split_batch, batch_idx, opt_idx):
+        return partial(self._training_step, self._generator)
+
+    def _get_generator(self, split_batch, batch_idx, opt_idx):
+        step_kwargs = _build_training_step_kwargs(
+            self.trainer.lightning_module, self.trainer.optimizers, split_batch, batch_idx, opt_idx, hiddens=None
+        )
+
+        # Here we are basically calling `lightning_module.training_step()`
+        # and this returns a generator! The `training_step` is handled by the
+        # accelerator to enable distributed training.
+        return self.trainer.accelerator.training_step(step_kwargs)
+
+    def _training_step(self, generator):
+        # required for logging
+        self.trainer.lightning_module._current_fx_name = "training_step"
+
+        # Here, instead of calling `lightning_module.training_step()`
+        # we call next() on the generator!
+        training_step_output = next(generator)
+        self.trainer.accelerator.post_training_step()
+
+        training_step_output = self.trainer.call_hook("training_step_end", training_step_output)
+
+        # The closure result takes care of properly detaching the loss for logging and performs
+        # some additional checks that the output format is correct.
+        result = ClosureResult.from_training_step_output(training_step_output, self.trainer.accumulate_grad_batches)
+        return result
+
+
+#############################################################################################
+#               Step 2 / 3: Implement a model using the new yield mechanism                 #
+#                                                                                           #
+# We can now implement a model that defines the `training_step` using "yield" statements.   #
+# We choose a generative adversarial network (GAN) because it alternates between two        #
+# optimizers updating the model parameters. In the first step we compute the loss of the    #
+# first network (coincidentally also named "generator") and yield the loss. In the second   #
+# step we compute the loss of the second network (the "discriminator") and yield again.     #
+# The nice property of this yield approach is that we can reuse variables that we computed  #
+# earlier. If this were a regular Lightning `training_step`, we would have to recompute the #
+# output of the first network.                                                              #
+#############################################################################################
+
+
+class GAN(GANTemplate):
+
+    # This training_step method is now a Python generator
+    def training_step(self, batch, batch_idx, optimizer_idx=0) -> Generator:
+        imgs, _ = batch
+        z = torch.randn(imgs.shape[0], self.hparams.latent_dim)
+        z = z.type_as(imgs)
+
+        # Here, we compute the generator output once and reuse it later.
+        # It gets saved when we yield from the training_step.
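(Editorial aside before the rest of this `training_step`: the `YieldLoop` above leans entirely on the standard Python generator protocol. Calling the generator function returns a generator object, and each `next()` call resumes it where the last `yield` paused, with local variables intact. A minimal, framework-free sketch of that mechanism; illustrative only, not part of the example file.)

```python
# Plain-Python illustration of the mechanism YieldLoop relies on: locals survive across yields.
def two_phase_step():
    shared = 40                 # computed once, before the first yield
    yield shared + 1            # "first optimizer" result; execution pauses here
    yield shared + 2            # "second optimizer" result; `shared` is still alive


gen = two_phase_step()          # nothing runs yet, we only obtain a generator object
first = next(gen)               # runs up to the first yield -> 41
second = next(gen)              # resumes after the first yield -> 42
assert (first, second) == (41, 42)
```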
+        # The output then gets reused in the discriminator update.
+        generator_output = self(z)
+
+        # train generator
+        real_labels = torch.ones(imgs.size(0), 1)
+        real_labels = real_labels.type_as(imgs)
+        g_loss = self.adversarial_loss(self.discriminator(generator_output), real_labels)
+        self.log("g_loss", g_loss)
+
+        # Yield instead of return: This makes the training_step a Python generator.
+        # Once we call it again, it resumes execution at the block below.
+        yield g_loss
+
+        # train discriminator
+        real_labels = torch.ones(imgs.size(0), 1)
+        real_labels = real_labels.type_as(imgs)
+        real_loss = self.adversarial_loss(self.discriminator(imgs), real_labels)
+        fake_labels = torch.zeros(imgs.size(0), 1)
+        fake_labels = fake_labels.type_as(imgs)
+
+        # Here we reuse the generator_output computed above
+        fake_loss = self.adversarial_loss(self.discriminator(generator_output.detach()), fake_labels)
+        d_loss = (real_loss + fake_loss) / 2
+        self.log("d_loss", d_loss)
+
+        yield d_loss
+
+
+#############################################################################################
+#                       Step 3 / 3: Connect the loop to the Trainer                         #
+#                                                                                           #
+# Finally, attach the loop to the `Trainer`. Here, we modify the `AutomaticOptimization`    #
+# loop, which is a subloop of the `TrainingBatchLoop`. We use `.connect()` to attach it.    #
+#############################################################################################
+
+if __name__ == "__main__":
+    model = GAN()
+    dm = MNISTDataModule()
+    trainer = Trainer()
+
+    # Connect the new loop:
+    # YieldLoop now replaces the previous optimizer loop.
+    trainer.fit_loop.epoch_loop.batch_loop.connect(optimizer_loop=YieldLoop())
+
+    # fit() will now use the new loop!
+    trainer.fit(model, dm)
diff --git a/pytorch_lightning/callbacks/quantization.py b/pytorch_lightning/callbacks/quantization.py
index 564e24071602a..bf0088575e8b4 100644
--- a/pytorch_lightning/callbacks/quantization.py
+++ b/pytorch_lightning/callbacks/quantization.py
@@ -16,10 +16,20 @@
 ^^^^^^^^^^^^
 """
+import copy
 import functools
-from typing import Any, Callable, Optional, Sequence, Union
+from typing import Any, Callable, Dict, Optional, Sequence, Union
 
 import torch
+from torch import Tensor
+
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8
+
+if _TORCH_GREATER_EQUAL_1_8:
+    from torch.quantization import FakeQuantizeBase
+else:
+    # For torch 1.6 and 1.7.
+    from torch.quantization import FakeQuantize as FakeQuantizeBase
 
 import pytorch_lightning as pl
 from pytorch_lightning.callbacks.base import Callback
@@ -126,11 +136,25 @@ def custom_trigger_last(trainer):
         quantize_on_fit_end: perform the quantization in `on_fit_end`.
             Note that once converted, the model cannot be put in training mode again.
 
+        observer_enabled_stages: allow fake-quantization modules' observers to do calibration during the provided stages:
+
+            - ``'train'``: the observers can do calibration during training.
+            - ``'validate'``: the observers can do calibration during validation.
+              Note that we don't disable observers during the sanity check as the model hasn't been calibrated with
+              training data yet. After the sanity check, the fake-quantization modules are restored to their initial states.
+            - ``'test'``: the observers can do calibration during testing.
+            - ``'predict'``: the observers can do calibration during prediction.
+
+            Note that we only handle observers belonging to fake-quantization modules.
When ``qconfig`` is a ``str`` and + ``observer_type`` is ``'histogram'``, the observers won't belong to any fake-quantization modules and will + not be controlled by the callback. + .. _PyTorch Quantization: https://pytorch.org/docs/stable/quantization.html#quantization-aware-training .. _torch.quantization.QConfig: https://pytorch.org/docs/stable/torch.quantization.html#torch.quantization.QConfig """ OBSERVER_TYPES = ("histogram", "average") + OBSERVER_STAGES = ("train", "validate", "test", "predict") def __init__( self, @@ -140,6 +164,7 @@ def __init__( modules_to_fuse: Optional[Sequence] = None, input_compatible: bool = True, quantize_on_fit_end: bool = True, + observer_enabled_stages: Sequence[str] = ("train",), ) -> None: _valid_qconf_str = isinstance(qconfig, str) and qconfig in torch.backends.quantized.supported_engines if not isinstance(qconfig, QConfig) and not _valid_qconf_str: @@ -163,9 +188,20 @@ def __init__( self.modules_to_fuse = modules_to_fuse self._input_compatible = input_compatible self._convert_on_fit_end = quantize_on_fit_end + + observer_enabled_stages = set(observer_enabled_stages) + unsupported_stages = observer_enabled_stages - set(self.OBSERVER_STAGES) + if unsupported_stages: + raise MisconfigurationException( + f'Unsupported stages "{tuple(sorted(unsupported_stages))}", allowed are {self.OBSERVER_STAGES}.' + ) + self._observer_disabled_stages = set(self.OBSERVER_STAGES) - observer_enabled_stages + self._forward_calls = 0 + self._fake_quant_to_initial_state_dict = {} + self._last_fake_quant_to_observer_enabled = {} - def _check_feasible_fuse(self, model): + def _check_feasible_fuse(self, model: "pl.LightningModule") -> bool: if not self.modules_to_fuse: return False for group in self.modules_to_fuse: @@ -175,7 +211,20 @@ def _check_feasible_fuse(self, model): ) return True - def on_fit_start(self, trainer, pl_module): + def _collect_observer_enabled(self) -> Dict[FakeQuantizeBase, Tensor]: + return { + fake_quant: fake_quant.observer_enabled.clone() for fake_quant in self._fake_quant_to_initial_state_dict + } + + def _disable_observer(self, pl_module: "pl.LightningModule") -> None: + self._last_fake_quant_to_observer_enabled = self._collect_observer_enabled() + pl_module.apply(torch.quantization.disable_observer) + + def _restore_last_observer_enabled(self) -> None: + for fake_quant, observer_enabled in self._last_fake_quant_to_observer_enabled.items(): + fake_quant.observer_enabled.copy_(observer_enabled) + + def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: # QuantStub converts tensors from floating point to quantized pl_module.quant = torch.quantization.QuantStub() # DeQuantStub converts tensors from quantized to floating point @@ -209,7 +258,12 @@ def on_fit_start(self, trainer, pl_module): # the model that will observe weight and activation tensors during calibration. 
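(Editorial aside on the ``observer_enabled_stages`` option documented above: a hedged usage sketch, mirroring how the updated test later in this diff configures the callback. ``model`` and ``dm`` are placeholders, not names from the PR.)

```python
# Keep observers calibrating during both training and validation; in the other stages
# the callback disables the fake-quantization observers.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import QuantizationAwareTraining

qcb = QuantizationAwareTraining(observer_enabled_stages=("train", "validate"))
trainer = Trainer(callbacks=[qcb], max_epochs=1)
# trainer.fit(model, datamodule=dm)  # placeholders; supply your own LightningModule / datamodule
```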
torch.quantization.prepare_qat(pl_module, inplace=True) - def on_fit_end(self, trainer, pl_module): + fake_quants = tuple(module for module in pl_module.modules() if isinstance(module, FakeQuantizeBase)) + self._fake_quant_to_initial_state_dict = { + fake_quant: copy.deepcopy(fake_quant.state_dict()) for fake_quant in fake_quants + } + + def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if not self._convert_on_fit_end: pl_module.forward = self.__module_forward return @@ -224,3 +278,43 @@ def on_fit_end(self, trainer, pl_module): pl_module.forward = wrap_quantize_forward_context(model=pl_module, func=self.__module_forward) else: pl_module.forward = self.__module_forward + + def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "train" in self._observer_disabled_stages: + self._disable_observer(pl_module) + + def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "train" in self._observer_disabled_stages: + self._restore_last_observer_enabled() + + def on_validation_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "validate" in self._observer_disabled_stages and not trainer.sanity_checking: + # ``torch.quantization.MovingAveragePerChannelMinMaxObserver`` and ``torch.quantization.HistogramObserver`` + # need to see at least one batch to infer the shapes of quantization ``scale`` and ``zero_point``. So we + # don't disable observers during the sanity check so that they can infer the shapes of quantization + # parameters with validation data. + self._disable_observer(pl_module) + + def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "validate" in self._observer_disabled_stages: + if trainer.sanity_checking: + for fake_quant, state_dict in self._fake_quant_to_initial_state_dict.items(): + fake_quant.load_state_dict(state_dict) + else: + self._restore_last_observer_enabled() + + def on_test_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "test" in self._observer_disabled_stages: + self._disable_observer(pl_module) + + def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "test" in self._observer_disabled_stages: + self._restore_last_observer_enabled() + + def on_predict_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "predict" in self._observer_disabled_stages: + self._disable_observer(pl_module) + + def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: + if "predict" in self._observer_disabled_stages: + self._restore_last_observer_enabled() diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 119e7f6c5472c..c2a1a5e786833 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -101,7 +101,7 @@ def advance(self, *args: Any, **kwargs: Any) -> None: dataloader_idx: int = self.current_dataloader_idx dataloader = self.trainer.training_type_plugin.process_dataloader(self.current_dataloader) - dataloader = self.trainer.data_connector.get_profiled_dataloader(dataloader, dataloader_idx=dataloader_idx) + dataloader = self.trainer._data_connector.get_profiled_dataloader(dataloader, dataloader_idx=dataloader_idx) dl_max_batches = self._max_batches[dataloader_idx] dl_outputs = self.epoch_loop.run(dataloader, dataloader_idx, 
dl_max_batches, self.num_dataloaders) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 14ed3c1bac562..b4660c96a0989 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -107,7 +107,7 @@ def advance( if batch is None: raise StopIteration - if not self.trainer.data_connector.evaluation_data_fetcher.store_on_device: + if not self.trainer._data_connector.evaluation_data_fetcher.store_on_device: with self.trainer.profiler.profile("evaluation_batch_to_device"): batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index fbe0ff2e9e2cd..1fe70d9d4e77c 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -147,7 +147,7 @@ def advance(self, *args: Any, **kwargs: Any) -> None: batch_idx, (batch, self.batch_progress.is_last_batch) = next(self._dataloader_iter) - if not self.trainer.data_connector.train_data_fetcher.store_on_device: + if not self.trainer._data_connector.train_data_fetcher.store_on_device: with self.trainer.profiler.profile("training_batch_to_device"): batch = self.trainer.accelerator.batch_to_device(batch) diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 415df45ca4ee5..b7004f9436a0f 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -212,7 +212,7 @@ def on_advance_start(self) -> None: def advance(self) -> None: """Runs one whole epoch.""" dataloader = self.trainer.training_type_plugin.process_dataloader(self.trainer.train_dataloader) - data_fetcher = self.trainer.data_connector.get_profiled_dataloader(dataloader) + data_fetcher = self.trainer._data_connector.get_profiled_dataloader(dataloader) with self.trainer.profiler.profile("run_training_epoch"): self.epoch_loop.run(data_fetcher) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 49e2f7e8a60df..1ff22cc07ecb0 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -379,30 +379,28 @@ def pre_dispatch(self): self.init_deepspeed() self.barrier() - def _setup_models_and_optimizers( - self, models: List[Module], optimizers: List[Optimizer] - ) -> Tuple[List[Module], List[Optimizer]]: - """Setup multiple models and multiple optimizers together. + def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: + """Setup a model and multiple optimizers together. - Currently only one model paired with a single optimizer is supported. + Currently only a single optimizer is supported. Return: - A list with one model wrapped into a :class:`deepspeed.DeepSpeedEngine` and list with a single + The model wrapped into a :class:`deepspeed.DeepSpeedEngine` and a list with a single deepspeed optimizer. """ - if not (len(models) == len(optimizers) == 1): + if len(optimizers) != 1: raise ValueError( - f"Currently only one model and one optimizer is supported with DeepSpeed." - f" Got {len(models)} models and {len(optimizers)} optimizers instead." + f"Currently only one optimizer is supported with DeepSpeed." + f" Got {len(optimizers)} optimizers instead." 
) # train_micro_batch_size_per_gpu is used for throughput logging purposes # normally we set this to the batch size, but it is not available here unless the user provides it # as part of the config self.config.setdefault("train_micro_batch_size_per_gpu", 1) - self._model, optimizer = self._setup_model_and_optimizer(models[0], optimizers[0]) + self._model, optimizer = self._setup_model_and_optimizer(model, optimizers[0]) self._set_deepspeed_activation_checkpointing() - return [self._model], [optimizer] + return self._model, [optimizer] def _setup_model_and_optimizer( self, model: Module, optimizer: Optimizer, lr_scheduler: Optional[_LRScheduler] = None @@ -623,7 +621,7 @@ def _auto_select_batch_size(self): # train_micro_batch_size_per_gpu is used for throughput logging purposes # by default we try to use the batch size of the loader batch_size = 1 - train_dl_source = self.lightning_module.trainer.data_connector._train_dataloader_source + train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): train_dataloader = train_dl_source.dataloader() if hasattr(train_dataloader, "batch_sampler"): diff --git a/pytorch_lightning/plugins/training_type/sharded.py b/pytorch_lightning/plugins/training_type/sharded.py index 9712b5356091f..6e278d44e5cb8 100644 --- a/pytorch_lightning/plugins/training_type/sharded.py +++ b/pytorch_lightning/plugins/training_type/sharded.py @@ -47,33 +47,23 @@ def configure_ddp(self) -> None: # For multi-node training, enabling bucketing will improve performance. self._ddp_kwargs["reduce_buffer_size"] = self._REDUCE_BUFFER_SIZE_DEFAULT if self.num_nodes > 1 else 0 - [self._model], optimizers = self._setup_models_and_optimizers( - models=[LightningShardedDataParallel(self.model)], + self._model, optimizers = self._setup_model_and_optimizers( + model=LightningShardedDataParallel(self.model), optimizers=trainer.optimizers, ) trainer.optimizers = optimizers trainer.convert_to_lightning_optimizers() - def _setup_models_and_optimizers( - self, models: List[Module], optimizers: List[Optimizer] - ) -> Tuple[List[Module], List[Optimizer]]: + def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: """Wraps the model and optimizers with fairscale components. - Currently only one model can be setup at once. - Return: - A list with one model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module + The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. """ - if len(models) > 1: - raise ValueError( - "DDPSharded only supports setting up a single model with one or several optimizers." - f" Got {len(models)} models." 
- ) - optimizers = self._wrap_optimizers(optimizers) - model = ShardedDataParallel(models[0], sharded_optimizer=optimizers, **self._ddp_kwargs) - return [model], optimizers + model = ShardedDataParallel(model, sharded_optimizer=optimizers, **self._ddp_kwargs) + return model, optimizers def _reinit_optimizers_with_oss(self, optimizers: List[Union[Optimizer, LightningOptimizer]]) -> List["OSS"]: for x, optimizer in enumerate(optimizers): diff --git a/pytorch_lightning/plugins/training_type/sharded_spawn.py b/pytorch_lightning/plugins/training_type/sharded_spawn.py index 9503ffb951abb..13615ce05e2fb 100644 --- a/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ b/pytorch_lightning/plugins/training_type/sharded_spawn.py @@ -39,32 +39,22 @@ class DDPSpawnShardedPlugin(DDPSpawnPlugin): def configure_ddp(self) -> None: trainer = self.lightning_module.trainer - [self._model], optimizers = self._setup_models_and_optimizers( - models=[LightningShardedDataParallel(self.model)], + self._model, optimizers = self._setup_model_and_optimizers( + model=LightningShardedDataParallel(self.model), optimizers=trainer.optimizers, ) trainer.optimizers = optimizers - def _setup_models_and_optimizers( - self, models: List[Module], optimizers: List[Optimizer] - ) -> Tuple[List[Module], List[Optimizer]]: + def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: """Wraps the model and optimizers with fairscale components. - Currently only one model can be setup at once. - Return: - A list with one model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module + The model wrapped into a :class:`~fairscale.nn.data_parallel.ShardedDataParallel` module and a list of optimizer wrapped in :class:~`fairscale.optim.OSS`. """ - if len(models) > 1: - raise ValueError( - f"DDPShardedSpawn only supports setting up a single model with one or several optimizers." - f" Got {len(models)} models." 
- ) - optimizers = self._wrap_optimizers(optimizers) - model = ShardedDataParallel(models[0], sharded_optimizer=optimizers, **self._ddp_kwargs) - return [model], optimizers + model = ShardedDataParallel(model, sharded_optimizer=optimizers, **self._ddp_kwargs) + return model, optimizers def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS"]: for x, optimizer in enumerate(optimizers): diff --git a/pytorch_lightning/plugins/training_type/tpu_spawn.py b/pytorch_lightning/plugins/training_type/tpu_spawn.py index f8968a69ceed1..94edfe5354fc3 100644 --- a/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ b/pytorch_lightning/plugins/training_type/tpu_spawn.py @@ -20,6 +20,7 @@ import torch import torch.multiprocessing as mp +from torch.nn import Module from torch.utils.data import DataLoader import pytorch_lightning as pl @@ -95,7 +96,7 @@ def _validate_dataloader(dataloaders: Union[List[DataLoader], DataLoader]) -> No @staticmethod def _validate_patched_dataloaders(model: "pl.LightningModule") -> None: """Validate and fail fast if the dataloaders were passed directly to fit.""" - connector: DataConnector = model.trainer.data_connector + connector: DataConnector = model.trainer._data_connector sources = ( connector._train_dataloader_source, connector._val_dataloader_source, @@ -118,6 +119,9 @@ def pre_dispatch(self): def setup(self) -> None: self.create_mp_queue() + def _setup_model(self, model: Module) -> Module: + return model + def create_mp_queue(self): self.start_method = "fork" smp = mp.get_context(self.start_method) diff --git a/pytorch_lightning/plugins/training_type/training_type_plugin.py b/pytorch_lightning/plugins/training_type/training_type_plugin.py index 95c74d4a87b70..e1cfdda2d68d8 100644 --- a/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ b/pytorch_lightning/plugins/training_type/training_type_plugin.py @@ -61,18 +61,16 @@ def setup_environment(self) -> None: def setup(self) -> None: """Called by the accelerator to finish setup.""" - def _setup_models_and_optimizers( - self, models: List[Module], optimizers: List[Optimizer] - ) -> Tuple[List[Module], List[Optimizer]]: - """Setup multiple models and multiple optimizers together. + def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: + """Setup a model and multiple optimizers together. The returned objects are expected to be in the same order they were passed in. The default implementation will - call :meth:`_setup_model` and :meth:`_setup_optimizer` on the input lists. + call :meth:`_setup_model` and :meth:`_setup_optimizer` on the inputs. """ # TODO (@awaelchli): standardize this across all plugins in Lightning and Lite. Related refactor: #7324 - models = [self._setup_model(model) for model in models] + model = self._setup_model(model) optimizers = [self._setup_optimizer(optimizer) for optimizer in optimizers] - return models, optimizers + return model, optimizers def _setup_model(self, model: Module) -> Module: """Performs setup for the model, e.g., by wrapping it by another class.""" diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py index 45205cf36a899..58cee0c1d8af2 100644 --- a/pytorch_lightning/profiler/__init__.py +++ b/pytorch_lightning/profiler/__init__.py @@ -146,7 +146,11 @@ def custom_processing_step(self, data): The output below shows the profiling for the action ``training_step_and_backward``. 
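(Editorial aside on the profiler documentation in this hunk: the note just below recommends the PyTorch profiler for locating per-op bottlenecks and the simple profiler for representative end-to-end timings. A hedged sketch of selecting between them with the standard ``profiler=`` Trainer flag; not part of this PR.)

```python
# Choosing a profiler per the guidance in the note below: detailed but synchronized op timings
# versus representative end-to-end wall clock durations.
from pytorch_lightning import Trainer

bottleneck_trainer = Trainer(profiler="pytorch", max_epochs=1)  # find bottlenecks/breakdowns
wallclock_trainer = Trainer(profiler="simple", max_epochs=1)    # end-to-end wall clock time
```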
The user can provide ``PyTorchProfiler(record_functions={...})`` to extend the scope of profiled functions.

-.. note:: When using the PyTorch Profiler, wall clock time will not not be representative of the true wall clock time. This is due to forcing profiled operations to be measured synchronously, when many CUDA ops happen asynchronously. It is recommended to use this Profiler to find bottlenecks/breakdowns, however for end to end wall clock time use the `SimpleProfiler`. # noqa: E501
+.. note::
+    When using the PyTorch Profiler, wall clock time will not be representative of the true wall clock time.
+    This is due to forcing profiled operations to be measured synchronously, when many CUDA ops happen asynchronously.
+    It is recommended to use this Profiler to find bottlenecks/breakdowns; however, for end-to-end wall clock time,
+    use the `SimpleProfiler`.

 .. code-block::

diff --git a/pytorch_lightning/trainer/configuration_validator.py b/pytorch_lightning/trainer/configuration_validator.py
index 88c319ac57431..b59e7fd56f62f 100644
--- a/pytorch_lightning/trainer/configuration_validator.py
+++ b/pytorch_lightning/trainer/configuration_validator.py
@@ -65,7 +65,7 @@ def __verify_train_loop_configuration(trainer: "pl.Trainer", model: "pl.Lightnin
     # -----------------------------------
     # verify model has a train dataloader
     # -----------------------------------
-    has_train_dataloader = trainer.data_connector._train_dataloader_source.is_defined()
+    has_train_dataloader = trainer._data_connector._train_dataloader_source.is_defined()
     if not has_train_dataloader:
         raise MisconfigurationException(
             "No `train_dataloader()` method defined. Lightning `Trainer` expects as minimum a"
@@ -176,7 +176,7 @@ def __verify_eval_loop_configuration(model: "pl.LightningModule", stage: str) ->


 def __verify_predict_loop_configuration(trainer: "pl.Trainer", model: "pl.LightningModule") -> None:
-    has_predict_dataloader = trainer.data_connector._predict_dataloader_source.is_defined()
+    has_predict_dataloader = trainer._data_connector._predict_dataloader_source.is_defined()
     if not has_predict_dataloader:
         raise MisconfigurationException("Dataloader not found for `Trainer.predict`")
     # ----------------------------------------------
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
index 1d2bc1dc680a2..8271cf9bdc742 100644
--- a/pytorch_lightning/trainer/connectors/accelerator_connector.py
+++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -176,11 +176,6 @@ def __init__(
         self._training_type_plugin_resolved = False
         self.accelerator = self.select_accelerator()

-        # init flags for SLURM+DDP to work
-        self.world_size = 1
-        self.interactive_ddp_procs = []
-        self.global_rank = 0
-
         # benchmarking
         # TODO: should this be moved to GPU accelerator?
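(Editorial aside, referring back to the ``_setup_model_and_optimizers`` hunks earlier in this diff: the plugins move from a list-of-models API to a single model plus a list of optimizers, with the base class composing ``_setup_model`` and ``_setup_optimizer``. A toy, self-contained sketch of that contract; strings stand in for real modules and optimizers, and nothing here is Lightning API beyond the method names.)

```python
from typing import List, Tuple


class ToyPlugin:
    """Mirrors the shape of the new default implementation, not the real TrainingTypePlugin."""

    def _setup_model(self, model: str) -> str:
        return f"wrapped({model})"  # e.g. DDP / sharded wrapping in the real plugins

    def _setup_optimizer(self, optimizer: str) -> str:
        return f"wrapped({optimizer})"  # e.g. fairscale OSS wrapping

    def _setup_model_and_optimizers(self, model: str, optimizers: List[str]) -> Tuple[str, List[str]]:
        # one model in, one (wrapped) model out, plus the wrapped optimizers
        return self._setup_model(model), [self._setup_optimizer(opt) for opt in optimizers]


plugin = ToyPlugin()
assert plugin._setup_model_and_optimizers("net", ["adam"]) == ("wrapped(net)", ["wrapped(adam)"])
```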
torch.backends.cudnn.benchmark = self.benchmark @@ -1021,14 +1016,6 @@ def _configure_slurm_ddp(self): # likely not on slurm, so set the slurm managed flag to false self._is_slurm_managing_tasks = False - # used for tests only, set this flag to simulate slurm managing a task - try: - should_fake = int(os.environ["FAKE_SLURM_MANAGING_TASKS"]) - if should_fake: - self._is_slurm_managing_tasks = True - except Exception: - pass - # notify user the that slurm is managing tasks if self._is_slurm_managing_tasks: rank_zero_info("Multi-processing is handled by Slurm.") diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py index 3174effdffb72..52b1efa681665 100644 --- a/pytorch_lightning/trainer/data_loading.py +++ b/pytorch_lightning/trainer/data_loading.py @@ -343,7 +343,7 @@ def reset_train_dataloader(self, model: Optional["pl.LightningModule"] = None) - apply_to_collection(self.train_dataloader, DataLoader, self._add_sampler_metadata_collate) # wrap the sequence of train loaders to a CombinedLoader object for computing the num_training_batches - self.train_dataloader = CombinedLoader(self.train_dataloader, self.data_connector.multiple_trainloader_mode) + self.train_dataloader = CombinedLoader(self.train_dataloader, self._data_connector.multiple_trainloader_mode) self.num_training_batches = len(self.train_dataloader) if has_len(self.train_dataloader) else float("inf") @@ -488,7 +488,7 @@ def reset_val_dataloader(self, model: Optional["pl.LightningModule"] = None) -> Args: model: The `LightningModule` if called outside of the trainer scope. """ - source = self.data_connector._val_dataloader_source + source = self._data_connector._val_dataloader_source pl_module = self.lightning_module or model has_step = is_overridden("validation_step", pl_module) if source.is_defined() and has_step: @@ -502,7 +502,7 @@ def reset_test_dataloader(self, model: Optional["pl.LightningModule"] = None) -> Args: model: The `LightningModule` if called outside of the trainer scope. """ - source = self.data_connector._test_dataloader_source + source = self._data_connector._test_dataloader_source pl_module = self.lightning_module or model has_step = is_overridden("test_step", pl_module) if source.is_defined() and has_step: @@ -516,7 +516,7 @@ def reset_predict_dataloader(self, model: Optional["pl.LightningModule"] = None) Args: model: The `LightningModule` if called outside of the trainer scope. 
""" - source = self.data_connector._predict_dataloader_source + source = self._data_connector._predict_dataloader_source pl_module = self.lightning_module or model if source.is_defined(): self.num_predict_batches, self.predict_dataloaders = self._reset_eval_dataloader( @@ -545,7 +545,7 @@ def request_dataloader( Returns: The requested dataloader """ - source = getattr(self.data_connector, f"_{stage.dataloader_prefix}_dataloader_source") + source = getattr(self._data_connector, f"_{stage.dataloader_prefix}_dataloader_source") hook = f"{stage.dataloader_prefix}_dataloader" self.call_hook("on_" + hook, pl_module=model) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bc0bfbf346c26..c525e22ac128c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -424,7 +424,7 @@ def __init__( gpu_ids, tpu_cores = self._parse_devices(gpus, auto_select_gpus, tpu_cores) # init connectors - self.data_connector = DataConnector(self, multiple_trainloader_mode) + self._data_connector = DataConnector(self, multiple_trainloader_mode) self.optimizer_connector = OptimizerConnector(self) self.accelerator_connector = AcceleratorConnector( @@ -514,7 +514,7 @@ def __init__( self.optimizer_connector.on_trainer_init() # init data flags - self.data_connector.on_trainer_init( + self._data_connector.on_trainer_init( check_val_every_n_epoch, reload_dataloaders_every_n_epochs, reload_dataloaders_every_epoch, @@ -663,7 +663,7 @@ def _fit_impl( ) # links data to the trainer - self.data_connector.attach_data( + self._data_connector.attach_data( model, train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, datamodule=datamodule ) @@ -747,7 +747,7 @@ def _validate_impl( ) # links data to the trainer - self.data_connector.attach_data(model, val_dataloaders=dataloaders, datamodule=datamodule) + self._data_connector.attach_data(model, val_dataloaders=dataloaders, datamodule=datamodule) self.validated_ckpt_path = self.__set_ckpt_path( ckpt_path, model_provided=model_provided, model_connected=self.lightning_module is not None @@ -837,7 +837,7 @@ def _test_impl( ) # links data to the trainer - self.data_connector.attach_data(model, test_dataloaders=dataloaders, datamodule=datamodule) + self._data_connector.attach_data(model, test_dataloaders=dataloaders, datamodule=datamodule) self.tested_ckpt_path = self.__set_ckpt_path( ckpt_path, model_provided=model_provided, model_connected=self.lightning_module is not None @@ -921,7 +921,7 @@ def _predict_impl( ) # links data to the trainer - self.data_connector.attach_data(model, predict_dataloaders=dataloaders, datamodule=datamodule) + self._data_connector.attach_data(model, predict_dataloaders=dataloaders, datamodule=datamodule) self.predicted_ckpt_path = self.__set_ckpt_path( ckpt_path, model_provided=model_provided, model_connected=self.lightning_module is not None @@ -985,7 +985,7 @@ def tune( ) # links data to the trainer - self.data_connector.attach_data( + self._data_connector.attach_data( model, train_dataloaders=train_dataloaders, val_dataloaders=val_dataloaders, datamodule=datamodule ) @@ -1027,7 +1027,7 @@ def _run(self, model: "pl.LightningModule") -> Optional[Union[_EVALUATE_OUTPUT, self.training_type_plugin.connect(model) # hook - self.data_connector.prepare_data() + self._data_connector.prepare_data() self.callback_connector._attach_model_callbacks() if self._ckpt_path and not self.training_type_plugin.restore_checkpoint_after_pre_dispatch: @@ -1171,7 +1171,7 @@ def 
_post_dispatch(self): # these `teardown` calls are here instead of in `_call_teardown_hook` since they are internal teardowns # which need to happen before. self.accelerator.teardown() - self.data_connector.teardown() + self._data_connector.teardown() self._active_loop.teardown() self.logger_connector.teardown() @@ -1258,7 +1258,7 @@ def _run_predict(self) -> Optional[_PREDICT_OUTPUT]: return self.predict_loop.run() def _run_sanity_check(self, ref_model): - using_val_step = self.data_connector._val_dataloader_source.is_defined() and is_overridden( + using_val_step = self._data_connector._val_dataloader_source.is_defined() and is_overridden( "validation_step", ref_model ) should_sanity_check = using_val_step and self.num_sanity_val_steps > 0 and self.limit_val_batches > 0 diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py index 6700cf6b32629..721c5e293bae8 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -51,7 +51,7 @@ def scale_batch_size( " If this is not the intended behavior, please remove either one." ) - if not trainer.data_connector._train_dataloader_source.is_module(): + if not trainer._data_connector._train_dataloader_source.is_module(): raise MisconfigurationException( "The batch scaling feature cannot be used with dataloaders passed directly to `.fit()`." " Please disable the feature or incorporate the dataloader into the model." diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 69cf3ce1d4a9f..c7ad70895672a 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -44,7 +44,7 @@ def _module_available(module_path: str) -> bool: return False -def _compare_version(package: str, op: Callable, version: str, use_base_version: bool = True) -> bool: +def _compare_version(package: str, op: Callable, version: str, use_base_version: bool = False) -> bool: """Compare package version with some requirements. 
>>> _compare_version("torch", operator.ge, "0.1") diff --git a/tests/accelerators/test_common.py b/tests/accelerators/test_common.py index cf00d84eecb64..d38c5c5fad140 100644 --- a/tests/accelerators/test_common.py +++ b/tests/accelerators/test_common.py @@ -32,7 +32,7 @@ ), ) def test_evaluate(tmpdir, trainer_kwargs): - tutils.set_random_master_port() + tutils.set_random_main_port() seed_everything(1) dm = ClassifDataModule() model = CustomClassificationModelDP() diff --git a/tests/accelerators/test_ddp_spawn.py b/tests/accelerators/test_ddp_spawn.py index 47b6305a2eb98..b5d7ba52e1451 100644 --- a/tests/accelerators/test_ddp_spawn.py +++ b/tests/accelerators/test_ddp_spawn.py @@ -24,7 +24,7 @@ @RunIf(min_gpus=2) def test_multi_gpu_early_stop_ddp_spawn(tmpdir): - tutils.set_random_master_port() + tutils.set_random_main_port() trainer_options = dict( default_root_dir=tmpdir, @@ -43,7 +43,7 @@ def test_multi_gpu_early_stop_ddp_spawn(tmpdir): @RunIf(min_gpus=2) def test_multi_gpu_model_ddp_spawn(tmpdir): - tutils.set_random_master_port() + tutils.set_random_main_port() trainer_options = dict( default_root_dir=tmpdir, @@ -66,7 +66,7 @@ def test_multi_gpu_model_ddp_spawn(tmpdir): @RunIf(min_gpus=2) def test_ddp_all_dataloaders_passed_to_fit(tmpdir): """Make sure DDP works with dataloaders passed to fit()""" - tutils.set_random_master_port() + tutils.set_random_main_port() model = BoringModel() fit_options = dict(train_dataloader=model.train_dataloader(), val_dataloaders=model.val_dataloader()) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 57dd6a5b3e2ec..38a2caceed859 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -59,7 +59,7 @@ def test_multi_gpu_early_stop_dp(tmpdir): with early stopping """ - tutils.set_random_master_port() + tutils.set_random_main_port() dm = ClassifDataModule() model = CustomClassificationModelDP() @@ -79,7 +79,7 @@ def test_multi_gpu_early_stop_dp(tmpdir): @RunIf(min_gpus=2) def test_multi_gpu_model_dp(tmpdir): - tutils.set_random_master_port() + tutils.set_random_main_port() trainer_options = dict( default_root_dir=tmpdir, diff --git a/tests/callbacks/test_quantization.py b/tests/callbacks/test_quantization.py index f548d4d98adcc..ec2bb66110a2e 100644 --- a/tests/callbacks/test_quantization.py +++ b/tests/callbacks/test_quantization.py @@ -21,11 +21,19 @@ from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import QuantizationAwareTraining from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_8 from pytorch_lightning.utilities.memory import get_model_size_mb +from tests.helpers.boring_model import RandomDataset from tests.helpers.datamodules import RegressDataModule from tests.helpers.runif import RunIf from tests.helpers.simple_models import RegressionModel +if _TORCH_GREATER_EQUAL_1_8: + from torch.quantization import FakeQuantizeBase +else: + # For torch 1.6 and 1.7. 
+ from torch.quantization import FakeQuantize as FakeQuantizeBase + @pytest.mark.parametrize("observe", ["average", "histogram"]) @pytest.mark.parametrize("fuse", [True, False]) @@ -45,7 +53,12 @@ def test_quantization(tmpdir, observe: str, fuse: bool, convert: bool): org_score = torch.mean(torch.tensor([mape(model(x), y) for x, y in dm.test_dataloader()])) fusing_layers = [(f"layer_{i}", f"layer_{i}a") for i in range(3)] if fuse else None - qcb = QuantizationAwareTraining(observer_type=observe, modules_to_fuse=fusing_layers, quantize_on_fit_end=convert) + qcb = QuantizationAwareTraining( + observer_type=observe, + modules_to_fuse=fusing_layers, + quantize_on_fit_end=convert, + observer_enabled_stages=("train", "validate"), + ) trainer = Trainer(callbacks=[qcb], **trainer_args) trainer.fit(qmodel, datamodule=dm) @@ -105,6 +118,9 @@ def test_quantization_exceptions(tmpdir): with pytest.raises(MisconfigurationException, match="Unsupported `collect_quantization`"): QuantizationAwareTraining(collect_quantization=1.2) + with pytest.raises(MisconfigurationException, match="Unsupported stages"): + QuantizationAwareTraining(observer_enabled_stages=("abc",)) + fusing_layers = [(f"layers.mlp_{i}", f"layers.NONE-mlp_{i}a") for i in range(3)] qcb = QuantizationAwareTraining(modules_to_fuse=fusing_layers) trainer = Trainer(callbacks=[qcb], default_root_dir=tmpdir, max_epochs=1) @@ -140,3 +156,91 @@ def test_quantization_triggers(tmpdir, trigger_fn: Union[None, int, Callable], e trainer.fit(qmodel, datamodule=dm) assert qcb._forward_calls == expected_count + + +def _get_observer_enabled(fake_quant: FakeQuantizeBase): + # ``torch.quantization.FakeQuantize`` checks ``observer_enabled[0] == 1``. + return fake_quant.observer_enabled[0] == 1 + + +@pytest.mark.parametrize( + "observer_enabled_stages", + [("train", "validate", "test", "predict"), ("train",), ("validate",), ("test",), ("predict",), ()], +) +@RunIf(quantization=True) +def test_quantization_disable_observers(tmpdir, observer_enabled_stages): + """Test disabling observers.""" + qmodel = RegressionModel() + qcb = QuantizationAwareTraining(observer_enabled_stages=observer_enabled_stages) + trainer = Trainer(callbacks=[qcb], default_root_dir=tmpdir) + + # Quantize qmodel. + qcb.on_fit_start(trainer, qmodel) + fake_quants = list(module for module in qmodel.modules() if isinstance(module, FakeQuantizeBase)) + # Disable some of observers before fitting. 
+ for fake_quant in fake_quants[::2]: + fake_quant.disable_observer() + + for stage, on_stage_start, on_stage_end in [ + ("train", qcb.on_train_start, qcb.on_train_end), + ("validate", qcb.on_validation_start, qcb.on_validation_end), + ("test", qcb.on_test_start, qcb.on_test_end), + ("predict", qcb.on_predict_start, qcb.on_predict_end), + ]: + before_stage_observer_enabled = torch.as_tensor(list(map(_get_observer_enabled, fake_quants))) + + on_stage_start(trainer, qmodel) + expected_stage_observer_enabled = torch.as_tensor( + before_stage_observer_enabled if stage in observer_enabled_stages else [False] * len(fake_quants) + ) + assert torch.equal( + torch.as_tensor(list(map(_get_observer_enabled, fake_quants))), expected_stage_observer_enabled + ) + + on_stage_end(trainer, qmodel) + assert torch.equal( + torch.as_tensor(list(map(_get_observer_enabled, fake_quants))), before_stage_observer_enabled + ) + + +@RunIf(quantization=True) +def test_quantization_val_test_predict(tmpdir): + """Test the default quantization aware training not affected by validating, testing and predicting.""" + seed_everything(42) + num_features = 16 + dm = RegressDataModule(num_features=num_features) + qmodel = RegressionModel() + + val_test_predict_qmodel = copy.deepcopy(qmodel) + trainer = Trainer( + callbacks=[QuantizationAwareTraining(quantize_on_fit_end=False)], + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + limit_test_batches=1, + limit_predict_batches=1, + val_check_interval=1, + num_sanity_val_steps=1, + max_epochs=4, + ) + trainer.fit(val_test_predict_qmodel, datamodule=dm) + trainer.validate(model=val_test_predict_qmodel, verbose=False) + trainer.test(model=val_test_predict_qmodel, verbose=False) + trainer.predict( + model=val_test_predict_qmodel, dataloaders=[torch.utils.data.DataLoader(RandomDataset(num_features, 16))] + ) + + expected_qmodel = copy.deepcopy(qmodel) + # No validation in ``expected_qmodel`` fitting. 
+    Trainer(
+        callbacks=[QuantizationAwareTraining(quantize_on_fit_end=False)],
+        default_root_dir=tmpdir,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        max_epochs=4,
+    ).fit(expected_qmodel, datamodule=dm)
+
+    expected_state_dict = expected_qmodel.state_dict()
+    for key, value in val_test_predict_qmodel.state_dict().items():
+        expected_value = expected_state_dict[key]
+        assert torch.allclose(value, expected_value)
diff --git a/tests/core/test_datamodules.py b/tests/core/test_datamodules.py
index b0dadd0e31a7a..868c13bcc1a78 100644
--- a/tests/core/test_datamodules.py
+++ b/tests/core/test_datamodules.py
@@ -51,7 +51,7 @@ def test_can_prepare_data(local_rank, node_rank):
     local_rank.return_value = 0
     assert trainer.local_rank == 0
 
-    trainer.data_connector.prepare_data()
+    trainer._data_connector.prepare_data()
     assert dm.random_full is not None
 
     # local rank = 1 (False)
@@ -60,7 +60,7 @@ def test_can_prepare_data(local_rank, node_rank):
     local_rank.return_value = 1
     assert trainer.local_rank == 1
 
-    trainer.data_connector.prepare_data()
+    trainer._data_connector.prepare_data()
     assert dm.random_full is None
 
     # prepare_data_per_node = False (prepare across all nodes)
@@ -71,7 +71,7 @@ def test_can_prepare_data(local_rank, node_rank):
     node_rank.return_value = 0
     local_rank.return_value = 0
 
-    trainer.data_connector.prepare_data()
+    trainer._data_connector.prepare_data()
     assert dm.random_full is not None
 
     # global rank = 1 (False)
@@ -80,13 +80,13 @@ def test_can_prepare_data(local_rank, node_rank):
     node_rank.return_value = 1
     local_rank.return_value = 0
 
-    trainer.data_connector.prepare_data()
+    trainer._data_connector.prepare_data()
     assert dm.random_full is None
 
     node_rank.return_value = 0
     local_rank.return_value = 1
 
-    trainer.data_connector.prepare_data()
+    trainer._data_connector.prepare_data()
     assert dm.random_full is None
 
     # 2 dm
@@ -100,13 +100,13 @@ def test_can_prepare_data(local_rank, node_rank):
         # has been called
         # False
         dm._has_prepared_data = True
-        trainer.data_connector.prepare_data()
+        trainer._data_connector.prepare_data()
         dm_mock.assert_not_called()
 
         # has not been called
         # True
         dm._has_prepared_data = False
-        trainer.data_connector.prepare_data()
+        trainer._data_connector.prepare_data()
         dm_mock.assert_called_once()
 
 
@@ -629,7 +629,7 @@ def test_inconsistent_prepare_data_per_node(tmpdir):
         trainer = Trainer(prepare_data_per_node=False)
         trainer.model = model
         trainer.datamodule = dm
-        trainer.data_connector.prepare_data()
+        trainer._data_connector.prepare_data()
 
 
 DATALOADER = DataLoader(RandomDataset(1, 32))
diff --git a/tests/core/test_metric_result_integration.py b/tests/core/test_metric_result_integration.py
index b97a3ee8c5c78..dead9b4e438ac 100644
--- a/tests/core/test_metric_result_integration.py
+++ b/tests/core/test_metric_result_integration.py
@@ -103,7 +103,7 @@ def _ddp_test_fn(rank, worldsize):
 @RunIf(skip_windows=True, min_gpus=2)
 def test_result_reduce_ddp():
     """Make sure result logging works with DDP."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     worldsize = 2
     mp.spawn(_ddp_test_fn, args=(worldsize,), nprocs=worldsize)
diff --git a/tests/core/test_results.py b/tests/core/test_results.py
index 1033699ef398c..0e62441b1d40e 100644
--- a/tests/core/test_results.py
+++ b/tests/core/test_results.py
@@ -41,6 +41,6 @@ def _ddp_test_fn(rank, worldsize):
 @RunIf(skip_windows=True)
 def test_result_reduce_ddp():
     """Make sure result logging works with DDP."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
     worldsize = 2
     mp.spawn(_ddp_test_fn, args=(worldsize,), nprocs=worldsize)
diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py
index 08afeb9139f79..832c15f2b0e22 100644
--- a/tests/helpers/utils.py
+++ b/tests/helpers/utils.py
@@ -73,7 +73,7 @@ def reset_seed(seed=0):
     seed_everything(seed)
 
 
-def set_random_master_port():
+def set_random_main_port():
     reset_seed()
     port = RANDOM_PORTS.pop()
     os.environ["MASTER_PORT"] = str(port)
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index ff93a1bcfdc70..76cbf8b4fcee7 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -37,7 +37,7 @@
     print("You requested to import Horovod which is missing or not supported for your OS.")
 
 from tests.helpers import BoringModel  # noqa: E402
-from tests.helpers.utils import reset_seed, set_random_master_port  # noqa: E402
+from tests.helpers.utils import reset_seed, set_random_main_port  # noqa: E402
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--trainer-options", required=True)
@@ -46,7 +46,7 @@
 
 def run_test_from_config(trainer_options, on_gpu, check_size=True):
     """Trains the default model with the given config."""
-    set_random_master_port()
+    set_random_main_port()
     reset_seed()
 
     ckpt_path = trainer_options["weights_save_path"]
diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py
index bc08d950fc6ca..716c0f17f203d 100644
--- a/tests/models/test_amp.py
+++ b/tests/models/test_amp.py
@@ -129,7 +129,7 @@ def test_amp_gpus(tmpdir, strategy, precision, gpus):
 def test_amp_gpu_ddp_slurm_managed(tmpdir):
     """Make sure DDP + AMP work."""
     # simulate setting slurm flags
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     model = AMPTestModel()
 
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index 7271af004c48b..2fb537b1d2861 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -125,7 +125,7 @@ def validation_step(self, *args, **kwargs):
 @RunIf(skip_windows=True)
 def test_multi_cpu_model_ddp(tmpdir):
     """Make sure DDP works."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     trainer_options = dict(
         default_root_dir=tmpdir,
diff --git a/tests/models/test_gpu.py b/tests/models/test_gpu.py
index 9317804b1cca3..8dc0de250dfae 100644
--- a/tests/models/test_gpu.py
+++ b/tests/models/test_gpu.py
@@ -40,7 +40,7 @@
 @RunIf(min_gpus=2)
 def test_multi_gpu_none_backend(tmpdir):
     """Make sure when using multiple GPUs the user can't use `accelerator = None`."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
     trainer_options = dict(
         default_root_dir=tmpdir,
         enable_progress_bar=False,
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 73862825941fe..abf5a34757424 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -273,7 +273,7 @@ def test_result_reduce_horovod(tmpdir):
     This test mirrors tests/core/test_results.py::_ddp_test_fn
     """
     tutils.reset_seed()
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     def hvd_test_fn():
         path_here = os.path.abspath(os.path.dirname(__file__))
diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py
index 70fce02a689e9..7ab425dd12ea6 100644
--- a/tests/models/test_onnx.py
+++ b/tests/models/test_onnx.py
@@ -88,7 +88,7 @@ def test_model_saves_with_example_input_array(tmpdir, modelclass, input_sample):
 @RunIf(min_gpus=2)
 def test_model_saves_on_multi_gpu(tmpdir):
     """Test that ONNX model saves on a distributed backend."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     trainer_options = dict(
         default_root_dir=tmpdir,
diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py
index ba886a9459d87..d6424510088eb 100644
--- a/tests/models/test_restore.py
+++ b/tests/models/test_restore.py
@@ -304,7 +304,7 @@ def test_callbacks_references_resume_from_checkpoint(tmpdir):
 
 def test_running_test_pretrained_model_distrib_dp(tmpdir):
     """Verify `test()` on pretrained model."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     dm = ClassifDataModule()
     model = CustomClassificationModelDP(lr=0.1)
@@ -351,7 +351,7 @@ def test_running_test_pretrained_model_distrib_dp(tmpdir):
 @RunIf(min_gpus=2)
 def test_running_test_pretrained_model_distrib_ddp_spawn(tmpdir):
     """Verify `test()` on pretrained model."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
 
     dm = ClassifDataModule()
     model = ClassificationModel()
diff --git a/tests/models/test_sync_batchnorm.py b/tests/models/test_sync_batchnorm.py
index 8d96eb4b75b4e..67880bec4e474 100644
--- a/tests/models/test_sync_batchnorm.py
+++ b/tests/models/test_sync_batchnorm.py
@@ -22,7 +22,7 @@
 from pytorch_lightning.utilities import FLOAT16_EPSILON
 from tests.helpers.datamodules import MNISTDataModule
 from tests.helpers.runif import RunIf
-from tests.helpers.utils import set_random_master_port
+from tests.helpers.utils import set_random_main_port
 
 
 class SyncBNModule(LightningModule):
@@ -70,7 +70,7 @@ def configure_optimizers(self):
 @RunIf(min_gpus=2, special=True)
 def test_sync_batchnorm_ddp(tmpdir):
     seed_everything(234)
-    set_random_master_port()
+    set_random_main_port()
 
     # define datamodule and dataloader
     dm = MNISTDataModule()
diff --git a/tests/plugins/test_tpu_spawn.py b/tests/plugins/test_tpu_spawn.py
index 5537125ce3afb..3f4ff354e39bb 100644
--- a/tests/plugins/test_tpu_spawn.py
+++ b/tests/plugins/test_tpu_spawn.py
@@ -65,7 +65,7 @@ def test_error_iterable_dataloaders_passed_to_fit(
     model = BoringModelNoDataloaders()
     model.trainer = trainer
 
-    trainer.data_connector.attach_dataloaders(
+    trainer._data_connector.attach_dataloaders(
         model,
         train_dataloaders=train_dataloaders,
         val_dataloaders=val_dataloaders,
diff --git a/tests/profiler/test_profiler.py b/tests/profiler/test_profiler.py
index 6528c39e2a9a9..36ef565d03f51 100644
--- a/tests/profiler/test_profiler.py
+++ b/tests/profiler/test_profiler.py
@@ -293,6 +293,7 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, pytorch_profiler):
     assert any(f"{local_rank}-validation_step" in f for f in files)
 
 
+@RunIf(special=True)
 @pytest.mark.parametrize("fast_dev_run", [1, 2, 3, 4, 5])
 @pytest.mark.parametrize("boring_model_cls", [ManualOptimBoringModel, BoringModel])
 def test_pytorch_profiler_trainer_fit(fast_dev_run, boring_model_cls, tmpdir):
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index c3e532b8e2a02..16629d9da52c6 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1834,7 +1834,7 @@ def validation_epoch_end(self, outputs) -> None:
 @RunIf(skip_windows=True)
 def test_fit_test_synchronization(tmpdir):
     """Test that the trainer synchronizes processes before returning control back to the caller."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
     model = TestDummyModelForCheckpoint()
     checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="x", mode="min", save_top_k=1)
     trainer = Trainer(
diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py
index 1dd3ab92eb833..ac85bc11df4fb 100644
--- a/tests/trainer/test_trainer_tricks.py
+++ b/tests/trainer/test_trainer_tricks.py
@@ -85,7 +85,7 @@ def test_overfit_batch_limits(tmpdir):
     # ------------------------------------------------------
     trainer = Trainer(overfit_batches=4)
     model.trainer = trainer
-    trainer.data_connector.attach_dataloaders(model=model)
+    trainer._data_connector.attach_dataloaders(model=model)
     trainer.reset_train_dataloader(model)
     assert trainer.num_training_batches == 4
 
@@ -96,7 +96,7 @@ def test_overfit_batch_limits(tmpdir):
 
     trainer = Trainer(overfit_batches=0.11)
     model.trainer = trainer
-    trainer.data_connector.attach_dataloaders(model=model)
+    trainer._data_connector.attach_dataloaders(model=model)
     trainer.reset_train_dataloader(model)
     # The dataloader should have been overwritten with a Sequential sampler.
     assert trainer.train_dataloader is not train_loader
@@ -116,7 +116,7 @@ def test_overfit_batch_limits(tmpdir):
     # test overfit_batches as percent
     # ------------------------------------------------------
     trainer = Trainer(overfit_batches=0.11)
-    trainer.data_connector.attach_dataloaders(model)
+    trainer._data_connector.attach_dataloaders(model)
     loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
     assert loader_num_batches[0] == num_train_samples
 
@@ -132,11 +132,11 @@ def test_overfit_batch_limits(tmpdir):
     # test overfit_batches as int
     # ------------------------------------------------------
     trainer = Trainer(overfit_batches=1)
-    trainer.data_connector.attach_dataloaders(model)
+    trainer._data_connector.attach_dataloaders(model)
     loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
     assert loader_num_batches[0] == 1
 
     trainer = Trainer(overfit_batches=5)
-    trainer.data_connector.attach_dataloaders(model)
+    trainer._data_connector.attach_dataloaders(model)
     loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
     assert loader_num_batches[0] == 5
 
@@ -145,21 +145,21 @@ def test_overfit_batch_limits(tmpdir):
     # ------------------------------------------------------
     if split == RunningStage.VALIDATING:
         trainer = Trainer(limit_val_batches=0.1)
-        trainer.data_connector.attach_dataloaders(model)
+        trainer._data_connector.attach_dataloaders(model)
         loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
         assert loader_num_batches[0] == int(0.1 * len(val_loader))
 
         trainer = Trainer(limit_val_batches=10)
-        trainer.data_connector.attach_dataloaders(model)
+        trainer._data_connector.attach_dataloaders(model)
         loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
         assert loader_num_batches[0] == 10
     else:
         trainer = Trainer(limit_test_batches=0.1)
-        trainer.data_connector.attach_dataloaders(model)
+        trainer._data_connector.attach_dataloaders(model)
         loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
         assert loader_num_batches[0] == int(0.1 * len(test_loader))
 
         trainer = Trainer(limit_test_batches=10)
-        trainer.data_connector.attach_dataloaders(model)
+        trainer._data_connector.attach_dataloaders(model)
         loader_num_batches, dataloaders = trainer._reset_eval_dataloader(split, model=model)
         assert loader_num_batches[0] == 10
diff --git a/tests/utilities/test_auto_restart.py b/tests/utilities/test_auto_restart.py
index 1000aefe2eeb2..b325aef1b9ea0 100644
--- a/tests/utilities/test_auto_restart.py
+++ b/tests/utilities/test_auto_restart.py
@@ -358,7 +358,7 @@ def _test_fast_forward_sampler_with_distributed_sampler(rank, worldsize):
 @RunIf(skip_windows=True)
 def test_fast_forward_sampler_with_distributed_sampler():
     """Make sure result logging works with DDP."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
     worldsize = 2
     mp.spawn(_test_fast_forward_sampler_with_distributed_sampler, args=(worldsize,), nprocs=worldsize)
 
@@ -632,7 +632,7 @@ def test_fast_forward_sampler_iterative_dataset():
 @RunIf(skip_windows=True)
 def test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset():
     """Make sure result logging works with DDP."""
-    tutils.set_random_master_port()
+    tutils.set_random_main_port()
     worldsize = 2
     mp.spawn(
         _test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset, args=(worldsize,), nprocs=worldsize
diff --git a/tests/utilities/test_fetching.py b/tests/utilities/test_fetching.py
index 88c232b76a5ab..39fcb4edcf1c2 100644
--- a/tests/utilities/test_fetching.py
+++ b/tests/utilities/test_fetching.py
@@ -185,9 +185,9 @@ def __init__(self, check_inter_batch: bool):
 
         def on_train_epoch_end(self, trainer, lightning_module):
             if self._check_inter_batch:
-                assert isinstance(trainer.data_connector.train_data_fetcher, InterBatchParallelDataFetcher)
+                assert isinstance(trainer._data_connector.train_data_fetcher, InterBatchParallelDataFetcher)
             else:
-                assert isinstance(trainer.data_connector.train_data_fetcher, DataFetcher)
+                assert isinstance(trainer._data_connector.train_data_fetcher, DataFetcher)
 
     trainer_kwargs = dict(
         default_root_dir=tmpdir,
@@ -232,7 +232,7 @@ def __init__(self, *args, automatic_optimization: bool = False, **kwargs):
 
         def training_step(self, dataloader_iter, batch_idx):
             assert self.count == batch_idx
-            assert isinstance(self.trainer.data_connector.train_data_fetcher, DataLoaderIterDataFetcher)
+            assert isinstance(self.trainer._data_connector.train_data_fetcher, DataLoaderIterDataFetcher)
             # fetch 2 batches
             self.batches.append(next(dataloader_iter))
             self.batches.append(next(dataloader_iter))
@@ -255,7 +255,7 @@ def training_step(self, dataloader_iter, batch_idx):
 
         def training_epoch_end(self, *_):
             assert self.trainer.fit_loop.epoch_loop.batch_progress.current.ready == 33
-            assert self.trainer.data_connector.train_data_fetcher.fetched == 64
+            assert self.trainer._data_connector.train_data_fetcher.fetched == 64
             assert self.count == 64
 
     model = TestModel(automatic_optimization=automatic_optimization)
diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py
index bf2c2c4f70a9f..75bcb51ffb89f 100644
--- a/tests/utilities/test_imports.py
+++ b/tests/utilities/test_imports.py
@@ -11,8 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import operator
 
 from pytorch_lightning.utilities import _module_available
+from pytorch_lightning.utilities.imports import _compare_version
 
 
 def test_module_exists():
@@ -22,3 +24,24 @@ def test_module_exists():
     assert not _module_available("torch.nn.asdf")
     assert not _module_available("asdf")
     assert not _module_available("asdf.bla.asdf")
+
+
+def test_compare_version(monkeypatch):
+    from pytorch_lightning.utilities.imports import torch
+
+    monkeypatch.setattr(torch, "__version__", "1.8.9")
+    assert not _compare_version("torch", operator.ge, "1.10.0")
+    assert _compare_version("torch", operator.lt, "1.10.0")
+
+    monkeypatch.setattr(torch, "__version__", "1.10.0.dev123")
+    assert _compare_version("torch", operator.ge, "1.10.0.dev123")
+    assert not _compare_version("torch", operator.ge, "1.10.0.dev124")
+
+    assert _compare_version("torch", operator.ge, "1.10.0.dev123", use_base_version=True)
+    assert _compare_version("torch", operator.ge, "1.10.0.dev124", use_base_version=True)
+
+    monkeypatch.setattr(torch, "__version__", "1.10.0a0+0aef44c")  # dev version before rc
+    assert _compare_version("torch", operator.ge, "1.10.0.rc0", use_base_version=True)
+    assert not _compare_version("torch", operator.ge, "1.10.0.rc0")
+    assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True)
+    assert not _compare_version("torch", operator.ge, "1.10.0")