From ff5361604b2fd508aa2432babed6844fbe268849 Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Wed, 13 Jul 2022 05:10:14 +0530
Subject: [PATCH] Weekly patch release v1.6.5 (#13481)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update NGC docker (#13136)
* update docker
* Apply suggestions from code review
Co-authored-by: Akihiro Nitta
Co-authored-by: Carlos Mocholí

* Decouple pulling legacy checkpoints from existing GHA workflows and docker files (#13185)
* Add pull-legacy-checkpoints action
* Replace pulls with the new action and script
* Simplify

* Merge pull request #13250 from PyTorchLightning/ci/rm-base
CI: Remove simple test `ci_test-base.yml`

* Update rich requirement from !=10.15.*,<=12.0.0,>=10.2.2 to >=10.2.2,!=10.15.0.a,<13.0.0 in /requirements (#13047)
* Update rich requirement in /requirements
Updates the requirements on [rich](https://github.com/willmcgugan/rich) to permit the latest version.
- [Release notes](https://github.com/willmcgugan/rich/releases)
- [Changelog](https://github.com/Textualize/rich/blob/master/CHANGELOG.md)
- [Commits](https://github.com/willmcgugan/rich/compare/v10.2.2...v12.4.1)
---
updated-dependencies:
- dependency-name: rich
  dependency-type: direct:production
...
Signed-off-by: dependabot[bot]

* Fix torch.distributed._sharded_tensor DeprecationWarning (#13261)

* update tutorials (#13268)

* [BUG] `estimated_stepping_batches` requires distributed comms in `configure_optimizers` for `DeepSpeedStrategy` (#13350)

* Update torchmetrics requirement from <=0.7.2,>=0.4.1 to >=0.4.1,<0.9.2 in /requirements (#13275)
Update torchmetrics requirement in /requirements
Updates the requirements on [torchmetrics](https://github.com/PyTorchLightning/metrics) to permit the latest version.
- [Release notes](https://github.com/PyTorchLightning/metrics/releases)
- [Changelog](https://github.com/PyTorchLightning/metrics/blob/master/CHANGELOG.md)
- [Commits](https://github.com/PyTorchLightning/metrics/compare/v0.4.1...v0.9.1)
---
updated-dependencies:
- dependency-name: torchmetrics
  dependency-type: direct:production
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* Fix mypy errors for model summary utilities (#13384)

* rename org Lightning AI

* Modified python version check to accommodate for legacy version styles (#13420)
Co-authored-by: Carlos Mocholí
(cherry picked from commit b332b6632821e3f8fd451bbdf158bc9389eea51a)

* Call `set_epoch` for distributed batch samplers (#13396)
Co-authored-by: Jirka
Co-authored-by: Rohit Gupta
(cherry picked from commit 2dd332f9c795aa5e590dce4d83e76d791a7b43df)

* _RICH_AVAILABLE
* _FAIRSCALE_AVAILABLE
* _BAGUA_AVAILABLE
* redefine
* chlog spaces

* CI: Fix `fatal: unsafe repository` (#13515)

* update release date

* CI: azure rename

* Restore log step during restart (#13467)
Co-authored-by: Carlos Mocholí

* remove redundant test

* Update CI setup (#13291)
* drop mamba
* use legacy GPU machines
* fix schema check

Co-authored-by: Jirka Borovec
Co-authored-by: Akihiro Nitta
Co-authored-by: Carlos Mocholí
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Adam J.
Stewart Co-authored-by: Sean Naren Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Co-authored-by: Martino Sorbaro --- .actions/pull_legacy_checkpoints.sh | 9 +++ .../gpu-benchmark.yml | 2 +- {.azure-pipelines => .azure}/gpu-tests.yml | 7 +- {.azure-pipelines => .azure}/hpu-tests.yml | 0 {.azure-pipelines => .azure}/ipu-tests.yml | 0 .github/workflows/ci_dockers.yml | 4 +- .github/workflows/ci_schema.yml | 2 +- .github/workflows/ci_test-base.yml | 81 ------------------- .github/workflows/ci_test-conda.yml | 29 ++++--- .github/workflows/ci_test-full.yml | 21 +++-- .github/workflows/events-nightly.yml | 4 +- CHANGELOG.md | 10 +++ _notebooks | 2 +- dockers/base-conda/Dockerfile | 7 +- dockers/nvidia/Dockerfile | 15 ++-- dockers/release/Dockerfile | 2 +- dockers/tpu-tests/Dockerfile | 5 +- legacy/README.md | 3 +- .../basic_examples/mnist_datamodule.py | 2 +- .../domain_templates/reinforce_learn_ppo.py | 2 +- pytorch_lightning/__about__.py | 2 +- .../callbacks/progress/rich_progress.py | 5 +- .../callbacks/rich_model_summary.py | 2 +- pytorch_lightning/core/lightning.py | 7 +- pytorch_lightning/loggers/neptune.py | 22 ++--- .../loops/dataloader/evaluation_loop.py | 13 +-- .../loops/dataloader/prediction_loop.py | 10 +-- .../loops/epoch/evaluation_epoch_loop.py | 2 +- .../loops/epoch/training_epoch_loop.py | 2 + pytorch_lightning/loops/fit_loop.py | 9 +-- pytorch_lightning/loops/utilities.py | 14 ++++ pytorch_lightning/overrides/fairscale.py | 16 +++- .../overrides/torch_distributed.py | 6 +- pytorch_lightning/plugins/io/torch_plugin.py | 2 +- pytorch_lightning/plugins/precision/ipu.py | 2 +- .../plugins/precision/sharded_native_amp.py | 2 +- pytorch_lightning/plugins/precision/tpu.py | 2 +- pytorch_lightning/setup_tools.py | 4 +- pytorch_lightning/strategies/bagua.py | 4 +- pytorch_lightning/strategies/ddp.py | 2 +- pytorch_lightning/strategies/deepspeed.py | 2 + pytorch_lightning/strategies/fully_sharded.py | 2 +- .../strategies/launchers/spawn.py | 2 +- pytorch_lightning/strategies/sharded.py | 2 +- pytorch_lightning/strategies/sharded_spawn.py | 2 +- .../connectors/accelerator_connector.py | 6 +- .../logger_connector/fx_validator.py | 2 +- .../connectors/logger_connector/result.py | 2 +- pytorch_lightning/trainer/supporters.py | 7 +- pytorch_lightning/utilities/__init__.py | 5 -- pytorch_lightning/utilities/imports.py | 8 +- pytorch_lightning/utilities/meta.py | 6 +- pytorch_lightning/utilities/migration.py | 2 +- pytorch_lightning/utilities/model_summary.py | 6 +- requirements/base.txt | 2 +- requirements/extra.txt | 2 +- setup.py | 8 +- tests/README.md | 3 +- tests/core/test_lightning_module.py | 5 +- tests/helpers/runif.py | 7 +- tests/loggers/test_neptune.py | 4 +- tests/loops/test_evaluation_loop.py | 57 ++++++++++--- tests/loops/test_loop_state_dict.py | 2 +- tests/loops/test_utilities.py | 24 +++++- tests/models/test_restore.py | 1 + .../precision/test_sharded_precision.py | 2 +- ..._ddp_fully_sharded_with_full_state_dict.py | 2 +- tests/strategies/test_deepspeed_strategy.py | 23 ++++++ tests/strategies/test_sharded_strategy.py | 2 +- .../logging_/test_eval_loop_logging.py | 3 +- tests/trainer/test_dataloaders.py | 49 +---------- tests/utilities/test_imports.py | 4 +- 72 files changed, 292 insertions(+), 297 deletions(-) create mode 100644 .actions/pull_legacy_checkpoints.sh rename {.azure-pipelines => .azure}/gpu-benchmark.yml (97%) rename {.azure-pipelines => .azure}/gpu-tests.yml (95%) rename {.azure-pipelines => .azure}/hpu-tests.yml (100%) rename {.azure-pipelines => 
.azure}/ipu-tests.yml (100%) delete mode 100644 .github/workflows/ci_test-base.yml diff --git a/.actions/pull_legacy_checkpoints.sh b/.actions/pull_legacy_checkpoints.sh new file mode 100644 index 0000000000000..8b3f791297b66 --- /dev/null +++ b/.actions/pull_legacy_checkpoints.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Run this script from the project root. +URL="https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip" +mkdir -p legacy +# wget is simpler but does not work on Windows +python -c "from urllib.request import urlretrieve; urlretrieve('$URL', 'legacy/checkpoints.zip')" +ls -l legacy/ +unzip -o legacy/checkpoints.zip -d legacy/ +ls -l legacy/checkpoints/ diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure/gpu-benchmark.yml similarity index 97% rename from .azure-pipelines/gpu-benchmark.yml rename to .azure/gpu-benchmark.yml index cfccbf7081f14..451f5b5646dca 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -26,7 +26,7 @@ jobs: - job: benchmarks timeoutInMinutes: "90" cancelTimeoutInMinutes: "2" - pool: azure-gpus-spot + pool: azure-jirka-spot container: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" diff --git a/.azure-pipelines/gpu-tests.yml b/.azure/gpu-tests.yml similarity index 95% rename from .azure-pipelines/gpu-tests.yml rename to .azure/gpu-tests.yml index c0074adf4e81c..5ec9db1297b43 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -29,7 +29,7 @@ jobs: # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" - pool: azure-gpus-spot + pool: azure-jirka-spot container: image: $(image) @@ -69,10 +69,7 @@ jobs: python requirements/check-avail-extras.py displayName: 'Env details' - - bash: | - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ - unzip -o legacy/checkpoints.zip -d legacy/ - ls -l legacy/checkpoints/ + - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' - bash: | diff --git a/.azure-pipelines/hpu-tests.yml b/.azure/hpu-tests.yml similarity index 100% rename from .azure-pipelines/hpu-tests.yml rename to .azure/hpu-tests.yml diff --git a/.azure-pipelines/ipu-tests.yml b/.azure/ipu-tests.yml similarity index 100% rename from .azure-pipelines/ipu-tests.yml rename to .azure/ipu-tests.yml diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 94d5be6901c5a..c9ed112516ff4 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -97,7 +97,7 @@ jobs: UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: false - timeout-minutes: 75 + timeout-minutes: 95 build-Conda: runs-on: ubuntu-20.04 @@ -123,7 +123,7 @@ jobs: CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: false - timeout-minutes: 75 + timeout-minutes: 95 build-ipu: runs-on: ubuntu-20.04 diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index 54efaff27a201..03b230124085d 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -21,4 +21,4 @@ jobs: - name: Azure Pipelines env: SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json - run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE" + run: check-jsonschema .azure/*.yml --schemafile "$SCHEMA_FILE" diff --git 
a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml deleted file mode 100644 index 221f92d26ad5e..0000000000000 --- a/.github/workflows/ci_test-base.yml +++ /dev/null @@ -1,81 +0,0 @@ -# this jobs runs `pytest` over the source directory. It does not install any extra dependencies. -# this is useful to catch errors where an import has been added which is not part of the basic dependencies. -name: Test simple - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -jobs: - source: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-20.04] - # this will install stable torch - python-version: [3.9] - - # lower timeout as this should run very quickly - timeout-minutes: 20 - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Reset caching - run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - - # Note: This uses an internal pip API and may not always work - # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - - name: Get pip cache - id: pip-cache - run: python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - - - name: Cache pip - uses: actions/cache@v2 - with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements/base.txt') }} - restore-keys: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.requires }}- - - - name: Install dependencies - run: | - python --version - python -m pip install --upgrade --user pip - pip --version - pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - pip install -r requirements/test.txt - pip list - shell: bash - - - name: Test Package [only] - run: | - coverage run --source pytorch_lightning -m pytest pytorch_lightning -v - - - name: Statistics - if: success() - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - if: always() - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: coverage.xml - flags: cpu,pytest - name: Base-coverage - fail_ci_if_error: false diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index c907e13b4ac23..1064c7a095d54 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -31,19 +31,27 @@ jobs: timeout-minutes: 30 steps: - name: Workaround for https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory /__w/pytorch-lightning/pytorch-lightning + run: git config --global --add safe.directory /__w/lightning/lightning - uses: actions/checkout@v2 - - name: Update dependencies + - name: Update base dependencies + run: | + conda 
info + conda list + pip install -r requirements/test.txt + + - name: DocTests + run: | + coverage run --source pytorch_lightning -m pytest pytorch_lightning + + - name: Update all dependencies env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 HOROVOD_WITHOUT_TENSORFLOW: 1 run: | set -e - conda info - conda list # adjust versions according installed Torch version python ./requirements/adjust-versions.py requirements/extra.txt python ./requirements/adjust-versions.py requirements/examples.txt @@ -55,17 +63,12 @@ jobs: # sanity check python requirements/check-avail-extras.py - - name: Pull checkpoints from S3 - working-directory: ./legacy - run: | - # enter legacy and update checkpoints from S3 - curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip - unzip -o checkpoints.zip - ls -l checkpoints/ + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - - name: Tests + - name: UnitTests run: | - coverage run --source pytorch_lightning -m pytest --timeout 150 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml + coverage run --source pytorch_lightning -m pytest --timeout 150 tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - name: Upload pytest results uses: actions/upload-artifact@v2 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index f260c67069912..bca5699d43029 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -76,27 +76,21 @@ jobs: restore-keys: | ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - - name: Pull checkpoints from S3 - working-directory: ./legacy - run: | - # wget is simpler but does not work on Windows - python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')" - ls -l . 
- unzip -o checkpoints.zip - ls -l checkpoints/ + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies run: | flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) url=$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) pip install -r requirements.txt --upgrade $flag --find-links "https://download.pytorch.org/whl/${url}" - # adjust versions according installed Torch version - python ./requirements/adjust-versions.py requirements/examples.txt - pip install -r requirements/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade pip install -r requirements/test.txt --upgrade pip list shell: bash + - name: DocTests + run: coverage run --source pytorch_lightning -m pytest pytorch_lightning + - name: Install extra dependencies run: | # adjust versions according installed Torch version @@ -132,13 +126,16 @@ jobs: run: | python requirements/check-avail-extras.py - - name: Tests + - name: UnitTests run: | # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Examples run: | + # adjust versions according installed Torch version + python ./requirements/adjust-versions.py requirements/examples.txt + pip install -r requirements/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade python -m pytest pl_examples -v --durations=10 - name: Upload pytest results diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 91a666dd2f6dd..4496e176d9720 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -146,7 +146,7 @@ jobs: file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 85 + timeout-minutes: 95 # report failure to Slack - name: Slack notification @@ -197,7 +197,7 @@ jobs: file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 85 + timeout-minutes: 95 # report failure to Slack - name: Slack notification diff --git a/CHANGELOG.md b/CHANGELOG.md index bdcd6e07da750..455fa67add3f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [1.6.5] - 2022-07-12 + +### Fixed + +- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) +- Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420)) +- The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396)) +- Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467)) + + ## [1.6.4] - 2022-06-01 ### Added diff --git a/_notebooks b/_notebooks index 290fb466de1fc..8a36a41548f34 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 290fb466de1fcc2ac6025f74b56906592911e856 +Subproject commit 8a36a41548f34c44ac455d515a72994487e85813 diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 05227707b31fa..72603b04ffd64 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -72,16 +72,15 @@ COPY environment.yml environment.yml # conda init RUN conda update -n base -c defaults conda && \ - conda install mamba -n base -c conda-forge && \ - mamba create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ + conda create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ python prune.py && \ rm prune.py && \ cat environment.yml && \ - mamba env update --name $CONDA_ENV --file environment.yml && \ - mamba clean -ya && \ + conda env update --name $CONDA_ENV --file environment.yml && \ + conda clean -ya && \ rm environment.yml ENV \ diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index 792835c574ada..6848f6cf34eae 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTORCH_VERSION=21.11 +ARG PYTORCH_VERSION=22.04 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes FROM nvcr.io/nvidia/pytorch:${PYTORCH_VERSION}-py3 @@ -37,20 +37,19 @@ RUN \ cd .. ; \ fi && \ # save the examples - mv pytorch-lightning/_notebooks notebooks && \ + mv pytorch-lightning/_notebooks/.notebooks/ notebooks && \ mv pytorch-lightning/pl_examples . 
&& \ # Installations \ pip install -q fire && \ - python ./pytorch-lightning/.actions/assistant.py requirements_prune_pkgs horovod --req_files ./pytorch-lightning/requirements/extra.txt && \ - pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install ./pytorch-lightning --no-cache-dir && \ + pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir && \ + pip install ./pytorch-lightning["extra","loggers","strategies"] --no-cache-dir && \ + pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir && \ rm -rf pytorch-lightning && \ - pip install jupyterlab[all] -U && \ pip list +RUN pip install jupyterlab[all] -U + RUN pip install lightning-grid -U && \ pip install "py>=1.10" "protobuf>=3.15.6" --upgrade-strategy only-if-needed diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index f4083f2dd42fc..a0ba3a4a41c37 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -36,7 +36,7 @@ RUN \ mv pytorch-lightning-*/ pytorch-lightning ; \ rm *.zip ; \ fi && \ - pip install ./pytorch-lightning["extra"] --no-cache-dir && \ + pip install ./pytorch-lightning["extra","loggers","strategies"] --no-cache-dir && \ rm -rf pytorch-lightning RUN python --version && \ diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 602ef1684b859..d4c58c665e7a5 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -22,10 +22,7 @@ LABEL maintainer="PyTorchLightning " COPY ./ ./pytorch-lightning/ # Pull the legacy checkpoints -RUN cd pytorch-lightning && \ - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ && \ - unzip -o legacy/checkpoints.zip -d legacy/ && \ - ls -l legacy/checkpoints/ +RUN cd pytorch-lightning && bash .actions/pull_legacy_checkpoints.sh RUN \ pip install -q fire && \ diff --git a/legacy/README.md b/legacy/README.md index efbd18f7eede6..68eb718a98b07 100644 --- a/legacy/README.md +++ b/legacy/README.md @@ -7,8 +7,7 @@ At this moment we focus on ability running old checkpoints, so the flow here is If you want to pull all saved version-checkpoints for local testing/development, call ```bash -wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -unzip -o checkpoints.zip +bash .actions/pull_legacy_checkpoints.sh ``` To back populate collection with past version you can use following bash: diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index 31a2e284dd8ba..02d7ff8fd7a37 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -36,7 +36,7 @@ class _MNIST(Dataset): """Carbon copy of ``tests.helpers.datasets.MNIST``. We cannot import the tests as they are not distributed with the package. - See https://github.com/PyTorchLightning/pytorch-lightning/pull/7614#discussion_r671183652 for more context. + See https://github.com/Lightning-AI/lightning/pull/7614#discussion_r671183652 for more context. 
""" RESOURCES = ( diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index a400de062bf6f..baae200f9e2bc 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -125,7 +125,7 @@ def get_log_prob(self, pi: Normal, actions: torch.Tensor): class ExperienceSourceDataset(IterableDataset): - """Implementation from PyTorch Lightning Bolts: https://github.com/PyTorchLightning/lightning- + """Implementation from PyTorch Lightning Bolts: https://github.com/Lightning-AI/lightning- bolts/blob/master/pl_bolts/datamodules/experience_source.py. Basic experience source dataset. Takes a generate_batch function that returns an iterator. The logic for the diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index fdaa499392c7e..ec1d1701bebd2 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.6.4" +__version__ = "1.6.5" __author__ = "William Falcon et al." __author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index fb5914a7a5d41..9dcbb1f4522fd 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. import math +import operator from dataclasses import dataclass from datetime import timedelta from typing import Any, Dict, Optional, Union import pytorch_lightning as pl from pytorch_lightning.callbacks.progress.base import ProgressBarBase -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE +from pytorch_lightning.utilities.imports import _compare_version, _package_available + +_RICH_AVAILABLE: bool = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") Task, Style = None, None if _RICH_AVAILABLE: diff --git a/pytorch_lightning/callbacks/rich_model_summary.py b/pytorch_lightning/callbacks/rich_model_summary.py index 148de6275950e..f2833bb33ff8c 100644 --- a/pytorch_lightning/callbacks/rich_model_summary.py +++ b/pytorch_lightning/callbacks/rich_model_summary.py @@ -14,7 +14,7 @@ from typing import List, Tuple from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.utilities.model_summary import get_human_readable_count if _RICH_AVAILABLE: diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bf86471fe92ae..849d6715ef0eb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -46,7 +46,7 @@ from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.memory import get_model_size_mb from pytorch_lightning.utilities.model_summary import ModelSummary, summarize from 
pytorch_lightning.utilities.parsing import collect_init_args @@ -2064,7 +2064,10 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: rank_zero_debug("Could not register sharded tensor state dict hooks") return - from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook + if _TORCH_GREATER_EQUAL_1_11: + from torch.distributed._shard.sharded_tensor import pre_load_state_dict_hook, state_dict_hook + else: + from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook self._register_state_dict_hook(state_dict_hook) diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 05952f6bc5747..e3cae87d19d18 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -54,7 +54,7 @@ log = logging.getLogger(__name__) -_INTEGRATION_VERSION_KEY = "source_code/integrations/pytorch-lightning" +_INTEGRATION_VERSION_KEY = "source_code/integrations/lightning" # kwargs used in previous NeptuneLogger version, now deprecated _LEGACY_NEPTUNE_INIT_KWARGS = [ @@ -113,7 +113,7 @@ class NeptuneLogger(LightningLoggerBase): neptune_logger = NeptuneLogger( api_key="ANONYMOUS", # replace with your own - project="common/pytorch-lightning-integration", # format "" + project="common/lightning-integration", # format "" tags=["training", "resnet"], # optional ) trainer = Trainer(max_epochs=10, logger=neptune_logger) @@ -157,7 +157,7 @@ def any_lightning_module_function_or_hook(self): .. code-block:: python - neptune_logger = NeptuneLogger(project="common/pytorch-lightning-integration") + neptune_logger = NeptuneLogger(project="common/lightning-integration") trainer = pl.Trainer(logger=neptune_logger) model = ... @@ -182,7 +182,7 @@ def any_lightning_module_function_or_hook(self): .. code-block:: python - neptune_logger = NeptuneLogger(project="common/pytorch-lightning-integration", log_model_checkpoints=False) + neptune_logger = NeptuneLogger(project="common/lightning-integration", log_model_checkpoints=False) **Pass additional parameters to the Neptune run** @@ -194,7 +194,7 @@ def any_lightning_module_function_or_hook(self): from pytorch_lightning.loggers import NeptuneLogger neptune_logger = NeptuneLogger( - project="common/pytorch-lightning-integration", + project="common/lightning-integration", name="lightning-run", description="mlp quick run with pytorch-lightning", tags=["mlp", "quick-run"], @@ -216,10 +216,10 @@ def any_lightning_module_function_or_hook(self): See Also: - Read about `what object you can log to Neptune `_. - - Check `example run `_ + - Check `example run `_ with multiple types of metadata logged. - For more detailed info check - `user guide `_. + `user guide `_. Args: api_key: Optional. 
@@ -350,7 +350,7 @@ def _verify_input_arguments( " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" "The NeptuneLogger was re-written to use the neptune.new Python API\n" " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" "You should use arguments accepted by either NeptuneLogger.init() or neptune.init()" ) @@ -377,7 +377,7 @@ def _verify_input_arguments( " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" "The NeptuneLogger was re-written to use the neptune.new Python API\n" " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" ) # check if user passed redundant neptune.init arguments when passed run @@ -477,7 +477,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # neptune_logger = NeptuneLogger( api_key="ANONYMOUS", - project="common/pytorch-lightning-integration" + project="common/lightning-integration" ) neptune_logger.log_hyperparams(PARAMS) @@ -627,7 +627,7 @@ def _signal_deprecated_api_usage(f_name, sample_code, raise_exception=False): f" - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" f"The NeptuneLogger was re-written to use the neptune.new Python API\n" f" - https://neptune.ai/blog/neptune-new\n" - f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" f"Instead of `logger.{f_name}` you can use:\n" f"\t{sample_code}" ) diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 2ec42fa0acbb5..46ea5ba631d10 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -24,8 +24,10 @@ import pytorch_lightning as pl from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop +from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _OUT_DICT, _ResultCollection from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -36,7 +38,6 @@ DataLoaderIterDataFetcher, InterBatchParallelDataFetcher, ) -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.types import EPOCH_OUTPUT @@ -161,14 +162,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: self._has_run = True def on_advance_start(self, *args: Any, **kwargs: Any) -> None: - dataloader = self.current_dataloader - if ( - dataloader is not None - and getattr(dataloader, "sampler", None) - and callable(getattr(dataloader.sampler, "set_epoch", None)) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - 
dataloader.sampler.set_epoch(self.trainer.fit_loop.epoch_progress.current.processed) + if self.current_dataloader is not None: + _set_sampler_epoch(self.current_dataloader, self.trainer.fit_loop.epoch_progress.current.processed) super().on_advance_start(*args, **kwargs) diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index a14a218ef67e9..36648b7f43e34 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -5,6 +5,7 @@ from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop from pytorch_lightning.loops.epoch.prediction_epoch_loop import PredictionEpochLoop +from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _PREDICT_OUTPUT @@ -87,13 +88,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: """Predicts one entire dataloader.""" void(*args, **kwargs) dataloader = self.current_dataloader - if ( - dataloader is not None - and getattr(dataloader, "sampler", None) - and callable(getattr(dataloader.sampler, "set_epoch", None)) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - dataloader.sampler.set_epoch(self.trainer.fit_loop.epoch_progress.current.processed) + if dataloader is not None: + _set_sampler_epoch(dataloader, self.trainer.fit_loop.epoch_progress.current.processed) dataloader = self.trainer.strategy.process_dataloader(dataloader) dataloader_iter = enumerate(dataloader) dl_max_batches = self.max_batches[self.current_dataloader_idx] diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 32fd893b759ee..9136cfc9d7f6a 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -195,7 +195,7 @@ def _reload_dataloader_state_dict(self, data_fetcher: AbstractDataFetcher) -> No if isinstance(dataloader, CombinedLoader): raise MisconfigurationException( "Reloading support hasn't been implemented for `CombinedLoader`. You can request it by opening an issue" - " in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " in `https://github.com/Lightning-AI/lightning/issues`." 
) assert isinstance(dataloader, DataLoader) _reload_dataloader_state_dict(dataloader, self._dataloader_state_dict) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index b6887a4cf546c..395471861a289 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -281,6 +281,7 @@ def teardown(self) -> None: def on_save_checkpoint(self) -> Dict: state_dict = super().on_save_checkpoint() + state_dict["_batches_that_stepped"] = self._batches_that_stepped if ( self.trainer is not None @@ -300,6 +301,7 @@ def on_save_checkpoint(self) -> Dict: def on_load_checkpoint(self, state_dict: Dict) -> None: # cache the dataloader state dict until the dataloader objects are available self._dataloader_state_dict = state_dict.get("dataloader_state_dict") + self._batches_that_stepped = state_dict.get("_batches_that_stepped", 0) def _run_validation(self) -> None: # reload dataloaders diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index ac33390a97cec..0771a4a71de9f 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -21,7 +21,7 @@ from pytorch_lightning.loops import Loop from pytorch_lightning.loops.epoch import TrainingEpochLoop from pytorch_lightning.loops.epoch.training_epoch_loop import _OUTPUTS_TYPE as _EPOCH_OUTPUTS_TYPE -from pytorch_lightning.loops.utilities import _is_max_limit_reached +from pytorch_lightning.loops.utilities import _is_max_limit_reached, _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.trainer.supporters import TensorRunningAccum @@ -232,11 +232,8 @@ def on_advance_start(self) -> None: # type: ignore[override] # reset outputs here instead of in `reset` as they are not accumulated between epochs self._outputs = [] - if self.trainer.train_dataloader is not None and callable( - getattr(self.trainer.train_dataloader.sampler, "set_epoch", None) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - self.trainer.train_dataloader.sampler.set_epoch(self.epoch_progress.current.processed) + if self.trainer.train_dataloader is not None: + _set_sampler_epoch(self.trainer.train_dataloader, self.epoch_progress.current.processed) # changing gradient according accumulation_scheduler self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module) diff --git a/pytorch_lightning/loops/utilities.py b/pytorch_lightning/loops/utilities.py index d84c195d758f9..15142be626587 100644 --- a/pytorch_lightning/loops/utilities.py +++ b/pytorch_lightning/loops/utilities.py @@ -21,6 +21,7 @@ import numpy as np import torch from torch.optim import Optimizer +from torch.utils.data import DataLoader import pytorch_lightning as pl from pytorch_lightning.loops import Loop @@ -228,3 +229,16 @@ def _reset_progress(loop: Loop) -> None: def _v1_8_output_format(fx: Callable) -> bool: parameters = inspect.signature(fx).parameters return "new_format" in parameters and parameters["new_format"].default is True + + +def _set_sampler_epoch(dataloader: DataLoader, epoch: int) -> None: + """Calls the ``set_epoch`` method on either the sampler or the batch sampler of the given dataloader. 
+ + Every PyTorch dataloader has either a sampler or a batch sampler, and if it is wrapped by a + :class:`~torch.utils.data.distributed.DistributedSampler`, ``set_epoch`` must be called at the beginning + of every epoch to ensure shuffling applies a new ordering. This has no effect if shuffling is off. + """ + for sampler_name in ("sampler", "batch_sampler"): + sampler = getattr(dataloader, sampler_name, None) + if sampler is not None and callable(getattr(sampler, "set_epoch", None)): + sampler.set_epoch(epoch) diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index c33bed60902a1..9ab860774cf31 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -11,17 +11,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import operator + import torch.nn as nn import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _module_available +from pytorch_lightning.utilities.imports import _compare_version, _IS_WINDOWS + +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") +_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") +_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") -LightningShardedDataParallel = None if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - class LightningShardedDataParallel(_LightningModuleWrapperBase): # type: ignore[no-redef] + class LightningShardedDataParallel(_LightningModuleWrapperBase): # Just do this for later docstrings pass @@ -31,3 +37,7 @@ def unwrap_lightning_module_sharded(wrapped_model: nn.Module) -> "pl.LightningMo model = model.module return unwrap_lightning_module(model) + +else: + LightningShardedDataParallel = ... # type: ignore[assignment,misc] + unwrap_lightning_module_sharded = ... # type: ignore[assignment] diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index ec09b7723132d..26d8ca425bb4d 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -10,10 +10,10 @@ _pickler = pickle.Pickler _unpickler = pickle.Unpickler - +_TORCH_DIST_AVAILABLE = torch.distributed.is_available() logger = logging.getLogger(__name__) -if torch.distributed.is_available(): +if _TORCH_DIST_AVAILABLE: from torch._C._distributed_c10d import ProcessGroup from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember @@ -157,7 +157,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): object_list[i] = _tensor_to_object(obj_view, obj_size) -if not torch.distributed.is_available(): +if not _TORCH_DIST_AVAILABLE: # avoid failures on early PyTorch versions for Windows where # not all functions used in `broadcast_object_list` are available. 
def _broadcast_noop(obj, *_, **__): diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py index be10bf967ab05..8791249e7d90c 100644 --- a/pytorch_lightning/plugins/io/torch_plugin.py +++ b/pytorch_lightning/plugins/io/torch_plugin.py @@ -54,7 +54,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio atomic_save(checkpoint, path) except AttributeError as err: # todo (sean): is this try catch necessary still? - # https://github.com/PyTorchLightning/pytorch-lightning/pull/431 + # https://github.com/Lightning-AI/lightning/pull/431 key = pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY checkpoint.pop(key, None) rank_zero_warn(f"Warning, `{key}` dropped from checkpoint. An attribute is not picklable: {err}") diff --git a/pytorch_lightning/plugins/precision/ipu.py b/pytorch_lightning/plugins/precision/ipu.py index 9df0edb53913b..a299be9a730a5 100644 --- a/pytorch_lightning/plugins/precision/ipu.py +++ b/pytorch_lightning/plugins/precision/ipu.py @@ -72,7 +72,7 @@ def optimizer_step( # we lack coverage here and IPUs are (currently) limited - something to explore if there's demand raise MisconfigurationException( "Skipping backward by returning `None` from your `training_step` is not implemented for IPUs." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) return closure_result diff --git a/pytorch_lightning/plugins/precision/sharded_native_amp.py b/pytorch_lightning/plugins/precision/sharded_native_amp.py index e40aea8ecf4eb..a1484559e5ce4 100644 --- a/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -15,8 +15,8 @@ import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException if _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/precision/tpu.py b/pytorch_lightning/plugins/precision/tpu.py index 1afd34264c60c..a0ed9de0a4239 100644 --- a/pytorch_lightning/plugins/precision/tpu.py +++ b/pytorch_lightning/plugins/precision/tpu.py @@ -46,7 +46,7 @@ def optimizer_step( # we lack coverage here so disable this - something to explore if there's demand raise MisconfigurationException( "Skipping backward by returning `None` from your `training_step` is not implemented for TPUs." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." 
) return closure_result diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 2df3c7946b4d9..82a8b85553232 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -59,7 +59,7 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: # drop images from readme text = text.replace("![PT to PL](docs/source/_static/images/general/pl_quick_start_full_compressed.gif)", "") - # https://github.com/PyTorchLightning/pytorch-lightning/raw/master/docs/source/_static/images/lightning_module/pt_to_pl.png + # https://github.com/Lightning-AI/lightning/raw/master/docs/source/_static/images/lightning_module/pt_to_pl.png github_source_url = os.path.join(homepage, "raw", version) # replace relative repository path to absolute link to the release # do not replace all "docs" as in the readme we reger some other sources with particular path to docs @@ -81,7 +81,7 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: # todo: wrap content as commented description text = re.sub(rf"{skip_begin}.+?{skip_end}", "", text, flags=re.IGNORECASE + re.DOTALL) - # # https://github.com/Borda/pytorch-lightning/releases/download/1.1.0a6/codecov_badge.png + # # https://github.com/Borda/lightning/releases/download/1.1.0a6/codecov_badge.png # github_release_url = os.path.join(homepage, "releases", "download", version) # # download badge and replace url with local file # text = _parse_for_badge(text, github_release_url) diff --git a/pytorch_lightning/strategies/bagua.py b/pytorch_lightning/strategies/bagua.py index 6b6598c790b36..9bf619aac8594 100644 --- a/pytorch_lightning/strategies/bagua.py +++ b/pytorch_lightning/strategies/bagua.py @@ -19,10 +19,12 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _BAGUA_AVAILABLE +from pytorch_lightning.utilities.imports import _package_available from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.seed import reset_seed +_BAGUA_AVAILABLE = _package_available("bagua") + if _BAGUA_AVAILABLE: import bagua.torch_api as bagua from bagua.torch_api.algorithms import Algorithm diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 453c821b3f59a..16caaffc0819f 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -30,6 +30,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -50,7 +51,6 @@ ) from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from pytorch_lightning.utilities.imports import ( - _FAIRSCALE_AVAILABLE, _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py index e3b470cd90363..1ccda976d810c 100644 --- a/pytorch_lightning/strategies/deepspeed.py +++ 
b/pytorch_lightning/strategies/deepspeed.py @@ -356,6 +356,8 @@ def setup_distributed(self): def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) + # we set the device so that optimizers can be created with distributed comms. + self.lightning_module._device = self.root_device self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index 971441160d333..7dea5a3f0d9ad 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -18,12 +18,12 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device diff --git a/pytorch_lightning/strategies/launchers/spawn.py b/pytorch_lightning/strategies/launchers/spawn.py index d67f9e620a45d..fe2c7763f82c9 100644 --- a/pytorch_lightning/strategies/launchers/spawn.py +++ b/pytorch_lightning/strategies/launchers/spawn.py @@ -53,7 +53,7 @@ def __init__(self, strategy: Strategy) -> None: def is_interactive_compatible(self) -> bool: # The start method 'spawn' is currently the only one that works with DDP and CUDA support # The start method 'fork' is the only one supported in Jupyter environments but not compatible with CUDA - # For more context, see https://github.com/PyTorchLightning/pytorch-lightning/issues/7550 + # For more context, see https://github.com/Lightning-AI/lightning/issues/7550 return self._start_method == "fork" and self._strategy.root_device.type != "cuda" def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any: diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 8a76520755345..06fc8c0f34a42 100644 --- a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -20,11 +20,11 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index 58ad47f464bfc..40b5f359fe9c5 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ 
b/pytorch_lightning/strategies/sharded_spawn.py @@ -19,10 +19,10 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index d242236e8317e..e795358ef3420 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -221,7 +221,7 @@ def _init_deterministic(self, deterministic: Optional[bool]) -> None: torch.use_deterministic_algorithms(self.deterministic) if self.deterministic: # fixing non-deterministic part of horovod - # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 + # https://github.com/Lightning-AI/lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = "0" # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility @@ -605,7 +605,7 @@ def _check_strategy_and_fallback(self) -> None: if _TPU_AVAILABLE: raise MisconfigurationException( "`accelerator='ddp_cpu'` is not supported on TPU machines. " - "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" + "Learn more: https://github.com/Lightning-AI/lightning/issues/7810" ) if self._devices_flag == 1 and self._num_nodes_flag > 1: strategy_flag = DDPStrategy.strategy_name @@ -725,7 +725,7 @@ def _validate_precision_choice(self) -> None: if self._precision_flag == 64: raise MisconfigurationException( "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) if self._precision_plugin_flag and not isinstance( diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index dadecef006278..49ef52614e8f6 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -182,7 +182,7 @@ def check_logging(cls, fx_name: str) -> None: if fx_name not in cls.functions: raise RuntimeError( f"Logging inside `{fx_name}` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`." ) if cls.functions[fx_name] is None: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index f7f708fdd1fd6..744921c7c28f8 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -125,7 +125,7 @@ def __post_init__(self) -> None: def _parse_reduce_fx(self) -> None: error = ( "Only `self.log(..., reduce_fx={min,max,mean,sum})` are currently supported." 
- " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`." f" Found: {self.reduce_fx}" ) if isinstance(self.reduce_fx, str): diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index b8f688892b318..6d3ec88b0be6a 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -438,9 +438,14 @@ class DataLoaderDict(dict): @property def sampler(self) -> Union[Iterable, Sequence, Mapping]: - """Return a collections of samplers extracting from loaders.""" + """Return a collections of samplers extracted from loaders.""" return apply_to_collection(self.loaders, (DataLoader, IterableDataset), getattr, "sampler", None) + @property + def batch_sampler(self) -> Union[Iterable, Sequence, Mapping]: + """Return a collections of batch samplers extracted from loaders.""" + return apply_to_collection(self.loaders, (DataLoader, IterableDataset), getattr, "batch_sampler", None) + def _wrap_loaders_max_size_cycle(self) -> Any: """Wraps all loaders to make sure they are cycled until the longest loader is exhausted. diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 87947ac9a10f3..b5b70e7220a8d 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -29,11 +29,7 @@ from pytorch_lightning.utilities.grads import grad_norm # noqa: F401 from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, - _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, _GROUP_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, @@ -46,7 +42,6 @@ _module_available, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, - _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 3647dbedd11ee..b9884b29cb999 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -87,7 +87,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 -_PYTHON_GREATER_EQUAL_3_8_0 = Version(platform.python_version()) >= Version("3.8.0") +_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") @@ -97,14 +97,10 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0", use_base_version=True) _APEX_AVAILABLE = _module_available("apex.amp") -_BAGUA_AVAILABLE = _package_available("bagua") _DEEPSPEED_AVAILABLE = _package_available("deepspeed") _DEEPSPEED_GREATER_EQUAL_0_5_9 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.5.9") _DEEPSPEED_GREATER_EQUAL_0_6 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.6.0") _DOCSTRING_PARSER_AVAILABLE = _package_available("docstring_parser") -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") 
-_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") -_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available("torch.distributed.group") _HOROVOD_AVAILABLE = _module_available("horovod.torch") _HYDRA_AVAILABLE = _package_available("hydra") @@ -116,7 +112,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _OMEGACONF_AVAILABLE = _package_available("omegaconf") _POPTORCH_AVAILABLE = _package_available("poptorch") _HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") -_RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") + _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) _TORCHTEXT_AVAILABLE = _package_available("torchtext") _TORCHTEXT_LEGACY: bool = _TORCHTEXT_AVAILABLE and _compare_version("torchtext", operator.lt, "0.11.0") diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index a5edcfb300188..ef2327d81119e 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -13,6 +13,7 @@ # limitations under the License. import importlib import inspect +import operator import threading from contextlib import contextmanager from functools import partial @@ -27,9 +28,12 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 +from pytorch_lightning.utilities.imports import _compare_version from pytorch_lightning.utilities.rank_zero import rank_zero_warn +# this is needed for proper generating Meta package +_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") + if _TORCH_GREATER_EQUAL_1_10: from torch._C import _DisableTorchDispatch # type: ignore[attr-defined] diff --git a/pytorch_lightning/utilities/migration.py b/pytorch_lightning/utilities/migration.py index 30cc823210423..ed71f25a571f7 100644 --- a/pytorch_lightning/utilities/migration.py +++ b/pytorch_lightning/utilities/migration.py @@ -28,7 +28,7 @@ class pl_legacy_patch: unpickling old checkpoints. The following patches apply. 1. ``pytorch_lightning.utilities.argparse._gpus_arg_default``: Applies to all checkpoints saved prior to - version 1.2.8. See: https://github.com/PyTorchLightning/pytorch-lightning/pull/6898 + version 1.2.8. See: https://github.com/Lightning-AI/lightning/pull/6898 2. ``pytorch_lightning.utilities.argparse_utils``: A module that was deprecated in 1.2 and removed in 1.4, but still needs to be available for import for legacy checkpoints. 
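A note on the `_PYTHON_GREATER_EQUAL_3_8_0` hunk in `pytorch_lightning/utilities/imports.py` above: the flag is now computed from `sys.version_info` rather than by parsing `platform.python_version()` with `packaging.Version`, which may reject interpreter builds whose reported version string is not valid PEP 440 (for example a vendor suffix such as `3.8.10+`). A minimal standalone sketch of the new check, assuming nothing beyond the standard library:

```python
import sys

# Tuple comparison, mirroring the new flag in imports.py: no version-string
# parsing is involved, so a vendor-specific suffix in the reported version
# cannot make the check raise or compare incorrectly.
_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8)

if __name__ == "__main__":
    print(f"Python {sys.version_info.major}.{sys.version_info.minor} >= 3.8:", _PYTHON_GREATER_EQUAL_3_8_0)
```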
diff --git a/pytorch_lightning/utilities/model_summary.py b/pytorch_lightning/utilities/model_summary.py index f0419e4b97077..9c5ff088da368 100644 --- a/pytorch_lightning/utilities/model_summary.py +++ b/pytorch_lightning/utilities/model_summary.py @@ -15,7 +15,7 @@ import contextlib import logging from collections import OrderedDict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, cast, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -120,7 +120,9 @@ def layer_type(self) -> str: @property def num_parameters(self) -> int: """Returns the number of parameters in this module.""" - return sum(np.prod(p.shape) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) + return sum( + cast(int, np.prod(p.shape)) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters() + ) class ModelSummary: diff --git a/requirements/base.txt b/requirements/base.txt index 555997c6576e6..768da61c48fd6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ tqdm>=4.57.0, <=4.63.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <=2022.2.0 tensorboard>=2.2.0, <2.10.0 -torchmetrics>=0.4.1, <=0.7.2 +torchmetrics>=0.4.1, <0.9.2 pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.2.1 diff --git a/requirements/extra.txt b/requirements/extra.txt index cef58c6c21221..8162eed3f8518 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -6,4 +6,4 @@ omegaconf>=2.0.5, <=2.1.* hydra-core>=1.0.5, <=1.1.* jsonargparse[signatures]>=4.7.1, <4.7.4 gcsfs>=2021.5.0, <=2022.2.0 -rich>=10.2.2,!=10.15.*, <=12.0.0 +rich>=10.2.2, !=10.15.0.a, <13.0.0 diff --git a/setup.py b/setup.py index 57f59045ed256..2627ebdc6eca8 100755 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ def _load_py_module(fname, pkg="pytorch_lightning"): author=about.__author__, author_email=about.__author_email__, url=about.__homepage__, - download_url="https://github.com/PyTorchLightning/pytorch-lightning", + download_url="https://github.com/Lightning-AI/lightning", license=about.__license__, packages=find_packages(exclude=["tests*", "pl_examples*", "legacy*"]), include_package_data=True, @@ -82,9 +82,9 @@ def _load_py_module(fname, pkg="pytorch_lightning"): install_requires=setup_tools._load_requirements(_PATH_REQUIRE), extras_require=extras, project_urls={ - "Bug Tracker": "https://github.com/PyTorchLightning/pytorch-lightning/issues", - "Documentation": "https://pytorch-lightning.rtfd.io/en/latest/", - "Source Code": "https://github.com/PyTorchLightning/pytorch-lightning", + "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", + "Documentation": "https://lightning.rtfd.io/en/latest/", + "Source Code": "https://github.com/Lightning-AI/lightning", }, classifiers=[ "Environment :: Console", diff --git a/tests/README.md b/tests/README.md index 105aed20004ef..278dd9fe45ea0 100644 --- a/tests/README.md +++ b/tests/README.md @@ -22,8 +22,7 @@ pre-commit install Additionally, for testing backward compatibility with older versions of PyTorch Lightning, you also need to download all saved version-checkpoints from the public AWS storage. 
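The README hunk just below swaps the manual `wget`/`unzip` steps for a single helper script. A hedged sketch of what that helper needs to do, inferred only from the commands it replaces (the actual `.actions/pull_legacy_checkpoints.sh`, added elsewhere in this patch, may differ):

```bash
#!/bin/bash
# Hypothetical reconstruction of the legacy-checkpoint pull step: fetch the
# public archive and unpack it into legacy/, exactly as the removed README
# commands did.
mkdir -p legacy
wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/
unzip -o legacy/checkpoints.zip -d legacy/
```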
Run the following script to get all saved version-checkpoints: ```bash -wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ -unzip -o legacy/checkpoints.zip -d legacy/ +bash .actions/pull_legacy_checkpoints.sh ``` Note: These checkpoints are generated to set baselines for maintaining backward compatibility with legacy versions of PyTorch Lightning. Details of checkpoints for back-compatibility can be found [here](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/legacy/README.md). diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 180e7c46fe7d4..4fe9d300c3596 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -299,7 +299,10 @@ def assert_device(device: torch.device) -> None: @RunIf(min_torch="1.10", skip_windows=True) def test_sharded_tensor_state_dict(single_process_pg): - from torch.distributed._sharded_tensor import empty as sharded_tensor_empty + if _TORCH_GREATER_EQUAL_1_11: + from torch.distributed._shard.sharded_tensor import empty as sharded_tensor_empty + else: + from torch.distributed._sharded_tensor import empty as sharded_tensor_empty from torch.distributed._sharding_spec import ChunkShardingSpec class BoringModelWithShardedTensor(BoringModel): diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 5a2464f6fd6ba..ed1b8ee7bc044 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -20,17 +20,16 @@ from packaging.version import Version from pkg_resources import get_distribution +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE +from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, _IPU_AVAILABLE, _OMEGACONF_AVAILABLE, - _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, _TORCH_QUANTIZE_AVAILABLE, _TPU_AVAILABLE, diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 87e52159b61d6..de9329564292b 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -47,7 +47,7 @@ class Run: def __setitem__(self, key, value): # called once - assert key == "source_code/integrations/pytorch-lightning" + assert key == "source_code/integrations/lightning" assert value == __version__ def wait(self): @@ -89,7 +89,7 @@ def test_neptune_online(self, neptune): self.assertEqual(created_run_mock.__getitem__.call_count, 2) self.assertEqual(created_run_mock.__setitem__.call_count, 1) created_run_mock.__getitem__.assert_has_calls([call("sys/id"), call("sys/name")], any_order=True) - created_run_mock.__setitem__.assert_called_once_with("source_code/integrations/pytorch-lightning", __version__) + created_run_mock.__setitem__.assert_called_once_with("source_code/integrations/lightning", __version__) @patch("pytorch_lightning.loggers.neptune.Run", Run) def test_online_with_custom_run(self, neptune): diff --git a/tests/loops/test_evaluation_loop.py b/tests/loops/test_evaluation_loop.py index 137608c426ee0..bddf819aafdd6 100644 --- a/tests/loops/test_evaluation_loop.py +++ b/tests/loops/test_evaluation_loop.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from unittest import mock -from unittest.mock import Mock +from unittest.mock import call, Mock import torch from torch.utils.data.dataloader import DataLoader -from torch.utils.data.sampler import RandomSampler +from torch.utils.data.sampler import BatchSampler, RandomSampler from pytorch_lightning import Trainer from pytorch_lightning.loops import EvaluationEpochLoop @@ -44,9 +44,8 @@ def test_on_evaluation_epoch_end(eval_epoch_end_mock, tmpdir): assert eval_epoch_end_mock.call_count == 4 -def test_set_epoch_called_eval_predict(tmpdir): - """Tests that set_epoch (if the sampler has one) is called on the DataLoader during evaluation and - prediction.""" +def test_evaluation_loop_sampler_set_epoch_called(tmpdir): + """Tests that set_epoch is called on the dataloader's sampler (if any) during training and validation.""" def _get_dataloader(): dataset = RandomDataset(32, 64) @@ -56,20 +55,60 @@ def _get_dataloader(): model = BoringModel() trainer = Trainer( - default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=2, enable_model_summary=False + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + logger=False, + ) + + train_dataloader = _get_dataloader() + val_dataloader = _get_dataloader() + trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) + # One for each epoch + assert train_dataloader.sampler.set_epoch.call_args_list == [call(0), call(1)] + # One for each epoch + sanity check + assert val_dataloader.sampler.set_epoch.call_args_list == [call(0), call(0), call(1)] + + val_dataloader = _get_dataloader() + trainer.validate(model, val_dataloader) + assert val_dataloader.sampler.set_epoch.call_args_list == [call(2)] + + +def test_evaluation_loop_batch_sampler_set_epoch_called(tmpdir): + """Tests that set_epoch is called on the dataloader's batch sampler (if any) during training and validation.""" + + def _get_dataloader(): + dataset = RandomDataset(32, 64) + sampler = RandomSampler(dataset) + batch_sampler = BatchSampler(sampler, 2, True) + batch_sampler.set_epoch = Mock() + return DataLoader(dataset, batch_sampler=batch_sampler) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + logger=False, ) train_dataloader = _get_dataloader() val_dataloader = _get_dataloader() trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) # One for each epoch - assert train_dataloader.sampler.set_epoch.call_count == 2 + assert train_dataloader.batch_sampler.set_epoch.call_args_list == [call(0), call(1)] # One for each epoch + sanity check - assert val_dataloader.sampler.set_epoch.call_count == 3 + assert val_dataloader.batch_sampler.set_epoch.call_args_list == [call(0), call(0), call(1)] val_dataloader = _get_dataloader() trainer.validate(model, val_dataloader) - assert val_dataloader.sampler.set_epoch.call_count == 1 + assert val_dataloader.batch_sampler.set_epoch.call_args_list == [call(2)] @mock.patch( diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index 1e67fcc0ed8db..f9630095502d1 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -47,7 +47,7 @@ def test_loops_state_dict_structure(): expected = { "fit_loop": { "state_dict": {}, - "epoch_loop.state_dict": {}, + "epoch_loop.state_dict": 
{"_batches_that_stepped": 0}, "epoch_loop.batch_progress": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, diff --git a/tests/loops/test_utilities.py b/tests/loops/test_utilities.py index c5d2e98d008b0..914c1de8e115b 100644 --- a/tests/loops/test_utilities.py +++ b/tests/loops/test_utilities.py @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest.mock import Mock + import pytest import torch -from pytorch_lightning.loops.utilities import _extract_hiddens, _v1_8_output_format +from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch, _v1_8_output_format from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,3 +63,23 @@ def training_epoch_end(outputs, new_format=True): ... assert _v1_8_output_format(training_epoch_end) + + +def test_set_sampler_epoch(): + # No samplers + dataloader = Mock() + dataloader.sampler = None + dataloader.batch_sampler = None + _set_sampler_epoch(dataloader, 55) + + # set_epoch not callable + dataloader = Mock() + dataloader.sampler.set_epoch = None + dataloader.batch_sampler.set_epoch = None + _set_sampler_epoch(dataloader, 55) + + # set_epoch callable + dataloader = Mock() + _set_sampler_epoch(dataloader, 55) + dataloader.sampler.set_epoch.assert_called_once_with(55) + dataloader.batch_sampler.set_epoch.assert_called_once_with(55) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 136e8ee516bbb..d0519de90e7a2 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -259,6 +259,7 @@ def on_train_start(self) -> None: trainer.fit(TestModel(), ckpt_path=ckpt_path) assert trainer.current_epoch == max_epochs assert trainer.global_step == max_epochs * train_batches + assert trainer.fit_loop.epoch_loop._batches_that_stepped == max_epochs * train_batches def test_fit_twice(tmpdir): diff --git a/tests/plugins/precision/test_sharded_precision.py b/tests/plugins/precision/test_sharded_precision.py index 754095912fb53..8fde1946459b2 100644 --- a/tests/plugins/precision/test_sharded_precision.py +++ b/tests/plugins/precision/test_sharded_precision.py @@ -15,8 +15,8 @@ import pytest import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from tests.helpers.runif import RunIf ShardedGradScaler = None diff --git a/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 2912d59598220..0a26236acecdd 100644 --- a/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -7,9 +7,9 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from 
tests.helpers.runif import RunIf diff --git a/tests/strategies/test_deepspeed_strategy.py b/tests/strategies/test_deepspeed_strategy.py index f3c99203d70eb..c829b203f3846 100644 --- a/tests/strategies/test_deepspeed_strategy.py +++ b/tests/strategies/test_deepspeed_strategy.py @@ -1199,3 +1199,26 @@ def training_step(self, *args, **kwargs): ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filepath) expected = {"latest", "zero_to_fp32.py", "checkpoint"} assert expected == set(os.listdir(ckpt_path)) + + +@RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_configure_optimizer_device_set(tmpdir): + """Test to ensure that the LM has access to the device within the ``configure_optimizer`` function, and + estimated_stepping_batches works correctly as a result.""" + + class TestModel(BoringModel): + def configure_optimizers(self): + assert self.trainer.estimated_stepping_batches == 1 + assert self.device.type == "cuda" + raise SystemExit + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="gpu", + devices=2, + strategy=DeepSpeedStrategy(), + ) + with pytest.raises(SystemExit): + trainer.fit(model) diff --git a/tests/strategies/test_sharded_strategy.py b/tests/strategies/test_sharded_strategy.py index 8a1313e5a6a45..dff7ca0a0d75d 100644 --- a/tests/strategies/test_sharded_strategy.py +++ b/tests/strategies/test_sharded_strategy.py @@ -6,9 +6,9 @@ import torch from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index 4457aba18e796..d41044240fa92 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -25,11 +25,12 @@ import torch from pytorch_lightning import callbacks, Trainer +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.loops.dataloader import EvaluationLoop from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _RICH_AVAILABLE +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py index 108ea323ecd89..bbd2d61f3d03d 100644 --- a/tests/trainer/test_dataloaders.py +++ b/tests/trainer/test_dataloaders.py @@ -19,7 +19,7 @@ import torch from torch.utils.data import RandomSampler from torch.utils.data.dataloader import DataLoader -from torch.utils.data.dataset import Dataset, IterableDataset, Subset +from torch.utils.data.dataset import Dataset, IterableDataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import SequentialSampler @@ -855,53 +855,6 @@ def test_dataloader_distributed_sampler_already_attached(tmpdir): assert trainer.state.finished, "DDP Training failed" 
-@RunIf(min_gpus=3) -def test_batch_size_smaller_than_num_gpus(tmpdir): - # we need at least 3 gpus for this test - num_gpus = 3 - batch_size = 3 - - class CurrentTestModel(BoringModel): - def __init__(self, batch_size) -> None: - super().__init__() - self.save_hyperparameters() - # batch norm doesn't work with batch size 1, we replace it - self.c_d1_bn = torch.nn.ReLU() - - def training_step(self, *args, **kwargs): - output = super().training_step(*args, **kwargs) - loss = output["loss"] - # we make sure to add some metrics to the output dict, - # this is essential for this test - output["progress_bar"] = {"train_loss": loss} - return output - - def train_dataloader(self): - dataset = RandomDataset(32, 64) - # construct a dataset with a size that is not divisible by num_gpus - # therefore the last batch will have a size < num_gpus - size = num_gpus * self.hparams.batch_size + (num_gpus - 1) - dataset = Subset(dataset, range(size)) - dataloader = DataLoader(dataset, batch_size=self.hparams.batch_size, drop_last=False) - return dataloader - - model = CurrentTestModel(batch_size=batch_size) - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=0.1, - limit_val_batches=0, - accelerator="gpu", - devices=num_gpus, - ) - - # we expect the reduction for the metrics also to happen on the last batch - # where we will get fewer metrics than gpus - trainer.fit(model) - assert trainer.state.finished, f"Training failed with {trainer.state}" - - @pytest.mark.parametrize( ["multiple_trainloader_mode", "num_training_batches"], [("min_size", 16), ("max_size_cycle", 64)], diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py index aa40f71da4982..629517d9c51f9 100644 --- a/tests/utilities/test_imports.py +++ b/tests/utilities/test_imports.py @@ -13,11 +13,11 @@ # limitations under the License. import operator +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE +from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, _HOROVOD_AVAILABLE, _module_available, _OMEGACONF_AVAILABLE,