From e95b3e8ca20881cbb8d27e98363ba740b27d946c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 2 Jun 2022 14:54:13 +0200 Subject: [PATCH 01/24] update NGC docker (#13136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update docker * Apply suggestions from code review Co-authored-by: Akihiro Nitta Co-authored-by: Carlos Mocholí --- dockers/nvidia/Dockerfile | 15 +++++++-------- dockers/release/Dockerfile | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile index 792835c574ada..6848f6cf34eae 100644 --- a/dockers/nvidia/Dockerfile +++ b/dockers/nvidia/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG PYTORCH_VERSION=21.11 +ARG PYTORCH_VERSION=22.04 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes FROM nvcr.io/nvidia/pytorch:${PYTORCH_VERSION}-py3 @@ -37,20 +37,19 @@ RUN \ cd .. ; \ fi && \ # save the examples - mv pytorch-lightning/_notebooks notebooks && \ + mv pytorch-lightning/_notebooks/.notebooks/ notebooks && \ mv pytorch-lightning/pl_examples . && \ # Installations \ pip install -q fire && \ - python ./pytorch-lightning/.actions/assistant.py requirements_prune_pkgs horovod --req_files ./pytorch-lightning/requirements/extra.txt && \ - pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \ - pip install ./pytorch-lightning --no-cache-dir && \ + pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir && \ + pip install ./pytorch-lightning["extra","loggers","strategies"] --no-cache-dir && \ + pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir && \ rm -rf pytorch-lightning && \ - pip install jupyterlab[all] -U && \ pip list +RUN pip install jupyterlab[all] -U + RUN pip install lightning-grid -U && \ pip install "py>=1.10" "protobuf>=3.15.6" --upgrade-strategy only-if-needed diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index f4083f2dd42fc..a0ba3a4a41c37 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -36,7 +36,7 @@ RUN \ mv pytorch-lightning-*/ pytorch-lightning ; \ rm *.zip ; \ fi && \ - pip install ./pytorch-lightning["extra"] --no-cache-dir && \ + pip install ./pytorch-lightning["extra","loggers","strategies"] --no-cache-dir && \ rm -rf pytorch-lightning RUN python --version && \ From fbdaf3810ea887bbc52224404aa221e1657a955e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 2 Jun 2022 22:39:14 +0900 Subject: [PATCH 02/24] Decouple pulling legacy checkpoints from existing GHA workflows and docker files (#13185) * Add pull-legacy-checkpoints action * Replace pulls with the new action and script * Simplify --- .actions/pull_legacy_checkpoints.sh | 9 +++++++++ .azure-pipelines/gpu-tests.yml | 5 +---- .github/workflows/ci_test-conda.yml | 9 ++------- .github/workflows/ci_test-full.yml | 10 ++-------- dockers/tpu-tests/Dockerfile | 5 +---- legacy/README.md | 3 +-- tests/README.md | 3 +-- 7 files changed, 17 insertions(+), 27 deletions(-) create mode 100644 .actions/pull_legacy_checkpoints.sh diff --git a/.actions/pull_legacy_checkpoints.sh 
b/.actions/pull_legacy_checkpoints.sh new file mode 100644 index 0000000000000..8b3f791297b66 --- /dev/null +++ b/.actions/pull_legacy_checkpoints.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Run this script from the project root. +URL="https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip" +mkdir -p legacy +# wget is simpler but does not work on Windows +python -c "from urllib.request import urlretrieve; urlretrieve('$URL', 'legacy/checkpoints.zip')" +ls -l legacy/ +unzip -o legacy/checkpoints.zip -d legacy/ +ls -l legacy/checkpoints/ diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index c0074adf4e81c..157bb1d535f9a 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -69,10 +69,7 @@ jobs: python requirements/check-avail-extras.py displayName: 'Env details' - - bash: | - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ - unzip -o legacy/checkpoints.zip -d legacy/ - ls -l legacy/checkpoints/ + - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' - bash: | diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index c907e13b4ac23..7e6b8842eab5d 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -55,13 +55,8 @@ jobs: # sanity check python requirements/check-avail-extras.py - - name: Pull checkpoints from S3 - working-directory: ./legacy - run: | - # enter legacy and update checkpoints from S3 - curl https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip --output checkpoints.zip - unzip -o checkpoints.zip - ls -l checkpoints/ + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - name: Tests run: | diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index f260c67069912..b1a1ac4c5d4a3 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -76,14 +76,8 @@ jobs: restore-keys: | ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - - name: Pull checkpoints from S3 - working-directory: ./legacy - run: | - # wget is simpler but does not work on Windows - python -c "from urllib.request import urlretrieve ; urlretrieve('https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip', 'checkpoints.zip')" - ls -l . 
- unzip -o checkpoints.zip - ls -l checkpoints/ + - name: Pull legacy checkpoints + run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies run: | diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index 602ef1684b859..d4c58c665e7a5 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -22,10 +22,7 @@ LABEL maintainer="PyTorchLightning " COPY ./ ./pytorch-lightning/ # Pull the legacy checkpoints -RUN cd pytorch-lightning && \ - wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ && \ - unzip -o legacy/checkpoints.zip -d legacy/ && \ - ls -l legacy/checkpoints/ +RUN cd pytorch-lightning && bash .actions/pull_legacy_checkpoints.sh RUN \ pip install -q fire && \ diff --git a/legacy/README.md b/legacy/README.md index efbd18f7eede6..68eb718a98b07 100644 --- a/legacy/README.md +++ b/legacy/README.md @@ -7,8 +7,7 @@ At this moment we focus on ability running old checkpoints, so the flow here is If you want to pull all saved version-checkpoints for local testing/development, call ```bash -wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -unzip -o checkpoints.zip +bash .actions/pull_legacy_checkpoints.sh ``` To back populate collection with past version you can use following bash: diff --git a/tests/README.md b/tests/README.md index 105aed20004ef..278dd9fe45ea0 100644 --- a/tests/README.md +++ b/tests/README.md @@ -22,8 +22,7 @@ pre-commit install Additionally, for testing backward compatibility with older versions of PyTorch Lightning, you also need to download all saved version-checkpoints from the public AWS storage. Run the following script to get all saved version-checkpoints: ```bash -wget https://pl-public-data.s3.amazonaws.com/legacy/checkpoints.zip -P legacy/ -unzip -o legacy/checkpoints.zip -d legacy/ +bash .actions/pull_legacy_checkpoints.sh ``` Note: These checkpoints are generated to set baselines for maintaining backward compatibility with legacy versions of PyTorch Lightning. Details of checkpoints for back-compatibility can be found [here](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/legacy/README.md). From 96d1f92b0293bcf7337426e7a845fd447abccf16 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 9 Jun 2022 03:36:52 +0900 Subject: [PATCH 03/24] Merge pull request #13250 from PyTorchLightning/ci/rm-base CI: Remove simple test `ci_test-base.yml` --- .github/workflows/ci_test-base.yml | 81 ----------------------------- .github/workflows/ci_test-conda.yml | 18 +++++-- .github/workflows/ci_test-full.yml | 11 ++-- 3 files changed, 20 insertions(+), 90 deletions(-) delete mode 100644 .github/workflows/ci_test-base.yml diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml deleted file mode 100644 index 221f92d26ad5e..0000000000000 --- a/.github/workflows/ci_test-base.yml +++ /dev/null @@ -1,81 +0,0 @@ -# this jobs runs `pytest` over the source directory. It does not install any extra dependencies. -# this is useful to catch errors where an import has been added which is not part of the basic dependencies. -name: Test simple - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the master branch - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -jobs: - source: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-20.04] - # this will install stable torch - python-version: [3.9] - - # lower timeout as this should run very quickly - timeout-minutes: 20 - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - name: Reset caching - run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - - # Note: This uses an internal pip API and may not always work - # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - - name: Get pip cache - id: pip-cache - run: python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - - - name: Cache pip - uses: actions/cache@v2 - with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements/base.txt') }} - restore-keys: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.requires }}- - - - name: Install dependencies - run: | - python --version - python -m pip install --upgrade --user pip - pip --version - pip install -r requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - pip install -r requirements/test.txt - pip list - shell: bash - - - name: Test Package [only] - run: | - coverage run --source pytorch_lightning -m pytest pytorch_lightning -v - - - name: Statistics - if: success() - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - if: always() - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: coverage.xml - flags: cpu,pytest - name: Base-coverage - fail_ci_if_error: false diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 7e6b8842eab5d..0466a83760338 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -35,15 +35,23 @@ jobs: - uses: actions/checkout@v2 - - name: Update dependencies + - name: Update base dependencies + run: | + conda info + conda list + pip install -r requirements/test.txt + + - name: DocTests + run: | + coverage run --source pytorch_lightning -m pytest pytorch_lightning + + - name: Update all dependencies env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 HOROVOD_WITHOUT_TENSORFLOW: 1 run: | set -e - conda info - conda list # adjust versions according installed Torch version python ./requirements/adjust-versions.py requirements/extra.txt python ./requirements/adjust-versions.py requirements/examples.txt @@ -58,9 +66,9 @@ jobs: - name: Pull legacy checkpoints run: bash .actions/pull_legacy_checkpoints.sh - - name: Tests + - name: UnitTests run: | - coverage run --source pytorch_lightning -m pytest --timeout 150 pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml + coverage run --source pytorch_lightning -m pytest --timeout 150 tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - name: Upload pytest results uses: 
actions/upload-artifact@v2 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index b1a1ac4c5d4a3..bca5699d43029 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -84,13 +84,13 @@ jobs: flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) url=$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) pip install -r requirements.txt --upgrade $flag --find-links "https://download.pytorch.org/whl/${url}" - # adjust versions according installed Torch version - python ./requirements/adjust-versions.py requirements/examples.txt - pip install -r requirements/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade pip install -r requirements/test.txt --upgrade pip list shell: bash + - name: DocTests + run: coverage run --source pytorch_lightning -m pytest pytorch_lightning + - name: Install extra dependencies run: | # adjust versions according installed Torch version @@ -126,13 +126,16 @@ jobs: run: | python requirements/check-avail-extras.py - - name: Tests + - name: UnitTests run: | # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Examples run: | + # adjust versions according installed Torch version + python ./requirements/adjust-versions.py requirements/examples.txt + pip install -r requirements/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade python -m pytest pl_examples -v --durations=10 - name: Upload pytest results From 7db3b9f67f191af847493ff32d241529971f7213 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 14 Jun 2022 17:17:57 +0000 Subject: [PATCH 04/24] Update rich requirement from !=10.15.*,<=12.0.0,>=10.2.2 to >=10.2.2,!=10.15.0.a,<13.0.0 in /requirements (#13047) * Update rich requirement in /requirements Updates the requirements on [rich](https://github.com/willmcgugan/rich) to permit the latest version. - [Release notes](https://github.com/willmcgugan/rich/releases) - [Changelog](https://github.com/Textualize/rich/blob/master/CHANGELOG.md) - [Commits](https://github.com/willmcgugan/rich/compare/v10.2.2...v12.4.1) --- updated-dependencies: - dependency-name: rich dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- requirements/extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/extra.txt b/requirements/extra.txt index cef58c6c21221..8162eed3f8518 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -6,4 +6,4 @@ omegaconf>=2.0.5, <=2.1.* hydra-core>=1.0.5, <=1.1.* jsonargparse[signatures]>=4.7.1, <4.7.4 gcsfs>=2021.5.0, <=2022.2.0 -rich>=10.2.2,!=10.15.*, <=12.0.0 +rich>=10.2.2, !=10.15.0.a, <13.0.0 From 1d5f1003cece81935493ad992eb177a770f192ba Mon Sep 17 00:00:00 2001 From: "Adam J. 
Stewart" Date: Tue, 21 Jun 2022 01:52:06 -0700 Subject: [PATCH 05/24] Fix torch.distributed._sharded_tensor DeprecationWarning (#13261) --- pytorch_lightning/core/lightning.py | 7 +++++-- tests/core/test_lightning_module.py | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index bf86471fe92ae..849d6715ef0eb 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -46,7 +46,7 @@ from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.memory import get_model_size_mb from pytorch_lightning.utilities.model_summary import ModelSummary, summarize from pytorch_lightning.utilities.parsing import collect_init_args @@ -2064,7 +2064,10 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: rank_zero_debug("Could not register sharded tensor state dict hooks") return - from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook + if _TORCH_GREATER_EQUAL_1_11: + from torch.distributed._shard.sharded_tensor import pre_load_state_dict_hook, state_dict_hook + else: + from torch.distributed._sharded_tensor import pre_load_state_dict_hook, state_dict_hook self._register_state_dict_hook(state_dict_hook) diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 180e7c46fe7d4..4fe9d300c3596 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -299,7 +299,10 @@ def assert_device(device: torch.device) -> None: @RunIf(min_torch="1.10", skip_windows=True) def test_sharded_tensor_state_dict(single_process_pg): - from torch.distributed._sharded_tensor import empty as sharded_tensor_empty + if _TORCH_GREATER_EQUAL_1_11: + from torch.distributed._shard.sharded_tensor import empty as sharded_tensor_empty + else: + from torch.distributed._sharded_tensor import empty as sharded_tensor_empty from torch.distributed._sharding_spec import ChunkShardingSpec class BoringModelWithShardedTensor(BoringModel): From 043ff03c2177e42d013e3f09f58d5de620a8184b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 21 Jun 2022 11:06:49 +0200 Subject: [PATCH 06/24] update tutorials (#13268) --- _notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_notebooks b/_notebooks index 290fb466de1fc..8a36a41548f34 160000 --- a/_notebooks +++ b/_notebooks @@ -1 +1 @@ -Subproject commit 290fb466de1fcc2ac6025f74b56906592911e856 +Subproject commit 8a36a41548f34c44ac455d515a72994487e85813 From 541392fee534c5de69613af6bdfef61a041f0f47 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 21 Jun 2022 17:48:27 +0100 Subject: [PATCH 07/24] [BUG] `estimated_stepping_batches` requires distributed comms in `configure_optimizers` for `DeepSpeedStrategy` (#13350) --- CHANGELOG.md | 240 ++++++++++++++++++++ pytorch_lightning/strategies/deepspeed.py | 2 + tests/strategies/test_deepspeed_strategy.py | 23 ++ 3 files changed, 265 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bdcd6e07da750..703a0f2830752 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,246 @@ All notable changes to this project 
will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.7.0] - 2022-MM-DD + +### Added + +- Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging to non-zero rank processes ([#13233](https://github.com/PyTorchLightning/pytorch-lightning/pull/13233)) + + +- Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816)) + + +- Added `LightningDataModule.load_from_checkpoint` to support loading datamodules directly from checkpoint ([#12550](https://github.com/PyTorchLightning/pytorch-lightning/pull/12550)) + + +- Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/PyTorchLightning/pytorch-lightning/pull/12772)) + + +- Added a friendly error message when attempting to use `DeepSpeedStrategy` on unsupported accelerators ([#12699](https://github.com/PyTorchLightning/pytorch-lightning/pull/12699)) + + +- Enabled `torch.inference_mode` for evaluation and prediction ([#12715](https://github.com/PyTorchLightning/pytorch-lightning/pull/12715)) + + +- Added support for setting `val_check_interval` to a value higher than the amount of training batches when `check_val_every_n_epoch=None` ([#11993](https://github.com/PyTorchLightning/pytorch-lightning/pull/11993)) + + +- Include the `pytorch_lightning` version as a header in the CLI config files ([#12532](https://github.com/PyTorchLightning/pytorch-lightning/pull/12532)) + + +- Added support for `Callback` registration through entry points ([#12739](https://github.com/PyTorchLightning/pytorch-lightning/pull/12739)) + + +- Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/PyTorchLightning/pytorch-lightning/pull/12588)) + + +- Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/PyTorchLightning/pytorch-lightning/pull/12124)) + + +- Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) + + +- Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) + + +- Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/PyTorchLightning/pytorch-lightning/pull/12942)) + + +- Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/PyTorchLightning/pytorch-lightning/pull/12228)) + + +- Added profiling of `LightningDataModule` hooks ([#12971](https://github.com/PyTorchLightning/pytorch-lightning/pull/12971)) + + +- Added Native FSDP Strategy ([#12447](https://github.com/PyTorchLightning/pytorch-lightning/pull/12447)) + + +- Added breaking of lazy graph across training, validation, test and predict steps when training with habana accelerators to ensure better performance ([#12938](https://github.com/PyTorchLightning/pytorch-lightning/pull/12938)) + + +- Added CPU metric tracking to `DeviceStatsMonitor` ([#11795](https://github.com/PyTorchLightning/pytorch-lightning/pull/11795)) + + +- Added `teardown()` method to `Accelerator` ([#11935](https://github.com/PyTorchLightning/pytorch-lightning/pull/11935)) +- + + +- Added a `timeout` argument to `DDPStrategy`. 
([#13244](https://github.com/PyTorchLightning/pytorch-lightning/pull/13244)) +- + + +### Changed + +- Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) + + +- Added dataclass support to `extract_batch_size` ([#12573](https://github.com/PyTorchLightning/pytorch-lightning/pull/12573)) + + +- Changed checkpoints save path in the case of one logger and user-provided weights_save_path from `weights_save_path/name/version/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/PyTorchLightning/pytorch-lightning/pull/12372)) + + +- Changed checkpoints save path in the case of multiple loggers and user-provided weights_save_path from `weights_save_path/name1_name2/version1_version2/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/PyTorchLightning/pytorch-lightning/pull/12372)) + + +- Marked `swa_lrs` argument in `StochasticWeightAveraging` callback as required ([#12556](https://github.com/PyTorchLightning/pytorch-lightning/pull/12556)) + + +- `LightningCLI`'s shorthand notation changed to use jsonargparse native feature ([#12614](https://github.com/PyTorchLightning/pytorch-lightning/pull/12614)) + + +- `LightningCLI` changed to use jsonargparse native support for list append ([#13129](https://github.com/PyTorchLightning/pytorch-lightning/pull/13129)) + + +- Changed `seed_everything_default` argument in the `LightningCLI` to type `Union[bool, int]`. If set to `True` a seed is automatically generated for the parser argument `--seed_everything`. ([#12822](https://github.com/PyTorchLightning/pytorch-lightning/pull/12822), [#13110](https://github.com/PyTorchLightning/pytorch-lightning/pull/13110)) + + +- Make positional arguments required for classes passed into the `add_argparse_args` function. ([#12504](https://github.com/PyTorchLightning/pytorch-lightning/pull/12504)) + + +- Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885)) + + +- The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604)) + + + +### Deprecated + +- Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) + + +- Deprecated `pytorch_lightning.callbacks.base.Callback` in favor of `pytorch_lightning.callbacks.callback.Callback` ([#13031](https://github.com/PyTorchLightning/pytorch-lightning/pull/13031)) + + +- Deprecated `num_processes`, `gpus`, `tpu_cores,` and `ipus` from the `Trainer` constructor in favor of using the `accelerator` and `devices` arguments ([#11040](https://github.com/PyTorchLightning/pytorch-lightning/pull/11040)) + + +- Deprecated setting `LightningCLI(seed_everything_default=None)` in favor of `False` ([#12804](https://github.com/PyTorchLightning/pytorch-lightning/issues/12804)). 
+ + +- Deprecated `pytorch_lightning.core.lightning.LightningModule` in favor of `pytorch_lightning.core.module.LightningModule` ([#12740](https://github.com/PyTorchLightning/pytorch-lightning/pull/12740)) + + +- Deprecated `pytorch_lightning.loops.base.Loop` in favor of `pytorch_lightning.loops.loop.Loop` ([#13043](https://github.com/PyTorchLightning/pytorch-lightning/pull/13043)) + + +- Deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#12184](https://github.com/PyTorchLightning/pytorch-lightning/pull/12184)) + + +- Deprecated LightningCLI's registries in favor of importing the respective package ([#13221](https://github.com/PyTorchLightning/pytorch-lightning/pull/13221)) + + +### Removed + +- Removed the deprecated `Logger.close` method ([#13149](https://github.com/PyTorchLightning/pytorch-lightning/pull/13149)) + + +- Removed the deprecated `weights_summary` argument from the `Trainer` constructor ([#13070](https://github.com/PyTorchLightning/pytorch-lightning/pull/13070)) + + +- Removed the deprecated `flush_logs_every_n_steps` argument from the `Trainer` constructor ([#13074](https://github.com/PyTorchLightning/pytorch-lightning/pull/13074)) + + +- Removed the deprecated `process_position` argument from the `Trainer` constructor ([13071](https://github.com/PyTorchLightning/pytorch-lightning/pull/13071)) + + +- Removed the deprecated `checkpoint_callback` argument from the `Trainer` constructor ([#13027](https://github.com/PyTorchLightning/pytorch-lightning/pull/13027)) + + +- Removed the deprecated `on_{train,val,test,predict}_dataloader` hooks from the `LightningModule` and `LightningDataModule` ([#13033](https://github.com/PyTorchLightning/pytorch-lightning/pull/13033)) + + +- Removed the deprecated `TestTubeLogger` ([#12859](https://github.com/PyTorchLightning/pytorch-lightning/pull/12859)) + + +- Removed the deprecated `pytorch_lightning.core.memory.LayerSummary` and `pytorch_lightning.core.memory.ModelSummary` ([#12593](https://github.com/PyTorchLightning/pytorch-lightning/pull/12593)) + + +- Removed the deprecated `summarize` method from the `LightningModule` ([#12559](https://github.com/PyTorchLightning/pytorch-lightning/pull/12559)) + + +- Removed the deprecated `model_size` property from the `LightningModule` class ([#12641](https://github.com/PyTorchLightning/pytorch-lightning/pull/12641)) + + +- Removed the deprecated `stochastic_weight_avg` argument from the `Trainer` constructor ([#12535](https://github.com/PyTorchLightning/pytorch-lightning/pull/12535)) + + +- Removed the deprecated `progress_bar_refresh_rate` argument from the `Trainer` constructor ([#12514](https://github.com/PyTorchLightning/pytorch-lightning/pull/12514)) + + +- Removed the deprecated `prepare_data_per_node` argument from the `Trainer` constructor ([#12536](https://github.com/PyTorchLightning/pytorch-lightning/pull/12536)) + + +- Removed the deprecated `pytorch_lightning.core.memory.{get_gpu_memory_map,get_memory_profile}` ([#12659](https://github.com/PyTorchLightning/pytorch-lightning/pull/12659)) + + +- Removed the deprecated `terminate_on_nan` argument from the `Trainer` constructor ([#12553](https://github.com/PyTorchLightning/pytorch-lightning/pull/12553)) + + +- Removed the deprecated `XLAStatsMonitor` callback ([#12688](https://github.com/PyTorchLightning/pytorch-lightning/pull/12688)) + + +- Remove deprecated `pytorch_lightning.callbacks.progress.progress` ([#12658](https://github.com/PyTorchLightning/pytorch-lightning/pull/12658)) + + +- 
Removed the deprecated `dim` and `size` arguments from the `LightningDataModule` constructor([#12780](https://github.com/PyTorchLightning/pytorch-lightning/pull/12780)) + + +- Removed the deprecated `train_transforms` argument from the `LightningDataModule` constructor([#12662](https://github.com/PyTorchLightning/pytorch-lightning/pull/12662)) + + +- Removed the deprecated `log_gpu_memory` argument from the `Trainer` constructor ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657)) + + +- Removed the deprecated automatic logging of GPU stats by the logger connector ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657)) + + +- Removed deprecated `GPUStatsMonitor` callback ([#12554](https://github.com/PyTorchLightning/pytorch-lightning/pull/12554)) + + +- Removed support for passing strategy names or strategy instances to the accelerator Trainer argument ([#12696](https://github.com/PyTorchLightning/pytorch-lightning/pull/12696)) + + +- Removed support for passing strategy names or strategy instances to the plugins Trainer argument ([#12700](https://github.com/PyTorchLightning/pytorch-lightning/pull/12700)) + + +- Removed the deprecated `val_transforms` argument from the `LightningDataModule` constructor ([#12763](https://github.com/PyTorchLightning/pytorch-lightning/pull/12763)) + + +- Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/PyTorchLightning/pytorch-lightning/pull/12773)) + + +- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769), [#12977](https://github.com/PyTorchLightning/pytorch-lightning/pull/12977)) + + +- Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/PyTorchLightning/pytorch-lightning/pull/12839)) + +- Removed sanity check for multi-optimizer support with habana backends ([#13217](https://github.com/PyTorchLightning/pytorch-lightning/pull/13217)) + + +- Removed the need to explicitly load habana module ([#13338](https://github.com/PyTorchLightning/pytorch-lightning/pull/13338)) + + +### Fixed + +- Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/PyTorchLightning/pytorch-lightning/pull/13014)) + + +- The model wrapper returned by `LightningLite.setup()` now properly supports pass-through when looking up attributes ([#12597](https://github.com/PyTorchLightning/pytorch-lightning/pull/12597)) + + +- Fixed issue where the CLI fails with certain torch objects ([#13153](https://github.com/PyTorchLightning/pytorch-lightning/pull/13153)) + + +## [1.6.5] - 2022-07-05 + +### Fixed + +- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) + + ## [1.6.4] - 2022-06-01 ### Added diff --git a/pytorch_lightning/strategies/deepspeed.py b/pytorch_lightning/strategies/deepspeed.py index e3b470cd90363..1ccda976d810c 100644 --- a/pytorch_lightning/strategies/deepspeed.py +++ b/pytorch_lightning/strategies/deepspeed.py @@ -356,6 +356,8 @@ def setup_distributed(self): def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) + # we set the device so that optimizers can be created with distributed comms. 
+ self.lightning_module._device = self.root_device self.setup_optimizers(trainer) self.setup_precision_plugin() optimizers_to_device(self.optimizers, self.root_device) diff --git a/tests/strategies/test_deepspeed_strategy.py b/tests/strategies/test_deepspeed_strategy.py index f3c99203d70eb..c829b203f3846 100644 --- a/tests/strategies/test_deepspeed_strategy.py +++ b/tests/strategies/test_deepspeed_strategy.py @@ -1199,3 +1199,26 @@ def training_step(self, *args, **kwargs): ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filepath) expected = {"latest", "zero_to_fp32.py", "checkpoint"} assert expected == set(os.listdir(ckpt_path)) + + +@RunIf(min_cuda_gpus=2, deepspeed=True, standalone=True) +def test_deepspeed_configure_optimizer_device_set(tmpdir): + """Test to ensure that the LM has access to the device within the ``configure_optimizer`` function, and + estimated_stepping_batches works correctly as a result.""" + + class TestModel(BoringModel): + def configure_optimizers(self): + assert self.trainer.estimated_stepping_batches == 1 + assert self.device.type == "cuda" + raise SystemExit + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + accelerator="gpu", + devices=2, + strategy=DeepSpeedStrategy(), + ) + with pytest.raises(SystemExit): + trainer.fit(model) From 2c18ee4d4ccfd293580b9689006ddb3311fc8703 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 Jun 2022 22:09:11 +0000 Subject: [PATCH 08/24] Update torchmetrics requirement from <=0.7.2,>=0.4.1 to >=0.4.1,<0.9.2 in /requirements (#13275) Update torchmetrics requirement in /requirements Updates the requirements on [torchmetrics](https://github.com/PyTorchLightning/metrics) to permit the latest version. - [Release notes](https://github.com/PyTorchLightning/metrics/releases) - [Changelog](https://github.com/PyTorchLightning/metrics/blob/master/CHANGELOG.md) - [Commits](https://github.com/PyTorchLightning/metrics/compare/v0.4.1...v0.9.1) --- updated-dependencies: - dependency-name: torchmetrics dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/base.txt b/requirements/base.txt index 555997c6576e6..768da61c48fd6 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ tqdm>=4.57.0, <=4.63.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <=2022.2.0 tensorboard>=2.2.0, <2.10.0 -torchmetrics>=0.4.1, <=0.7.2 +torchmetrics>=0.4.1, <0.9.2 pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.2.1 From 0172a94bbadf1928d18aec3c82a455e6489c6e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 23 Jun 2022 16:21:55 +0200 Subject: [PATCH 09/24] Fix mypy errors for model summary utilities (#13384) --- CHANGELOG.md | 233 ------------------- pytorch_lightning/utilities/model_summary.py | 6 +- 2 files changed, 4 insertions(+), 235 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 703a0f2830752..3fd1b33c4dfd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,239 +5,6 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
-## [1.7.0] - 2022-MM-DD - -### Added - -- Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging to non-zero rank processes ([#13233](https://github.com/PyTorchLightning/pytorch-lightning/pull/13233)) - - -- Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816)) - - -- Added `LightningDataModule.load_from_checkpoint` to support loading datamodules directly from checkpoint ([#12550](https://github.com/PyTorchLightning/pytorch-lightning/pull/12550)) - - -- Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/PyTorchLightning/pytorch-lightning/pull/12772)) - - -- Added a friendly error message when attempting to use `DeepSpeedStrategy` on unsupported accelerators ([#12699](https://github.com/PyTorchLightning/pytorch-lightning/pull/12699)) - - -- Enabled `torch.inference_mode` for evaluation and prediction ([#12715](https://github.com/PyTorchLightning/pytorch-lightning/pull/12715)) - - -- Added support for setting `val_check_interval` to a value higher than the amount of training batches when `check_val_every_n_epoch=None` ([#11993](https://github.com/PyTorchLightning/pytorch-lightning/pull/11993)) - - -- Include the `pytorch_lightning` version as a header in the CLI config files ([#12532](https://github.com/PyTorchLightning/pytorch-lightning/pull/12532)) - - -- Added support for `Callback` registration through entry points ([#12739](https://github.com/PyTorchLightning/pytorch-lightning/pull/12739)) - - -- Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/PyTorchLightning/pytorch-lightning/pull/12588)) - - -- Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/PyTorchLightning/pytorch-lightning/pull/12124)) - - -- Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) - - -- Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) - - -- Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/PyTorchLightning/pytorch-lightning/pull/12942)) - - -- Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/PyTorchLightning/pytorch-lightning/pull/12228)) - - -- Added profiling of `LightningDataModule` hooks ([#12971](https://github.com/PyTorchLightning/pytorch-lightning/pull/12971)) - - -- Added Native FSDP Strategy ([#12447](https://github.com/PyTorchLightning/pytorch-lightning/pull/12447)) - - -- Added breaking of lazy graph across training, validation, test and predict steps when training with habana accelerators to ensure better performance ([#12938](https://github.com/PyTorchLightning/pytorch-lightning/pull/12938)) - - -- Added CPU metric tracking to `DeviceStatsMonitor` ([#11795](https://github.com/PyTorchLightning/pytorch-lightning/pull/11795)) - - -- Added `teardown()` method to `Accelerator` ([#11935](https://github.com/PyTorchLightning/pytorch-lightning/pull/11935)) -- - - -- Added a `timeout` argument to `DDPStrategy`. 
([#13244](https://github.com/PyTorchLightning/pytorch-lightning/pull/13244)) -- - - -### Changed - -- Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) - - -- Added dataclass support to `extract_batch_size` ([#12573](https://github.com/PyTorchLightning/pytorch-lightning/pull/12573)) - - -- Changed checkpoints save path in the case of one logger and user-provided weights_save_path from `weights_save_path/name/version/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/PyTorchLightning/pytorch-lightning/pull/12372)) - - -- Changed checkpoints save path in the case of multiple loggers and user-provided weights_save_path from `weights_save_path/name1_name2/version1_version2/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/PyTorchLightning/pytorch-lightning/pull/12372)) - - -- Marked `swa_lrs` argument in `StochasticWeightAveraging` callback as required ([#12556](https://github.com/PyTorchLightning/pytorch-lightning/pull/12556)) - - -- `LightningCLI`'s shorthand notation changed to use jsonargparse native feature ([#12614](https://github.com/PyTorchLightning/pytorch-lightning/pull/12614)) - - -- `LightningCLI` changed to use jsonargparse native support for list append ([#13129](https://github.com/PyTorchLightning/pytorch-lightning/pull/13129)) - - -- Changed `seed_everything_default` argument in the `LightningCLI` to type `Union[bool, int]`. If set to `True` a seed is automatically generated for the parser argument `--seed_everything`. ([#12822](https://github.com/PyTorchLightning/pytorch-lightning/pull/12822), [#13110](https://github.com/PyTorchLightning/pytorch-lightning/pull/13110)) - - -- Make positional arguments required for classes passed into the `add_argparse_args` function. ([#12504](https://github.com/PyTorchLightning/pytorch-lightning/pull/12504)) - - -- Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885)) - - -- The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604)) - - - -### Deprecated - -- Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) - - -- Deprecated `pytorch_lightning.callbacks.base.Callback` in favor of `pytorch_lightning.callbacks.callback.Callback` ([#13031](https://github.com/PyTorchLightning/pytorch-lightning/pull/13031)) - - -- Deprecated `num_processes`, `gpus`, `tpu_cores,` and `ipus` from the `Trainer` constructor in favor of using the `accelerator` and `devices` arguments ([#11040](https://github.com/PyTorchLightning/pytorch-lightning/pull/11040)) - - -- Deprecated setting `LightningCLI(seed_everything_default=None)` in favor of `False` ([#12804](https://github.com/PyTorchLightning/pytorch-lightning/issues/12804)). 
- - -- Deprecated `pytorch_lightning.core.lightning.LightningModule` in favor of `pytorch_lightning.core.module.LightningModule` ([#12740](https://github.com/PyTorchLightning/pytorch-lightning/pull/12740)) - - -- Deprecated `pytorch_lightning.loops.base.Loop` in favor of `pytorch_lightning.loops.loop.Loop` ([#13043](https://github.com/PyTorchLightning/pytorch-lightning/pull/13043)) - - -- Deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#12184](https://github.com/PyTorchLightning/pytorch-lightning/pull/12184)) - - -- Deprecated LightningCLI's registries in favor of importing the respective package ([#13221](https://github.com/PyTorchLightning/pytorch-lightning/pull/13221)) - - -### Removed - -- Removed the deprecated `Logger.close` method ([#13149](https://github.com/PyTorchLightning/pytorch-lightning/pull/13149)) - - -- Removed the deprecated `weights_summary` argument from the `Trainer` constructor ([#13070](https://github.com/PyTorchLightning/pytorch-lightning/pull/13070)) - - -- Removed the deprecated `flush_logs_every_n_steps` argument from the `Trainer` constructor ([#13074](https://github.com/PyTorchLightning/pytorch-lightning/pull/13074)) - - -- Removed the deprecated `process_position` argument from the `Trainer` constructor ([13071](https://github.com/PyTorchLightning/pytorch-lightning/pull/13071)) - - -- Removed the deprecated `checkpoint_callback` argument from the `Trainer` constructor ([#13027](https://github.com/PyTorchLightning/pytorch-lightning/pull/13027)) - - -- Removed the deprecated `on_{train,val,test,predict}_dataloader` hooks from the `LightningModule` and `LightningDataModule` ([#13033](https://github.com/PyTorchLightning/pytorch-lightning/pull/13033)) - - -- Removed the deprecated `TestTubeLogger` ([#12859](https://github.com/PyTorchLightning/pytorch-lightning/pull/12859)) - - -- Removed the deprecated `pytorch_lightning.core.memory.LayerSummary` and `pytorch_lightning.core.memory.ModelSummary` ([#12593](https://github.com/PyTorchLightning/pytorch-lightning/pull/12593)) - - -- Removed the deprecated `summarize` method from the `LightningModule` ([#12559](https://github.com/PyTorchLightning/pytorch-lightning/pull/12559)) - - -- Removed the deprecated `model_size` property from the `LightningModule` class ([#12641](https://github.com/PyTorchLightning/pytorch-lightning/pull/12641)) - - -- Removed the deprecated `stochastic_weight_avg` argument from the `Trainer` constructor ([#12535](https://github.com/PyTorchLightning/pytorch-lightning/pull/12535)) - - -- Removed the deprecated `progress_bar_refresh_rate` argument from the `Trainer` constructor ([#12514](https://github.com/PyTorchLightning/pytorch-lightning/pull/12514)) - - -- Removed the deprecated `prepare_data_per_node` argument from the `Trainer` constructor ([#12536](https://github.com/PyTorchLightning/pytorch-lightning/pull/12536)) - - -- Removed the deprecated `pytorch_lightning.core.memory.{get_gpu_memory_map,get_memory_profile}` ([#12659](https://github.com/PyTorchLightning/pytorch-lightning/pull/12659)) - - -- Removed the deprecated `terminate_on_nan` argument from the `Trainer` constructor ([#12553](https://github.com/PyTorchLightning/pytorch-lightning/pull/12553)) - - -- Removed the deprecated `XLAStatsMonitor` callback ([#12688](https://github.com/PyTorchLightning/pytorch-lightning/pull/12688)) - - -- Remove deprecated `pytorch_lightning.callbacks.progress.progress` ([#12658](https://github.com/PyTorchLightning/pytorch-lightning/pull/12658)) - - -- 
Removed the deprecated `dim` and `size` arguments from the `LightningDataModule` constructor([#12780](https://github.com/PyTorchLightning/pytorch-lightning/pull/12780)) - - -- Removed the deprecated `train_transforms` argument from the `LightningDataModule` constructor([#12662](https://github.com/PyTorchLightning/pytorch-lightning/pull/12662)) - - -- Removed the deprecated `log_gpu_memory` argument from the `Trainer` constructor ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657)) - - -- Removed the deprecated automatic logging of GPU stats by the logger connector ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657)) - - -- Removed deprecated `GPUStatsMonitor` callback ([#12554](https://github.com/PyTorchLightning/pytorch-lightning/pull/12554)) - - -- Removed support for passing strategy names or strategy instances to the accelerator Trainer argument ([#12696](https://github.com/PyTorchLightning/pytorch-lightning/pull/12696)) - - -- Removed support for passing strategy names or strategy instances to the plugins Trainer argument ([#12700](https://github.com/PyTorchLightning/pytorch-lightning/pull/12700)) - - -- Removed the deprecated `val_transforms` argument from the `LightningDataModule` constructor ([#12763](https://github.com/PyTorchLightning/pytorch-lightning/pull/12763)) - - -- Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/PyTorchLightning/pytorch-lightning/pull/12773)) - - -- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769), [#12977](https://github.com/PyTorchLightning/pytorch-lightning/pull/12977)) - - -- Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/PyTorchLightning/pytorch-lightning/pull/12839)) - -- Removed sanity check for multi-optimizer support with habana backends ([#13217](https://github.com/PyTorchLightning/pytorch-lightning/pull/13217)) - - -- Removed the need to explicitly load habana module ([#13338](https://github.com/PyTorchLightning/pytorch-lightning/pull/13338)) - - -### Fixed - -- Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/PyTorchLightning/pytorch-lightning/pull/13014)) - - -- The model wrapper returned by `LightningLite.setup()` now properly supports pass-through when looking up attributes ([#12597](https://github.com/PyTorchLightning/pytorch-lightning/pull/12597)) - - -- Fixed issue where the CLI fails with certain torch objects ([#13153](https://github.com/PyTorchLightning/pytorch-lightning/pull/13153)) - - ## [1.6.5] - 2022-07-05 ### Fixed diff --git a/pytorch_lightning/utilities/model_summary.py b/pytorch_lightning/utilities/model_summary.py index f0419e4b97077..9c5ff088da368 100644 --- a/pytorch_lightning/utilities/model_summary.py +++ b/pytorch_lightning/utilities/model_summary.py @@ -15,7 +15,7 @@ import contextlib import logging from collections import OrderedDict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, cast, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -120,7 +120,9 @@ def layer_type(self) -> str: @property def num_parameters(self) -> int: """Returns the number of parameters in this module.""" - return sum(np.prod(p.shape) if not _is_lazy_weight_tensor(p) else 0 for p in 
self._module.parameters()) + return sum( + cast(int, np.prod(p.shape)) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters() + ) class ModelSummary: From fadfee165c351f9297eaa346a3519f9f916a2186 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 1 Jul 2022 22:04:50 +0200 Subject: [PATCH 10/24] rename org Lightning AI --- .../basic_examples/mnist_datamodule.py | 2 +- .../domain_templates/reinforce_learn_ppo.py | 2 +- pytorch_lightning/loggers/neptune.py | 22 +++++++++---------- .../loops/epoch/evaluation_epoch_loop.py | 2 +- pytorch_lightning/plugins/io/torch_plugin.py | 2 +- pytorch_lightning/plugins/precision/ipu.py | 2 +- pytorch_lightning/plugins/precision/tpu.py | 2 +- pytorch_lightning/setup_tools.py | 4 ++-- .../strategies/launchers/spawn.py | 2 +- .../connectors/accelerator_connector.py | 6 ++--- .../logger_connector/fx_validator.py | 2 +- .../connectors/logger_connector/result.py | 2 +- pytorch_lightning/utilities/migration.py | 2 +- setup.py | 8 +++---- tests/loggers/test_neptune.py | 4 ++-- 15 files changed, 32 insertions(+), 32 deletions(-) diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py index 31a2e284dd8ba..02d7ff8fd7a37 100644 --- a/pl_examples/basic_examples/mnist_datamodule.py +++ b/pl_examples/basic_examples/mnist_datamodule.py @@ -36,7 +36,7 @@ class _MNIST(Dataset): """Carbon copy of ``tests.helpers.datasets.MNIST``. We cannot import the tests as they are not distributed with the package. - See https://github.com/PyTorchLightning/pytorch-lightning/pull/7614#discussion_r671183652 for more context. + See https://github.com/Lightning-AI/lightning/pull/7614#discussion_r671183652 for more context. """ RESOURCES = ( diff --git a/pl_examples/domain_templates/reinforce_learn_ppo.py b/pl_examples/domain_templates/reinforce_learn_ppo.py index a400de062bf6f..baae200f9e2bc 100644 --- a/pl_examples/domain_templates/reinforce_learn_ppo.py +++ b/pl_examples/domain_templates/reinforce_learn_ppo.py @@ -125,7 +125,7 @@ def get_log_prob(self, pi: Normal, actions: torch.Tensor): class ExperienceSourceDataset(IterableDataset): - """Implementation from PyTorch Lightning Bolts: https://github.com/PyTorchLightning/lightning- + """Implementation from PyTorch Lightning Bolts: https://github.com/Lightning-AI/lightning- bolts/blob/master/pl_bolts/datamodules/experience_source.py. Basic experience source dataset. Takes a generate_batch function that returns an iterator. The logic for the diff --git a/pytorch_lightning/loggers/neptune.py b/pytorch_lightning/loggers/neptune.py index 05952f6bc5747..e3cae87d19d18 100644 --- a/pytorch_lightning/loggers/neptune.py +++ b/pytorch_lightning/loggers/neptune.py @@ -54,7 +54,7 @@ log = logging.getLogger(__name__) -_INTEGRATION_VERSION_KEY = "source_code/integrations/pytorch-lightning" +_INTEGRATION_VERSION_KEY = "source_code/integrations/lightning" # kwargs used in previous NeptuneLogger version, now deprecated _LEGACY_NEPTUNE_INIT_KWARGS = [ @@ -113,7 +113,7 @@ class NeptuneLogger(LightningLoggerBase): neptune_logger = NeptuneLogger( api_key="ANONYMOUS", # replace with your own - project="common/pytorch-lightning-integration", # format "" + project="common/lightning-integration", # format "" tags=["training", "resnet"], # optional ) trainer = Trainer(max_epochs=10, logger=neptune_logger) @@ -157,7 +157,7 @@ def any_lightning_module_function_or_hook(self): .. 
code-block:: python - neptune_logger = NeptuneLogger(project="common/pytorch-lightning-integration") + neptune_logger = NeptuneLogger(project="common/lightning-integration") trainer = pl.Trainer(logger=neptune_logger) model = ... @@ -182,7 +182,7 @@ def any_lightning_module_function_or_hook(self): .. code-block:: python - neptune_logger = NeptuneLogger(project="common/pytorch-lightning-integration", log_model_checkpoints=False) + neptune_logger = NeptuneLogger(project="common/lightning-integration", log_model_checkpoints=False) **Pass additional parameters to the Neptune run** @@ -194,7 +194,7 @@ def any_lightning_module_function_or_hook(self): from pytorch_lightning.loggers import NeptuneLogger neptune_logger = NeptuneLogger( - project="common/pytorch-lightning-integration", + project="common/lightning-integration", name="lightning-run", description="mlp quick run with pytorch-lightning", tags=["mlp", "quick-run"], @@ -216,10 +216,10 @@ def any_lightning_module_function_or_hook(self): See Also: - Read about `what object you can log to Neptune `_. - - Check `example run `_ + - Check `example run `_ with multiple types of metadata logged. - For more detailed info check - `user guide `_. + `user guide `_. Args: api_key: Optional. @@ -350,7 +350,7 @@ def _verify_input_arguments( " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" "The NeptuneLogger was re-written to use the neptune.new Python API\n" " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" "You should use arguments accepted by either NeptuneLogger.init() or neptune.init()" ) @@ -377,7 +377,7 @@ def _verify_input_arguments( " - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" "The NeptuneLogger was re-written to use the neptune.new Python API\n" " - https://neptune.ai/blog/neptune-new\n" - " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + " - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" ) # check if user passed redundant neptune.init arguments when passed run @@ -477,7 +477,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: # neptune_logger = NeptuneLogger( api_key="ANONYMOUS", - project="common/pytorch-lightning-integration" + project="common/lightning-integration" ) neptune_logger.log_hyperparams(PARAMS) @@ -627,7 +627,7 @@ def _signal_deprecated_api_usage(f_name, sample_code, raise_exception=False): f" - https://docs-legacy.neptune.ai/integrations/pytorch_lightning.html\n" f"The NeptuneLogger was re-written to use the neptune.new Python API\n" f" - https://neptune.ai/blog/neptune-new\n" - f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/pytorch-lightning\n" + f" - https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightning\n" f"Instead of `logger.{f_name}` you can use:\n" f"\t{sample_code}" ) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 32fd893b759ee..9136cfc9d7f6a 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -195,7 +195,7 @@ def _reload_dataloader_state_dict(self, data_fetcher: AbstractDataFetcher) -> No if isinstance(dataloader, CombinedLoader): raise 
MisconfigurationException( "Reloading support hasn't been implemented for `CombinedLoader`. You can request it by opening an issue" - " in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " in `https://github.com/Lightning-AI/lightning/issues`." ) assert isinstance(dataloader, DataLoader) _reload_dataloader_state_dict(dataloader, self._dataloader_state_dict) diff --git a/pytorch_lightning/plugins/io/torch_plugin.py b/pytorch_lightning/plugins/io/torch_plugin.py index be10bf967ab05..8791249e7d90c 100644 --- a/pytorch_lightning/plugins/io/torch_plugin.py +++ b/pytorch_lightning/plugins/io/torch_plugin.py @@ -54,7 +54,7 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio atomic_save(checkpoint, path) except AttributeError as err: # todo (sean): is this try catch necessary still? - # https://github.com/PyTorchLightning/pytorch-lightning/pull/431 + # https://github.com/Lightning-AI/lightning/pull/431 key = pl.LightningModule.CHECKPOINT_HYPER_PARAMS_KEY checkpoint.pop(key, None) rank_zero_warn(f"Warning, `{key}` dropped from checkpoint. An attribute is not picklable: {err}") diff --git a/pytorch_lightning/plugins/precision/ipu.py b/pytorch_lightning/plugins/precision/ipu.py index 9df0edb53913b..a299be9a730a5 100644 --- a/pytorch_lightning/plugins/precision/ipu.py +++ b/pytorch_lightning/plugins/precision/ipu.py @@ -72,7 +72,7 @@ def optimizer_step( # we lack coverage here and IPUs are (currently) limited - something to explore if there's demand raise MisconfigurationException( "Skipping backward by returning `None` from your `training_step` is not implemented for IPUs." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) return closure_result diff --git a/pytorch_lightning/plugins/precision/tpu.py b/pytorch_lightning/plugins/precision/tpu.py index 1afd34264c60c..a0ed9de0a4239 100644 --- a/pytorch_lightning/plugins/precision/tpu.py +++ b/pytorch_lightning/plugins/precision/tpu.py @@ -46,7 +46,7 @@ def optimizer_step( # we lack coverage here so disable this - something to explore if there's demand raise MisconfigurationException( "Skipping backward by returning `None` from your `training_step` is not implemented for TPUs." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." 
) return closure_result diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 2df3c7946b4d9..82a8b85553232 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -59,7 +59,7 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: # drop images from readme text = text.replace("![PT to PL](docs/source/_static/images/general/pl_quick_start_full_compressed.gif)", "") - # https://github.com/PyTorchLightning/pytorch-lightning/raw/master/docs/source/_static/images/lightning_module/pt_to_pl.png + # https://github.com/Lightning-AI/lightning/raw/master/docs/source/_static/images/lightning_module/pt_to_pl.png github_source_url = os.path.join(homepage, "raw", version) # replace relative repository path to absolute link to the release # do not replace all "docs" as in the readme we reger some other sources with particular path to docs @@ -81,7 +81,7 @@ def _load_readme_description(path_dir: str, homepage: str, version: str) -> str: # todo: wrap content as commented description text = re.sub(rf"{skip_begin}.+?{skip_end}", "", text, flags=re.IGNORECASE + re.DOTALL) - # # https://github.com/Borda/pytorch-lightning/releases/download/1.1.0a6/codecov_badge.png + # # https://github.com/Borda/lightning/releases/download/1.1.0a6/codecov_badge.png # github_release_url = os.path.join(homepage, "releases", "download", version) # # download badge and replace url with local file # text = _parse_for_badge(text, github_release_url) diff --git a/pytorch_lightning/strategies/launchers/spawn.py b/pytorch_lightning/strategies/launchers/spawn.py index d67f9e620a45d..fe2c7763f82c9 100644 --- a/pytorch_lightning/strategies/launchers/spawn.py +++ b/pytorch_lightning/strategies/launchers/spawn.py @@ -53,7 +53,7 @@ def __init__(self, strategy: Strategy) -> None: def is_interactive_compatible(self) -> bool: # The start method 'spawn' is currently the only one that works with DDP and CUDA support # The start method 'fork' is the only one supported in Jupyter environments but not compatible with CUDA - # For more context, see https://github.com/PyTorchLightning/pytorch-lightning/issues/7550 + # For more context, see https://github.com/Lightning-AI/lightning/issues/7550 return self._start_method == "fork" and self._strategy.root_device.type != "cuda" def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any: diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py index d242236e8317e..e795358ef3420 100644 --- a/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -221,7 +221,7 @@ def _init_deterministic(self, deterministic: Optional[bool]) -> None: torch.use_deterministic_algorithms(self.deterministic) if self.deterministic: # fixing non-deterministic part of horovod - # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383 + # https://github.com/Lightning-AI/lightning/pull/1572/files#r420279383 os.environ["HOROVOD_FUSION_THRESHOLD"] = "0" # https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility @@ -605,7 +605,7 @@ def _check_strategy_and_fallback(self) -> None: if _TPU_AVAILABLE: raise MisconfigurationException( "`accelerator='ddp_cpu'` is not supported on TPU machines. 
" - "Learn more: https://github.com/PyTorchLightning/pytorch-lightning/issues/7810" + "Learn more: https://github.com/Lightning-AI/lightning/issues/7810" ) if self._devices_flag == 1 and self._num_nodes_flag > 1: strategy_flag = DDPStrategy.strategy_name @@ -725,7 +725,7 @@ def _validate_precision_choice(self) -> None: if self._precision_flag == 64: raise MisconfigurationException( "`Trainer(accelerator='tpu', precision=64)` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`" + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`" " requesting this feature." ) if self._precision_plugin_flag and not isinstance( diff --git a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index dadecef006278..49ef52614e8f6 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -182,7 +182,7 @@ def check_logging(cls, fx_name: str) -> None: if fx_name not in cls.functions: raise RuntimeError( f"Logging inside `{fx_name}` is not implemented." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`." ) if cls.functions[fx_name] is None: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/result.py b/pytorch_lightning/trainer/connectors/logger_connector/result.py index f7f708fdd1fd6..744921c7c28f8 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -125,7 +125,7 @@ def __post_init__(self) -> None: def _parse_reduce_fx(self) -> None: error = ( "Only `self.log(..., reduce_fx={min,max,mean,sum})` are currently supported." - " Please, open an issue in `https://github.com/PyTorchLightning/pytorch-lightning/issues`." + " Please, open an issue in `https://github.com/Lightning-AI/lightning/issues`." f" Found: {self.reduce_fx}" ) if isinstance(self.reduce_fx, str): diff --git a/pytorch_lightning/utilities/migration.py b/pytorch_lightning/utilities/migration.py index 30cc823210423..ed71f25a571f7 100644 --- a/pytorch_lightning/utilities/migration.py +++ b/pytorch_lightning/utilities/migration.py @@ -28,7 +28,7 @@ class pl_legacy_patch: unpickling old checkpoints. The following patches apply. 1. ``pytorch_lightning.utilities.argparse._gpus_arg_default``: Applies to all checkpoints saved prior to - version 1.2.8. See: https://github.com/PyTorchLightning/pytorch-lightning/pull/6898 + version 1.2.8. See: https://github.com/Lightning-AI/lightning/pull/6898 2. ``pytorch_lightning.utilities.argparse_utils``: A module that was deprecated in 1.2 and removed in 1.4, but still needs to be available for import for legacy checkpoints. 
diff --git a/setup.py b/setup.py index 57f59045ed256..2627ebdc6eca8 100755 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ def _load_py_module(fname, pkg="pytorch_lightning"): author=about.__author__, author_email=about.__author_email__, url=about.__homepage__, - download_url="https://github.com/PyTorchLightning/pytorch-lightning", + download_url="https://github.com/Lightning-AI/lightning", license=about.__license__, packages=find_packages(exclude=["tests*", "pl_examples*", "legacy*"]), include_package_data=True, @@ -82,9 +82,9 @@ def _load_py_module(fname, pkg="pytorch_lightning"): install_requires=setup_tools._load_requirements(_PATH_REQUIRE), extras_require=extras, project_urls={ - "Bug Tracker": "https://github.com/PyTorchLightning/pytorch-lightning/issues", - "Documentation": "https://pytorch-lightning.rtfd.io/en/latest/", - "Source Code": "https://github.com/PyTorchLightning/pytorch-lightning", + "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", + "Documentation": "https://lightning.rtfd.io/en/latest/", + "Source Code": "https://github.com/Lightning-AI/lightning", }, classifiers=[ "Environment :: Console", diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py index 87e52159b61d6..de9329564292b 100644 --- a/tests/loggers/test_neptune.py +++ b/tests/loggers/test_neptune.py @@ -47,7 +47,7 @@ class Run: def __setitem__(self, key, value): # called once - assert key == "source_code/integrations/pytorch-lightning" + assert key == "source_code/integrations/lightning" assert value == __version__ def wait(self): @@ -89,7 +89,7 @@ def test_neptune_online(self, neptune): self.assertEqual(created_run_mock.__getitem__.call_count, 2) self.assertEqual(created_run_mock.__setitem__.call_count, 1) created_run_mock.__getitem__.assert_has_calls([call("sys/id"), call("sys/name")], any_order=True) - created_run_mock.__setitem__.assert_called_once_with("source_code/integrations/pytorch-lightning", __version__) + created_run_mock.__setitem__.assert_called_once_with("source_code/integrations/lightning", __version__) @patch("pytorch_lightning.loggers.neptune.Run", Run) def test_online_with_custom_run(self, neptune): From 032c9eb14079662ed404936ce4a86a1286a0b0f5 Mon Sep 17 00:00:00 2001 From: Martino Sorbaro Date: Wed, 29 Jun 2022 19:34:23 +0200 Subject: [PATCH 11/24] Modified python version check to accommodate for legacy version styles (#13420) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí (cherry picked from commit b332b6632821e3f8fd451bbdf158bc9389eea51a) --- CHANGELOG.md | 4 ++++ pytorch_lightning/utilities/imports.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3fd1b33c4dfd7..459de3d363273 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) +- Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420)) + + + ## [1.6.4] - 2022-06-01 ### Added diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 3647dbedd11ee..2ed4a49b5d902 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -87,7 +87,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 -_PYTHON_GREATER_EQUAL_3_8_0 = Version(platform.python_version()) >= Version("3.8.0") +_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) _TORCH_GREATER_EQUAL_1_8_1 = _compare_version("torch", operator.ge, "1.8.1") _TORCH_GREATER_EQUAL_1_9 = _compare_version("torch", operator.ge, "1.9.0") _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") From 922324c9df88eb92d5882cb4b7052b2340d7308c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 29 Jun 2022 21:09:35 +0200 Subject: [PATCH 12/24] Call `set_epoch` for distributed batch samplers (#13396) Co-authored-by: Jirka Co-authored-by: Rohit Gupta (cherry picked from commit 2dd332f9c795aa5e590dce4d83e76d791a7b43df) --- CHANGELOG.md | 3 + .../loops/dataloader/evaluation_loop.py | 11 +--- .../loops/dataloader/prediction_loop.py | 10 +--- pytorch_lightning/loops/fit_loop.py | 9 +-- pytorch_lightning/loops/utilities.py | 14 +++++ pytorch_lightning/trainer/supporters.py | 7 ++- tests/loops/test_evaluation_loop.py | 57 ++++++++++++++++--- tests/loops/test_utilities.py | 24 +++++++- 8 files changed, 103 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 459de3d363273..ed3095391fdf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420)) +- The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396)) + + ## [1.6.4] - 2022-06-01 diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 2ec42fa0acbb5..d45df5885c0ec 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -26,6 +26,7 @@ from pytorch_lightning.accelerators import GPUAccelerator from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop +from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _OUT_DICT, _ResultCollection from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -161,14 +162,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: self._has_run = True def on_advance_start(self, *args: Any, **kwargs: Any) -> None: - dataloader = self.current_dataloader - if ( - dataloader is not None - and getattr(dataloader, "sampler", None) - and callable(getattr(dataloader.sampler, "set_epoch", None)) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - dataloader.sampler.set_epoch(self.trainer.fit_loop.epoch_progress.current.processed) + if self.current_dataloader is not None: + _set_sampler_epoch(self.current_dataloader, self.trainer.fit_loop.epoch_progress.current.processed) super().on_advance_start(*args, **kwargs) diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index a14a218ef67e9..36648b7f43e34 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -5,6 +5,7 @@ from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop from pytorch_lightning.loops.epoch.prediction_epoch_loop import PredictionEpochLoop +from pytorch_lightning.loops.utilities import _set_sampler_epoch from pytorch_lightning.strategies import DDPSpawnStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _PREDICT_OUTPUT @@ -87,13 +88,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: """Predicts one entire dataloader.""" void(*args, **kwargs) dataloader = self.current_dataloader - if ( - dataloader is not None - and getattr(dataloader, "sampler", None) - and callable(getattr(dataloader.sampler, "set_epoch", None)) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - dataloader.sampler.set_epoch(self.trainer.fit_loop.epoch_progress.current.processed) + if dataloader is not None: + _set_sampler_epoch(dataloader, self.trainer.fit_loop.epoch_progress.current.processed) dataloader = self.trainer.strategy.process_dataloader(dataloader) dataloader_iter = enumerate(dataloader) dl_max_batches = self.max_batches[self.current_dataloader_idx] diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index ac33390a97cec..0771a4a71de9f 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ 
b/pytorch_lightning/loops/fit_loop.py @@ -21,7 +21,7 @@ from pytorch_lightning.loops import Loop from pytorch_lightning.loops.epoch import TrainingEpochLoop from pytorch_lightning.loops.epoch.training_epoch_loop import _OUTPUTS_TYPE as _EPOCH_OUTPUTS_TYPE -from pytorch_lightning.loops.utilities import _is_max_limit_reached +from pytorch_lightning.loops.utilities import _is_max_limit_reached, _set_sampler_epoch from pytorch_lightning.trainer.connectors.logger_connector.result import _ResultCollection from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.trainer.supporters import TensorRunningAccum @@ -232,11 +232,8 @@ def on_advance_start(self) -> None: # type: ignore[override] # reset outputs here instead of in `reset` as they are not accumulated between epochs self._outputs = [] - if self.trainer.train_dataloader is not None and callable( - getattr(self.trainer.train_dataloader.sampler, "set_epoch", None) - ): - # set seed for distributed sampler (enables shuffling for each epoch) - self.trainer.train_dataloader.sampler.set_epoch(self.epoch_progress.current.processed) + if self.trainer.train_dataloader is not None: + _set_sampler_epoch(self.trainer.train_dataloader, self.epoch_progress.current.processed) # changing gradient according accumulation_scheduler self.trainer.accumulation_scheduler.on_train_epoch_start(self.trainer, self.trainer.lightning_module) diff --git a/pytorch_lightning/loops/utilities.py b/pytorch_lightning/loops/utilities.py index d84c195d758f9..15142be626587 100644 --- a/pytorch_lightning/loops/utilities.py +++ b/pytorch_lightning/loops/utilities.py @@ -21,6 +21,7 @@ import numpy as np import torch from torch.optim import Optimizer +from torch.utils.data import DataLoader import pytorch_lightning as pl from pytorch_lightning.loops import Loop @@ -228,3 +229,16 @@ def _reset_progress(loop: Loop) -> None: def _v1_8_output_format(fx: Callable) -> bool: parameters = inspect.signature(fx).parameters return "new_format" in parameters and parameters["new_format"].default is True + + +def _set_sampler_epoch(dataloader: DataLoader, epoch: int) -> None: + """Calls the ``set_epoch`` method on either the sampler or the batch sampler of the given dataloader. + + Every PyTorch dataloader has either a sampler or a batch sampler, and if it is wrapped by a + :class:`~torch.utils.data.distributed.DistributedSampler`, ``set_epoch`` must be called at the beginning + of every epoch to ensure shuffling applies a new ordering. This has no effect if shuffling is off. 
+ """ + for sampler_name in ("sampler", "batch_sampler"): + sampler = getattr(dataloader, sampler_name, None) + if sampler is not None and callable(getattr(sampler, "set_epoch", None)): + sampler.set_epoch(epoch) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index b8f688892b318..6d3ec88b0be6a 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -438,9 +438,14 @@ class DataLoaderDict(dict): @property def sampler(self) -> Union[Iterable, Sequence, Mapping]: - """Return a collections of samplers extracting from loaders.""" + """Return a collections of samplers extracted from loaders.""" return apply_to_collection(self.loaders, (DataLoader, IterableDataset), getattr, "sampler", None) + @property + def batch_sampler(self) -> Union[Iterable, Sequence, Mapping]: + """Return a collections of batch samplers extracted from loaders.""" + return apply_to_collection(self.loaders, (DataLoader, IterableDataset), getattr, "batch_sampler", None) + def _wrap_loaders_max_size_cycle(self) -> Any: """Wraps all loaders to make sure they are cycled until the longest loader is exhausted. diff --git a/tests/loops/test_evaluation_loop.py b/tests/loops/test_evaluation_loop.py index 137608c426ee0..bddf819aafdd6 100644 --- a/tests/loops/test_evaluation_loop.py +++ b/tests/loops/test_evaluation_loop.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. from unittest import mock -from unittest.mock import Mock +from unittest.mock import call, Mock import torch from torch.utils.data.dataloader import DataLoader -from torch.utils.data.sampler import RandomSampler +from torch.utils.data.sampler import BatchSampler, RandomSampler from pytorch_lightning import Trainer from pytorch_lightning.loops import EvaluationEpochLoop @@ -44,9 +44,8 @@ def test_on_evaluation_epoch_end(eval_epoch_end_mock, tmpdir): assert eval_epoch_end_mock.call_count == 4 -def test_set_epoch_called_eval_predict(tmpdir): - """Tests that set_epoch (if the sampler has one) is called on the DataLoader during evaluation and - prediction.""" +def test_evaluation_loop_sampler_set_epoch_called(tmpdir): + """Tests that set_epoch is called on the dataloader's sampler (if any) during training and validation.""" def _get_dataloader(): dataset = RandomDataset(32, 64) @@ -56,20 +55,60 @@ def _get_dataloader(): model = BoringModel() trainer = Trainer( - default_root_dir=tmpdir, limit_train_batches=2, limit_val_batches=2, max_epochs=2, enable_model_summary=False + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + logger=False, + ) + + train_dataloader = _get_dataloader() + val_dataloader = _get_dataloader() + trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) + # One for each epoch + assert train_dataloader.sampler.set_epoch.call_args_list == [call(0), call(1)] + # One for each epoch + sanity check + assert val_dataloader.sampler.set_epoch.call_args_list == [call(0), call(0), call(1)] + + val_dataloader = _get_dataloader() + trainer.validate(model, val_dataloader) + assert val_dataloader.sampler.set_epoch.call_args_list == [call(2)] + + +def test_evaluation_loop_batch_sampler_set_epoch_called(tmpdir): + """Tests that set_epoch is called on the dataloader's batch sampler (if any) during training and validation.""" + + def _get_dataloader(): + dataset = 
RandomDataset(32, 64) + sampler = RandomSampler(dataset) + batch_sampler = BatchSampler(sampler, 2, True) + batch_sampler.set_epoch = Mock() + return DataLoader(dataset, batch_sampler=batch_sampler) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + enable_model_summary=False, + enable_checkpointing=False, + logger=False, ) train_dataloader = _get_dataloader() val_dataloader = _get_dataloader() trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader) # One for each epoch - assert train_dataloader.sampler.set_epoch.call_count == 2 + assert train_dataloader.batch_sampler.set_epoch.call_args_list == [call(0), call(1)] # One for each epoch + sanity check - assert val_dataloader.sampler.set_epoch.call_count == 3 + assert val_dataloader.batch_sampler.set_epoch.call_args_list == [call(0), call(0), call(1)] val_dataloader = _get_dataloader() trainer.validate(model, val_dataloader) - assert val_dataloader.sampler.set_epoch.call_count == 1 + assert val_dataloader.batch_sampler.set_epoch.call_args_list == [call(2)] @mock.patch( diff --git a/tests/loops/test_utilities.py b/tests/loops/test_utilities.py index c5d2e98d008b0..914c1de8e115b 100644 --- a/tests/loops/test_utilities.py +++ b/tests/loops/test_utilities.py @@ -11,10 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest.mock import Mock + import pytest import torch -from pytorch_lightning.loops.utilities import _extract_hiddens, _v1_8_output_format +from pytorch_lightning.loops.utilities import _extract_hiddens, _set_sampler_epoch, _v1_8_output_format from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,3 +63,23 @@ def training_epoch_end(outputs, new_format=True): ... assert _v1_8_output_format(training_epoch_end) + + +def test_set_sampler_epoch(): + # No samplers + dataloader = Mock() + dataloader.sampler = None + dataloader.batch_sampler = None + _set_sampler_epoch(dataloader, 55) + + # set_epoch not callable + dataloader = Mock() + dataloader.sampler.set_epoch = None + dataloader.batch_sampler.set_epoch = None + _set_sampler_epoch(dataloader, 55) + + # set_epoch callable + dataloader = Mock() + _set_sampler_epoch(dataloader, 55) + dataloader.sampler.set_epoch.assert_called_once_with(55) + dataloader.batch_sampler.set_epoch.assert_called_once_with(55) From 609d923063aee1639ebc087a2310c490255b6b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 30 Jun 2022 00:42:34 +0200 Subject: [PATCH 13/24] _RICH_AVAILABLE --- pytorch_lightning/callbacks/progress/rich_progress.py | 5 ++++- pytorch_lightning/callbacks/rich_model_summary.py | 2 +- pytorch_lightning/loops/dataloader/evaluation_loop.py | 2 +- pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/imports.py | 2 +- tests/helpers/runif.py | 2 +- tests/trainer/logging_/test_eval_loop_logging.py | 3 ++- 7 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index fb5914a7a5d41..ad68224c8879a 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import math +import operator from dataclasses import dataclass from datetime import timedelta from typing import Any, Dict, Optional, Union import pytorch_lightning as pl from pytorch_lightning.callbacks.progress.base import ProgressBarBase -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE +from pytorch_lightning.utilities.imports import _compare_version, _package_available + +_RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") Task, Style = None, None if _RICH_AVAILABLE: diff --git a/pytorch_lightning/callbacks/rich_model_summary.py b/pytorch_lightning/callbacks/rich_model_summary.py index 148de6275950e..f2833bb33ff8c 100644 --- a/pytorch_lightning/callbacks/rich_model_summary.py +++ b/pytorch_lightning/callbacks/rich_model_summary.py @@ -14,7 +14,7 @@ from typing import List, Tuple from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.utilities.model_summary import get_human_readable_count if _RICH_AVAILABLE: diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index d45df5885c0ec..46ea5ba631d10 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -24,6 +24,7 @@ import pytorch_lightning as pl from pytorch_lightning.accelerators import GPUAccelerator +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop from pytorch_lightning.loops.utilities import _set_sampler_epoch @@ -37,7 +38,6 @@ DataLoaderIterDataFetcher, InterBatchParallelDataFetcher, ) -from pytorch_lightning.utilities.imports import _RICH_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.types import EPOCH_OUTPUT diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 87947ac9a10f3..8f554ad8cb4d1 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -46,7 +46,6 @@ _module_available, _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, - _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 2ed4a49b5d902..8f5c6f8f311c8 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -116,7 +116,7 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _OMEGACONF_AVAILABLE = _package_available("omegaconf") _POPTORCH_AVAILABLE = _package_available("poptorch") _HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") -_RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") + _TORCH_QUANTIZE_AVAILABLE = bool([eg for eg in torch.backends.quantized.supported_engines if eg != "none"]) _TORCHTEXT_AVAILABLE = _package_available("torchtext") _TORCHTEXT_LEGACY: bool = _TORCHTEXT_AVAILABLE and _compare_version("torchtext", operator.lt, "0.11.0") diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 5a2464f6fd6ba..9adda205c0e63 100644 --- 
a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -20,6 +20,7 @@ from packaging.version import Version from pkg_resources import get_distribution +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _BAGUA_AVAILABLE, @@ -30,7 +31,6 @@ _HPU_AVAILABLE, _IPU_AVAILABLE, _OMEGACONF_AVAILABLE, - _RICH_AVAILABLE, _TORCH_GREATER_EQUAL_1_10, _TORCH_QUANTIZE_AVAILABLE, _TPU_AVAILABLE, diff --git a/tests/trainer/logging_/test_eval_loop_logging.py b/tests/trainer/logging_/test_eval_loop_logging.py index 4457aba18e796..d41044240fa92 100644 --- a/tests/trainer/logging_/test_eval_loop_logging.py +++ b/tests/trainer/logging_/test_eval_loop_logging.py @@ -25,11 +25,12 @@ import torch from pytorch_lightning import callbacks, Trainer +from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.loops.dataloader import EvaluationLoop from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _RICH_AVAILABLE +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 from tests.helpers import BoringModel, RandomDataset from tests.helpers.runif import RunIf From 05d7931a6a2daaf9af9a9ebc95b4d7d62d4e4f77 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 1 Jul 2022 23:22:32 +0200 Subject: [PATCH 14/24] _FAIRSCALE_AVAILABLE --- pytorch_lightning/overrides/fairscale.py | 9 ++++++++- .../plugins/precision/sharded_native_amp.py | 2 +- pytorch_lightning/strategies/ddp.py | 2 +- pytorch_lightning/strategies/fully_sharded.py | 2 +- pytorch_lightning/strategies/sharded.py | 2 +- pytorch_lightning/strategies/sharded_spawn.py | 2 +- pytorch_lightning/utilities/__init__.py | 3 --- pytorch_lightning/utilities/imports.py | 3 --- tests/helpers/runif.py | 3 +-- tests/plugins/precision/test_sharded_precision.py | 2 +- .../test_ddp_fully_sharded_with_full_state_dict.py | 2 +- tests/strategies/test_sharded_strategy.py | 2 +- tests/utilities/test_imports.py | 2 +- 13 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index c33bed60902a1..f02047d5ee0e8 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -11,11 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import operator + import torch.nn as nn import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE +from pytorch_lightning.utilities import _module_available +from pytorch_lightning.utilities.imports import _compare_version, _IS_WINDOWS + +_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") +_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") +_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") LightningShardedDataParallel = None if _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/plugins/precision/sharded_native_amp.py b/pytorch_lightning/plugins/precision/sharded_native_amp.py index e40aea8ecf4eb..a1484559e5ce4 100644 --- a/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -15,8 +15,8 @@ import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException if _FAIRSCALE_AVAILABLE: diff --git a/pytorch_lightning/strategies/ddp.py b/pytorch_lightning/strategies/ddp.py index 453c821b3f59a..16caaffc0819f 100644 --- a/pytorch_lightning/strategies/ddp.py +++ b/pytorch_lightning/strategies/ddp.py @@ -30,6 +30,7 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.distributed import prepare_for_backward +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -50,7 +51,6 @@ ) from pytorch_lightning.utilities.exceptions import DeadlockDetectedException from pytorch_lightning.utilities.imports import ( - _FAIRSCALE_AVAILABLE, _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_9, _TORCH_GREATER_EQUAL_1_10, diff --git a/pytorch_lightning/strategies/fully_sharded.py b/pytorch_lightning/strategies/fully_sharded.py index 971441160d333..7dea5a3f0d9ad 100644 --- a/pytorch_lightning/strategies/fully_sharded.py +++ b/pytorch_lightning/strategies/fully_sharded.py @@ -18,12 +18,12 @@ import torch import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device diff --git a/pytorch_lightning/strategies/sharded.py b/pytorch_lightning/strategies/sharded.py index 8a76520755345..06fc8c0f34a42 100644 --- 
a/pytorch_lightning/strategies/sharded.py +++ b/pytorch_lightning/strategies/sharded.py @@ -20,11 +20,11 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/strategies/sharded_spawn.py b/pytorch_lightning/strategies/sharded_spawn.py index 58ad47f464bfc..40b5f359fe9c5 100644 --- a/pytorch_lightning/strategies/sharded_spawn.py +++ b/pytorch_lightning/strategies/sharded_spawn.py @@ -19,10 +19,10 @@ from torch.optim import Optimizer import pytorch_lightning as pl +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 8f554ad8cb4d1..cced745664435 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -31,9 +31,6 @@ _APEX_AVAILABLE, _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, - _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE, _GROUP_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 8f5c6f8f311c8..0a3984210acb0 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -102,9 +102,6 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _DEEPSPEED_GREATER_EQUAL_0_5_9 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.5.9") _DEEPSPEED_GREATER_EQUAL_0_6 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.6.0") _DOCSTRING_PARSER_AVAILABLE = _package_available("docstring_parser") -_FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") -_FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") -_FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") _GROUP_AVAILABLE = not _IS_WINDOWS and _module_available("torch.distributed.group") _HOROVOD_AVAILABLE = _module_available("horovod.torch") _HYDRA_AVAILABLE = _package_available("hydra") diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 9adda205c0e63..fe3d39bc8ea2b 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -21,12 +21,11 @@ from pkg_resources import get_distribution from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE +from pytorch_lightning.overrides.fairscale 
import _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, - _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, _IPU_AVAILABLE, diff --git a/tests/plugins/precision/test_sharded_precision.py b/tests/plugins/precision/test_sharded_precision.py index 754095912fb53..8fde1946459b2 100644 --- a/tests/plugins/precision/test_sharded_precision.py +++ b/tests/plugins/precision/test_sharded_precision.py @@ -15,8 +15,8 @@ import pytest import torch +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from tests.helpers.runif import RunIf ShardedGradScaler = None diff --git a/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py b/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py index 2912d59598220..0a26236acecdd 100644 --- a/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py +++ b/tests/strategies/test_ddp_fully_sharded_with_full_state_dict.py @@ -7,9 +7,9 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.plugins import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities import _FAIRSCALE_FULLY_SHARDED_AVAILABLE from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf diff --git a/tests/strategies/test_sharded_strategy.py b/tests/strategies/test_sharded_strategy.py index 8a1313e5a6a45..dff7ca0a0d75d 100644 --- a/tests/strategies/test_sharded_strategy.py +++ b/tests/strategies/test_sharded_strategy.py @@ -6,9 +6,9 @@ import torch from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy from pytorch_lightning.trainer.states import TrainerFn -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py index aa40f71da4982..94b17a91ec629 100644 --- a/tests/utilities/test_imports.py +++ b/tests/utilities/test_imports.py @@ -13,11 +13,11 @@ # limitations under the License. 
import operator +from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, - _FAIRSCALE_AVAILABLE, _HOROVOD_AVAILABLE, _module_available, _OMEGACONF_AVAILABLE, From d4b691f88adef10733ace60eadb840ceb0cc067a Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 1 Jul 2022 23:39:22 +0200 Subject: [PATCH 15/24] _BAGUA_AVAILABLE --- pytorch_lightning/strategies/bagua.py | 4 +++- pytorch_lightning/utilities/__init__.py | 1 - pytorch_lightning/utilities/imports.py | 1 - tests/helpers/runif.py | 2 +- tests/utilities/test_imports.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/strategies/bagua.py b/pytorch_lightning/strategies/bagua.py index 6b6598c790b36..9bf619aac8594 100644 --- a/pytorch_lightning/strategies/bagua.py +++ b/pytorch_lightning/strategies/bagua.py @@ -19,10 +19,12 @@ from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _BAGUA_AVAILABLE +from pytorch_lightning.utilities.imports import _package_available from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.seed import reset_seed +_BAGUA_AVAILABLE = _package_available("bagua") + if _BAGUA_AVAILABLE: import bagua.torch_api as bagua from bagua.torch_api.algorithms import Algorithm diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index cced745664435..b5b70e7220a8d 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -29,7 +29,6 @@ from pytorch_lightning.utilities.grads import grad_norm # noqa: F401 from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, _GROUP_AVAILABLE, _HOROVOD_AVAILABLE, diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 0a3984210acb0..b9884b29cb999 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -97,7 +97,6 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version: _TORCH_GREATER_EQUAL_1_12 = _compare_version("torch", operator.ge, "1.12.0", use_base_version=True) _APEX_AVAILABLE = _module_available("apex.amp") -_BAGUA_AVAILABLE = _package_available("bagua") _DEEPSPEED_AVAILABLE = _package_available("deepspeed") _DEEPSPEED_GREATER_EQUAL_0_5_9 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.5.9") _DEEPSPEED_GREATER_EQUAL_0_6 = _DEEPSPEED_AVAILABLE and _compare_version("deepspeed", operator.ge, "0.6.0") diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index fe3d39bc8ea2b..ed1b8ee7bc044 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -22,9 +22,9 @@ from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE +from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, diff --git a/tests/utilities/test_imports.py b/tests/utilities/test_imports.py index 94b17a91ec629..629517d9c51f9 100644 --- a/tests/utilities/test_imports.py +++ 
b/tests/utilities/test_imports.py @@ -14,9 +14,9 @@ import operator from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE +from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.utilities import ( _APEX_AVAILABLE, - _BAGUA_AVAILABLE, _DEEPSPEED_AVAILABLE, _HOROVOD_AVAILABLE, _module_available, From 9b6c97ea37a1748500e0a138176af7eabb987b8f Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 1 Jul 2022 23:50:51 +0200 Subject: [PATCH 16/24] redefine --- pytorch_lightning/callbacks/progress/rich_progress.py | 2 +- pytorch_lightning/overrides/fairscale.py | 7 +++++-- pytorch_lightning/overrides/torch_distributed.py | 6 +++--- pytorch_lightning/utilities/meta.py | 6 +++++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/callbacks/progress/rich_progress.py b/pytorch_lightning/callbacks/progress/rich_progress.py index ad68224c8879a..9dcbb1f4522fd 100644 --- a/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/pytorch_lightning/callbacks/progress/rich_progress.py @@ -21,7 +21,7 @@ from pytorch_lightning.callbacks.progress.base import ProgressBarBase from pytorch_lightning.utilities.imports import _compare_version, _package_available -_RICH_AVAILABLE = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") +_RICH_AVAILABLE: bool = _package_available("rich") and _compare_version("rich", operator.ge, "10.2.2") Task, Style = None, None if _RICH_AVAILABLE: diff --git a/pytorch_lightning/overrides/fairscale.py b/pytorch_lightning/overrides/fairscale.py index f02047d5ee0e8..9ab860774cf31 100644 --- a/pytorch_lightning/overrides/fairscale.py +++ b/pytorch_lightning/overrides/fairscale.py @@ -24,11 +24,10 @@ _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.3") _FAIRSCALE_FULLY_SHARDED_AVAILABLE = _FAIRSCALE_AVAILABLE and _compare_version("fairscale", operator.ge, "0.3.4") -LightningShardedDataParallel = None if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel - class LightningShardedDataParallel(_LightningModuleWrapperBase): # type: ignore[no-redef] + class LightningShardedDataParallel(_LightningModuleWrapperBase): # Just do this for later docstrings pass @@ -38,3 +37,7 @@ def unwrap_lightning_module_sharded(wrapped_model: nn.Module) -> "pl.LightningMo model = model.module return unwrap_lightning_module(model) + +else: + LightningShardedDataParallel = ... # type: ignore[assignment,misc] + unwrap_lightning_module_sharded = ... 
# type: ignore[assignment] diff --git a/pytorch_lightning/overrides/torch_distributed.py b/pytorch_lightning/overrides/torch_distributed.py index ec09b7723132d..26d8ca425bb4d 100644 --- a/pytorch_lightning/overrides/torch_distributed.py +++ b/pytorch_lightning/overrides/torch_distributed.py @@ -10,10 +10,10 @@ _pickler = pickle.Pickler _unpickler = pickle.Unpickler - +_TORCH_DIST_AVAILABLE = torch.distributed.is_available() logger = logging.getLogger(__name__) -if torch.distributed.is_available(): +if _TORCH_DIST_AVAILABLE: from torch._C._distributed_c10d import ProcessGroup from torch.distributed import Backend, broadcast, get_backend, get_rank, GroupMember @@ -157,7 +157,7 @@ def _broadcast_object_list(object_list, src=0, group=None, device=None): object_list[i] = _tensor_to_object(obj_view, obj_size) -if not torch.distributed.is_available(): +if not _TORCH_DIST_AVAILABLE: # avoid failures on early PyTorch versions for Windows where # not all functions used in `broadcast_object_list` are available. def _broadcast_noop(obj, *_, **__): diff --git a/pytorch_lightning/utilities/meta.py b/pytorch_lightning/utilities/meta.py index a5edcfb300188..ef2327d81119e 100644 --- a/pytorch_lightning/utilities/meta.py +++ b/pytorch_lightning/utilities/meta.py @@ -13,6 +13,7 @@ # limitations under the License. import importlib import inspect +import operator import threading from contextlib import contextmanager from functools import partial @@ -27,9 +28,12 @@ import pytorch_lightning as pl from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_10 +from pytorch_lightning.utilities.imports import _compare_version from pytorch_lightning.utilities.rank_zero import rank_zero_warn +# this is needed for proper generating Meta package +_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") + if _TORCH_GREATER_EQUAL_1_10: from torch._C import _DisableTorchDispatch # type: ignore[attr-defined] From c02369128f8116afcea34219dceb2546cf78a6c9 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 4 Jul 2022 15:18:32 +0200 Subject: [PATCH 17/24] chlog spaces --- CHANGELOG.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed3095391fdf3..b9f9613be09a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,15 +10,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) - - - Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420)) - - - The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396)) - ## [1.6.4] - 2022-06-01 ### Added From e70889f92279cc7ee4886050eda72bd29c218f17 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 5 Jul 2022 04:45:03 +0900 Subject: [PATCH 18/24] CI: Fix `fatal: unsafe repository` (#13515) --- .github/workflows/ci_test-conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 0466a83760338..1064c7a095d54 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -31,7 +31,7 @@ jobs: timeout-minutes: 30 steps: - name: Workaround for https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory /__w/pytorch-lightning/pytorch-lightning + run: git config --global --add safe.directory /__w/lightning/lightning - uses: actions/checkout@v2 From a1484801e2f5522ea2573f2f93d19468af85ed41 Mon Sep 17 00:00:00 2001 From: rohitgr7 Date: Mon, 11 Jul 2022 23:34:21 +0530 Subject: [PATCH 19/24] update release date --- CHANGELOG.md | 2 +- pytorch_lightning/__about__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9f9613be09a7..2dc5dcc777752 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.6.5] - 2022-07-05 +## [1.6.5] - 2022-07-12 ### Fixed diff --git a/pytorch_lightning/__about__.py b/pytorch_lightning/__about__.py index fdaa499392c7e..ec1d1701bebd2 100644 --- a/pytorch_lightning/__about__.py +++ b/pytorch_lightning/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "1.6.4" +__version__ = "1.6.5" __author__ = "William Falcon et al." 
__author_email__ = "waf2107@columbia.edu" __license__ = "Apache-2.0" From e1d55b6d0d2a1e617d48fd4d3e64216a6d5be2f4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 12 Jul 2022 11:24:42 +0200 Subject: [PATCH 20/24] CI: azure rename --- {.azure-pipelines => .azure}/gpu-benchmark.yml | 0 {.azure-pipelines => .azure}/gpu-tests.yml | 0 {.azure-pipelines => .azure}/hpu-tests.yml | 0 {.azure-pipelines => .azure}/ipu-tests.yml | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {.azure-pipelines => .azure}/gpu-benchmark.yml (100%) rename {.azure-pipelines => .azure}/gpu-tests.yml (100%) rename {.azure-pipelines => .azure}/hpu-tests.yml (100%) rename {.azure-pipelines => .azure}/ipu-tests.yml (100%) diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure/gpu-benchmark.yml similarity index 100% rename from .azure-pipelines/gpu-benchmark.yml rename to .azure/gpu-benchmark.yml diff --git a/.azure-pipelines/gpu-tests.yml b/.azure/gpu-tests.yml similarity index 100% rename from .azure-pipelines/gpu-tests.yml rename to .azure/gpu-tests.yml diff --git a/.azure-pipelines/hpu-tests.yml b/.azure/hpu-tests.yml similarity index 100% rename from .azure-pipelines/hpu-tests.yml rename to .azure/hpu-tests.yml diff --git a/.azure-pipelines/ipu-tests.yml b/.azure/ipu-tests.yml similarity index 100% rename from .azure-pipelines/ipu-tests.yml rename to .azure/ipu-tests.yml From 0ed112e5c41e7003d574ea8552b5c4362ebeedb8 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 12 Jul 2022 15:15:59 +0530 Subject: [PATCH 21/24] Restore log step during restart (#13467) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 1 + pytorch_lightning/loops/epoch/training_epoch_loop.py | 2 ++ tests/loops/test_loop_state_dict.py | 2 +- tests/models/test_restore.py | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dc5dcc777752..455fa67add3f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
 - Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350))
 - Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420))
 - The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396))
+- Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467))


 ## [1.6.4] - 2022-06-01

diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py
index b6887a4cf546c..395471861a289 100644
--- a/pytorch_lightning/loops/epoch/training_epoch_loop.py
+++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -281,6 +281,7 @@ def teardown(self) -> None:

     def on_save_checkpoint(self) -> Dict:
         state_dict = super().on_save_checkpoint()
+        state_dict["_batches_that_stepped"] = self._batches_that_stepped

         if (
             self.trainer is not None
@@ -300,6 +301,7 @@ def on_save_checkpoint(self) -> Dict:
     def on_load_checkpoint(self, state_dict: Dict) -> None:
         # cache the dataloader state dict until the dataloader objects are available
         self._dataloader_state_dict = state_dict.get("dataloader_state_dict")
+        self._batches_that_stepped = state_dict.get("_batches_that_stepped", 0)

     def _run_validation(self) -> None:
         # reload dataloaders
diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py
index 1e67fcc0ed8db..f9630095502d1 100644
--- a/tests/loops/test_loop_state_dict.py
+++ b/tests/loops/test_loop_state_dict.py
@@ -47,7 +47,7 @@ def test_loops_state_dict_structure():
     expected = {
         "fit_loop": {
             "state_dict": {},
-            "epoch_loop.state_dict": {},
+            "epoch_loop.state_dict": {"_batches_that_stepped": 0},
             "epoch_loop.batch_progress": {
                 "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0},
                 "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0},
diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py
index 136e8ee516bbb..d0519de90e7a2 100644
--- a/tests/models/test_restore.py
+++ b/tests/models/test_restore.py
@@ -259,6 +259,7 @@ def on_train_start(self) -> None:
     trainer.fit(TestModel(), ckpt_path=ckpt_path)
     assert trainer.current_epoch == max_epochs
     assert trainer.global_step == max_epochs * train_batches
+    assert trainer.fit_loop.epoch_loop._batches_that_stepped == max_epochs * train_batches


 def test_fit_twice(tmpdir):

From 2780a926287861c79716709f8e1e525e96988087 Mon Sep 17 00:00:00 2001
From: rohitgr7
Date: Tue, 12 Jul 2022 18:01:45 +0530
Subject: [PATCH 22/24] remove redundant test

---
 tests/trainer/test_dataloaders.py | 49 +------------------------------
 1 file changed, 1 insertion(+), 48 deletions(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 108ea323ecd89..bbd2d61f3d03d 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -19,7 +19,7 @@
 import torch
 from torch.utils.data import RandomSampler
 from torch.utils.data.dataloader import DataLoader
-from torch.utils.data.dataset import Dataset, IterableDataset, Subset
+from torch.utils.data.dataset import Dataset, IterableDataset
 from torch.utils.data.distributed import DistributedSampler
 from torch.utils.data.sampler import SequentialSampler

@@ -855,53 +855,6 @@ def test_dataloader_distributed_sampler_already_attached(tmpdir):
     assert trainer.state.finished, "DDP Training failed"


-@RunIf(min_gpus=3)
-def test_batch_size_smaller_than_num_gpus(tmpdir):
-    # we need at least 3 gpus for this test
-    num_gpus = 3
-    batch_size = 3
-
-    class CurrentTestModel(BoringModel):
-        def __init__(self, batch_size) -> None:
-            super().__init__()
-            self.save_hyperparameters()
-            # batch norm doesn't work with batch size 1, we replace it
-            self.c_d1_bn = torch.nn.ReLU()
-
-        def training_step(self, *args, **kwargs):
-            output = super().training_step(*args, **kwargs)
-            loss = output["loss"]
-            # we make sure to add some metrics to the output dict,
-            # this is essential for this test
-            output["progress_bar"] = {"train_loss": loss}
-            return output
-
-        def train_dataloader(self):
-            dataset = RandomDataset(32, 64)
-            # construct a dataset with a size that is not divisible by num_gpus
-            # therefore the last batch will have a size < num_gpus
-            size = num_gpus * self.hparams.batch_size + (num_gpus - 1)
-            dataset = Subset(dataset, range(size))
-            dataloader = DataLoader(dataset, batch_size=self.hparams.batch_size, drop_last=False)
-            return dataloader
-
-    model = CurrentTestModel(batch_size=batch_size)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        limit_train_batches=0.1,
-        limit_val_batches=0,
-        accelerator="gpu",
-        devices=num_gpus,
-    )
-
-    # we expect the reduction for the metrics also to happen on the last batch
-    # where we will get fewer metrics than gpus
-    trainer.fit(model)
-    assert trainer.state.finished, f"Training failed with {trainer.state}"
-
-
 @pytest.mark.parametrize(
     ["multiple_trainloader_mode", "num_training_batches"],
     [("min_size", 16), ("max_size_cycle", 64)],

From 2f38d986cb65bc0128422b0ef7eeb528883aba3a Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Tue, 14 Jun 2022 19:11:54 +0200
Subject: [PATCH 23/24] Update CI setup (#13291)

* drop mamba

* use legacy GPU machines
---
 .azure/gpu-benchmark.yml | 2 +-
 .azure/gpu-tests.yml | 2 +-
 .github/workflows/ci_dockers.yml | 4 ++--
 .github/workflows/events-nightly.yml | 4 ++--
 dockers/base-conda/Dockerfile | 7 +++----
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml
index cfccbf7081f14..451f5b5646dca 100644
--- a/.azure/gpu-benchmark.yml
+++ b/.azure/gpu-benchmark.yml
@@ -26,7 +26,7 @@ jobs:
   - job: benchmarks
     timeoutInMinutes: "90"
     cancelTimeoutInMinutes: "2"
-    pool: azure-gpus-spot
+    pool: azure-jirka-spot
     container:
       image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 157bb1d535f9a..5ec9db1297b43 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -29,7 +29,7 @@ jobs:
     # how much time to give 'run always even if cancelled tasks' before stopping them
     cancelTimeoutInMinutes: "2"

-    pool: azure-gpus-spot
+    pool: azure-jirka-spot

     container:
       image: $(image)
diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index 94d5be6901c5a..c9ed112516ff4 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -97,7 +97,7 @@ jobs:
             UBUNTU_VERSION=${{ matrix.ubuntu_version }}
           file: dockers/base-cuda/Dockerfile
           push: false
-        timeout-minutes: 75
+        timeout-minutes: 95

   build-Conda:
     runs-on: ubuntu-20.04
@@ -123,7 +123,7 @@ jobs:
             CUDA_VERSION=${{ matrix.cuda_version }}
           file: dockers/base-conda/Dockerfile
           push: false
-        timeout-minutes: 75
+        timeout-minutes: 95

   build-ipu:
     runs-on: ubuntu-20.04
diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml
index 91a666dd2f6dd..4496e176d9720 100644
--- a/.github/workflows/events-nightly.yml
+++ b/.github/workflows/events-nightly.yml
@@ -146,7 +146,7 @@ jobs:
           file: dockers/base-cuda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
-        timeout-minutes: 85
+        timeout-minutes: 95

       # report failure to Slack
       - name: Slack notification
@@ -197,7 +197,7 @@ jobs:
           file: dockers/base-conda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
-        timeout-minutes: 85
+        timeout-minutes: 95

       # report failure to Slack
       - name: Slack notification
diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile
index 05227707b31fa..72603b04ffd64 100644
--- a/dockers/base-conda/Dockerfile
+++ b/dockers/base-conda/Dockerfile
@@ -72,16 +72,15 @@ COPY environment.yml environment.yml

 # conda init
 RUN conda update -n base -c defaults conda && \
-    conda install mamba -n base -c conda-forge && \
-    mamba create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \
+    conda create -y --name $CONDA_ENV python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \
     conda init bash && \
     # NOTE: this requires that the channel is presented in the yaml before packages \
     printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n    req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \
     python prune.py && \
     rm prune.py && \
     cat environment.yml && \
-    mamba env update --name $CONDA_ENV --file environment.yml && \
-    mamba clean -ya && \
+    conda env update --name $CONDA_ENV --file environment.yml && \
+    conda clean -ya && \
     rm environment.yml

 ENV \

From 3a0328c5174d8b6aaa59c9f61e8c5d538adebf94 Mon Sep 17 00:00:00 2001
From: Jirka
Date: Wed, 13 Jul 2022 00:08:57 +0200
Subject: [PATCH 24/24] fix schema check

---
 .github/workflows/ci_schema.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml
index 54efaff27a201..03b230124085d 100644
--- a/.github/workflows/ci_schema.yml
+++ b/.github/workflows/ci_schema.yml
@@ -21,4 +21,4 @@ jobs:
       - name: Azure Pipelines
         env:
           SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
-        run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
+        run: check-jsonschema .azure/*.yml --schemafile "$SCHEMA_FILE"