diff --git a/.mergify.yml b/.mergify.yml index 44c48f2ddced5e..cb5ef3ec7519a8 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -12,59 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. -pull_request_rules: - - - name: Automatic merge on approval - conditions: - - base=master - # number of review approvals - - "#approved-reviews-by>=3" - # no waiting or assigned review - - "#review-requested=0" - # no requested chnages from any reviewer - - "#changes-requested-reviews-by=0" - # this serves as ALL check has to pass as we have actually around 40 tests in total - - "#status-success>=54" - # this is just in case since we rely on GPU tests (note: redundand to the above) - - status-success=continuous-integration/drone/pr - - "status-success=ci/circleci: TPU-tests" - # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above) - #- "status-success~=^ci/circleci:" - # no conflict with master branch - - -conflict - # was not closed yet - - -closed - # filter-out GH draft PRs - - -draft - actions: - delete_head_branch: {} - merge: - # https://doc.mergify.io/merge-action.html#strict-merge - # (on head branch) $ git merge --no-ff base - # (on head branch) # Wait for CI to go green - # (on head branch) # Squash all commits - # (on base branch) $ git merge --ff head - strict: true - method: squash - comment: - message: Great job! =) - - - name: warn on conflicts - conditions: - - conflict - # filter-out GH draft PRs - - -draft - actions: - comment: - message: This pull request is now in conflict... :( - - - name: add core reviewer - conditions: - # filter-out GH draft PRs - - -draft - # number of review approvals - - "#approved-reviews-by<3" - actions: - request_reviews: - teams: - - core-contributors +#pull_request_rules: +# +# - name: Automatic merge on approval +# conditions: +# - base=master +# # number of review approvals +# - "#approved-reviews-by>=3" +# # no waiting or assigned review +# - "#review-requested=0" +# # no requested chnages from any reviewer +# - "#changes-requested-reviews-by=0" +# # this serves as ALL check has to pass as we have actually around 40 tests in total +# - "#status-success>=54" +# # this is just in case since we rely on GPU tests (note: redundand to the above) +# - status-success=continuous-integration/drone/pr +# - "status-success=ci/circleci: TPU-tests" +# # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above) +# #- "status-success~=^ci/circleci:" +# # no conflict with master branch +# - -conflict +# # was not closed yet +# - -closed +# # filter-out GH draft PRs +# - -draft +# actions: +# delete_head_branch: {} +# merge: +# # https://doc.mergify.io/merge-action.html#strict-merge +# # (on head branch) $ git merge --no-ff base +# # (on head branch) # Wait for CI to go green +# # (on head branch) # Squash all commits +# # (on base branch) $ git merge --ff head +# strict: true +# method: squash +# comment: +# message: Great job! =) +# +# - name: warn on conflicts +# conditions: +# - conflict +# # filter-out GH draft PRs +# - -draft +# actions: +# comment: +# message: This pull request is now in conflict... 
:( +# +# - name: add core reviewer +# conditions: +# # filter-out GH draft PRs +# - -draft +# # number of review approvals +# - "#approved-reviews-by<3" +# actions: +# request_reviews: +# teams: +# - core-contributors diff --git a/CHANGELOG.md b/CHANGELOG.md index f078349ef3665d..051fe5fae09e5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased.Features] - YYYY-MM-DD +## [unreleased.BugFix] - YYYY-MM-DD ### Added @@ -22,28 +22,37 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - -## [unreleased.BugFix] - YYYY-MM-DD +## [1.1.1] - 2020-12-15 ### Added +- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818)) ### Changed - -### Deprecated +- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015)) +- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593)) +- Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) ### Removed +- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014)) +- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076)) ### Fixed - Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) - - -- Fixed `LightningOptimizer` exposes optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) - +- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) +- Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) +- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981)) +- Extend LightningOptimizer to expose underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) +- Add deprecated metric utility functions back to functional ( + [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067), + [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068)) +- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378)) + +- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) ## [1.1.0] - 2020-12-09 diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py index 41bba9533e10d7..3508d5a3c28acc 100644 --- a/benchmarks/test_parity.py +++ b/benchmarks/test_parity.py @@ -4,8 +4,8 @@ import pytest import torch +from pytorch_lightning import seed_everything, Trainer import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, seed_everything from tests.base.models import ParityModuleMNIST, ParityModuleRNN diff --git 
a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 9fe49764421785..2e52613462621c 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -6,7 +6,7 @@ import pytest import torch -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.plugins.ddp_plugin import DDPPlugin from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin from pytorch_lightning.utilities import FAIRSCALE_AVAILABLE, NATIVE_AMP_AVAILABLE diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 8eb093295c37bb..5dfeac8c9e86ea 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -97,6 +97,8 @@ RUN \ python -c "fname = 'requirements.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torch')] ; open(fname, 'w').writelines(lines)" && \ # drop Horovod as it is not needed python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + # drop fairscale as it is not needed + python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \ # drop TorchVision as it was installed with XLA python -c "fname = 'requirements/examples.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torchvision')] ; open(fname, 'w').writelines(lines)" && \ pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed && \ diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index a514b1c3d35fed..464f7fd8f309eb 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -27,8 +27,10 @@ COPY ./ ./pytorch-lightning/ RUN \ # Install pytorch-lightning at the current PR, plus dependencies. 
#pip install -r pytorch-lightning/requirements.txt --no-cache-dir && \ - # drop Horovod + # drop Horovod as it is not needed python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ + # drop fairscale as it is not needed + python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \ pip install -r pytorch-lightning/requirements/devel.txt --no-cache-dir --upgrade-strategy only-if-needed #RUN python -c "import pytorch_lightning as pl; print(pl.__version__)" diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index d6d082e2ed779b..d4cf578e10bda2 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -601,8 +601,8 @@ In this method we do all the preparation we need to do once (instead of on every def setup(self, stage): # transform transform=transforms.Compose([transforms.ToTensor()]) - MNIST(os.getcwd(), train=True, download=False, transform=transform) - MNIST(os.getcwd(), train=False, download=False, transform=transform) + mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform) + mnist_test = MNIST(os.getcwd(), train=False, download=False, transform=transform) # train/val split mnist_train, mnist_val = random_split(mnist_train, [55000, 5000]) diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index def47810504d69..b3e0b905f27f43 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -663,7 +663,7 @@ It is highly recommended to use Sharded Training in multi-GPU environments where A technical note: as batch size scales, storing activations for the backwards pass becomes the bottleneck in training. As a result, sharding optimizer state and gradients becomes less impactful. Work within the future will bring optional sharding to activations and model parameters to reduce memory further, but come with a speed cost. -To use Sharded Training, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]``. +To use Sharded Training, you need to first install FairScale using the command below. .. 
code-block:: bash diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb index 037b24e4ddd9dc..d52af84a76d975 100644 --- a/notebooks/04-transformers-text-classification.ipynb +++ b/notebooks/04-transformers-text-classification.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb index 6413e8239bb2e5..da044a9c9b5c6e 100644 --- a/notebooks/05-trainer-flags-overview.ipynb +++ b/notebooks/05-trainer-flags-overview.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/pyproject.toml b/pyproject.toml index 760421a56ece8c..01e416aa51d8b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|buil [tool.isort] known_first_party = [ - "bencharmks", + "benchmarks", "docs", "pl_examples", "pytorch_lightning", @@ -52,3 +52,5 @@ skip_glob = [ ] profile = "black" line_length = 120 +force_sort_within_sections = "True" +order_by_type = "False" diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 408d95a72dc470..222263ea2d3853 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.1.1rc0' +__version__ = '1.1.1' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index a0545a4604aece..b9a71ed2717441 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -48,3 +48,6 @@ def model_to_device(self, model, process_idx): def get_device_ids(self): device_ids = None return device_ids + + def init_device(self, process_idx): + pass diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index ec4c087998614e..b257884e34aef5 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -126,6 +126,7 @@ def ddp_train(self, process_idx, model): """ # determine which process we are and world size self.set_world_ranks(process_idx) + self.init_device(process_idx) # toggle prog bar if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 88f1881643c9aa..4125a924cb2c59 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -19,6 +19,7 @@ Monitor a metric and stop training when it stops improving. 
""" +import numbers import os import numpy as np @@ -26,7 +27,8 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn, TPU_AVAILABLE +from pytorch_lightning.metrics.metric import Metric +from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_info, rank_zero_warn class EarlyStopping(Callback): @@ -201,8 +203,11 @@ def _run_early_stopping_check(self, trainer, pl_module): # when in dev debugging trainer.dev_debugger.track_early_stopping_history(self, current) - if not isinstance(current, torch.Tensor): - current = torch.tensor(current, device=pl_module.device) + if current is not None: + if isinstance(current, Metric): + current = current.compute() + elif isinstance(current, numbers.Number): + current = torch.tensor(current, device=pl_module.device, dtype=torch.float) if trainer.use_tpu and TPU_AVAILABLE: current = current.cpu() diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index 081aec45067cf1..9799e0d3298d35 100755 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -157,7 +157,7 @@ def _find_names(self, lr_schedulers) -> List[str]: names = [] for scheduler in lr_schedulers: sch = scheduler['scheduler'] - if 'name' in scheduler: + if scheduler['name'] is not None: name = scheduler['name'] else: opt_name = 'lr-' + sch.optimizer.__class__.__name__ diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 1354f7f5056b39..82df32ce3996c2 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -20,6 +20,7 @@ """ +import numbers import os import re from copy import deepcopy @@ -32,8 +33,9 @@ from pytorch_lightning import _logger as log from pytorch_lightning.callbacks.base import Callback -from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn +from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.plugins.rpc_plugin import RPCPlugin +from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -240,17 +242,14 @@ def save_checkpoint(self, trainer, pl_module): # what can be monitored monitor_candidates = self._monitor_candidates(trainer) - # ie: path/val_loss=0.5.ckpt - filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step) - # callback supports multiple simultaneous modes # here we call each mode sequentially # Mode 1: save all checkpoints OR only the top k if self.save_top_k: - self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath) + self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates) # Mode 2: save the last checkpoint - self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath) + self._save_last_checkpoint(trainer, pl_module, monitor_candidates) def __validate_init_configuration(self): if self.save_top_k is not None and self.save_top_k < -1: @@ -444,6 +443,7 @@ def format_checkpoint_name( ) if ver is not None: filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}")) + ckpt_name = f"{filename}{self.FILE_EXTENSION}" return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name @@ -515,13 +515,20 @@ def 
_validate_monitor_key(self, trainer): ) raise MisconfigurationException(m) - def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: Dict[str, Any], epoch: int, step: int): + def _get_metric_interpolated_filepath_name( + self, + ckpt_name_metrics: Dict[str, Any], + epoch: int, + step: int, + del_filepath: Optional[str] = None + ) -> str: filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics) + version_cnt = 0 - while self._fs.exists(filepath): + while self._fs.exists(filepath) and filepath != del_filepath: filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt) - # this epoch called before version_cnt += 1 + return filepath def _monitor_candidates(self, trainer): @@ -531,13 +538,11 @@ def _monitor_candidates(self, trainer): ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch}) return ckpt_name_metrics - def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath): + def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics): should_save_last = self.monitor is None or self.save_last if not should_save_last: return - last_filepath = filepath - # when user ALSO asked for the 'last.ckpt' change the name if self.save_last: last_filepath = self._format_checkpoint_name( @@ -548,6 +553,10 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath) prefix=self.prefix ) last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}") + else: + last_filepath = self._get_metric_interpolated_filepath_name( + ckpt_name_metrics, trainer.current_epoch, trainer.global_step + ) accelerator_backend = trainer.accelerator_backend @@ -568,16 +577,19 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath) if self.monitor is None: self.best_model_path = self.last_model_path - def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath): + def _save_top_k_checkpoints(self, trainer, pl_module, metrics): current = metrics.get(self.monitor) epoch = metrics.get("epoch") step = metrics.get("step") - if not isinstance(current, torch.Tensor) and current is not None: - current = torch.tensor(current, device=pl_module.device) + if current is not None: + if isinstance(current, Metric): + current = current.compute() + elif isinstance(current, numbers.Number): + current = torch.tensor(current, device=pl_module.device, dtype=torch.float) if self.check_monitor_top_k(current): - self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module) + self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics) elif self.verbose: rank_zero_info( f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}" @@ -588,25 +600,26 @@ def _is_valid_monitor_key(self, metrics): def _update_best_and_save( self, - filepath: str, current: torch.Tensor, epoch: int, step: int, trainer, pl_module, + ckpt_name_metrics ): k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k - del_list = [] + del_filepath = None if len(self.best_k_models) == k and k > 0: - delpath = self.kth_best_model_path - self.best_k_models.pop(self.kth_best_model_path) - del_list.append(delpath) + del_filepath = self.kth_best_model_path + self.best_k_models.pop(del_filepath) # do not save nan, replace with +/- inf if torch.isnan(current): current = torch.tensor(float('inf' if self.mode == "min" else '-inf')) + filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, 
del_filepath) + # save the current score self.current_score = current self.best_k_models[filepath] = current @@ -630,9 +643,8 @@ def _update_best_and_save( ) self._save_model(filepath, trainer, pl_module) - for cur_path in del_list: - if cur_path != filepath: - self._del_model(cur_path) + if del_filepath is not None and filepath != del_filepath: + self._del_model(del_filepath) def to_yaml(self, filepath: Optional[Union[str, Path]] = None): """ diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 57979b73f2cb6d..f24a4ce8beb8ac 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -14,7 +14,7 @@ """Various hooks to be used in the Lightning code.""" -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import torch from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn @@ -501,7 +501,7 @@ def val_dataloader(self): will have an argument ``dataloader_idx`` which matches the order here. """ - def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any: + def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any: """ Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors wrapped in a custom data structure. @@ -549,6 +549,7 @@ def transfer_batch_to_device(self, batch, device) - :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device` - :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection` """ + device = device or self.device return move_data_to_device(batch, device) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index ef05ce69c1828b..ab66435a2935db 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -22,6 +22,7 @@ import tempfile from abc import ABC from argparse import Namespace +from pathlib import Path from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union import torch @@ -278,6 +279,7 @@ def log( sync_dist_group, accelerator.sync_tensor, self._current_dataloader_idx, + self.device, ) def log_dict( @@ -989,7 +991,7 @@ def configure_optimizers( - List or Tuple - List of optimizers. - Two lists - The first list has multiple optimizers, the second a list of LR schedulers (or lr_dict). - Dictionary, with an 'optimizer' key, and (optionally) a 'lr_scheduler' - key which value is a single LR scheduler or lr_dict. + key whose value is a single LR scheduler or lr_dict. - Tuple of dictionaries as described, with an optional 'frequency' key. - None - Fit will run without any optimizer. @@ -1001,21 +1003,22 @@ def configure_optimizers( In the former case, all optimizers will operate on the given batch in each optimization step. In the latter, only one optimizer will operate on the given batch at every step. - The lr_dict is a dictionary which contains scheduler and its associated configuration. - It has five keys. The default configuration is shown below. + The lr_dict is a dictionary which contains the scheduler and its associated configuration. + The default configuration is shown below. .. 
code-block:: python { - 'scheduler': lr_scheduler, # The LR schduler + 'scheduler': lr_scheduler, # The LR scheduler instance (required) 'interval': 'epoch', # The unit of the scheduler's step size 'frequency': 1, # The frequency of the scheduler 'reduce_on_plateau': False, # For ReduceLROnPlateau scheduler 'monitor': 'val_loss', # Metric for ReduceLROnPlateau to monitor - 'strict': True # Whether to crash the training if `monitor` is not found + 'strict': True, # Whether to crash the training if `monitor` is not found + 'name': None, # Custom name for LearningRateMonitor to use } - If user only provides LR schedulers, then their configuration will set to default as shown above. + Only the ``scheduler`` key is required, the rest will be set to the defaults above. Examples: .. code-block:: python @@ -1390,12 +1393,15 @@ def get_progress_bar_dict(self): """ # call .item() only once but store elements without graphs running_train_loss = self.trainer.train_loop.running_loss.mean() - avg_training_loss = ( - running_train_loss.cpu().item() - if running_train_loss is not None - else float("NaN") - ) - tqdm_dict = {"loss": "{:.3g}".format(avg_training_loss)} + avg_training_loss = None + if running_train_loss is not None: + avg_training_loss = running_train_loss.cpu().item() + elif self.trainer.train_loop.automatic_optimization: + avg_training_loss = float('NaN') + + tqdm_dict = {} + if avg_training_loss is not None: + tqdm_dict["loss"] = f"{avg_training_loss:.3g}" if self.trainer.truncated_bptt_steps is not None: tqdm_dict["split_idx"] = self.trainer.split_idx @@ -1530,12 +1536,19 @@ def _set_hparams(self, hp: Union[dict, Namespace, str]) -> None: else: self._hparams = hp - def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwargs): - """Saves the model in ONNX format + @torch.no_grad() + def to_onnx( + self, + file_path: Union[str, Path], + input_sample: Optional[Any] = None, + **kwargs, + ): + """ + Saves the model in ONNX format Args: - file_path: The path of the file the model should be saved to. - input_sample: A sample of an input tensor for tracing. + file_path: The path of the file the onnx model should be saved to. + input_sample: An input for tracing. Default: None (Use self.example_input_array) **kwargs: Will be passed to torch.onnx.export function. Example: @@ -1554,31 +1567,32 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg ... os.path.isfile(tmpfile.name) True """ + mode = self.training - if isinstance(input_sample, Tensor): - input_data = input_sample - elif self.example_input_array is not None: - input_data = self.example_input_array - else: - if input_sample is not None: + if input_sample is None: + if self.example_input_array is None: raise ValueError( - f"Received `input_sample` of type {type(input_sample)}. Expected type is `Tensor`" + "Could not export to ONNX since neither `input_sample` nor" + " `model.example_input_array` attribute is set." ) - raise ValueError( - "Could not export to ONNX since neither `input_sample` nor" - " `model.example_input_array` attribute is set." 
- ) - input_data = input_data.to(self.device) + input_sample = self.example_input_array + + input_sample = self.transfer_batch_to_device(input_sample) + if "example_outputs" not in kwargs: self.eval() - with torch.no_grad(): - kwargs["example_outputs"] = self(input_data) + kwargs["example_outputs"] = self(input_sample) - torch.onnx.export(self, input_data, file_path, **kwargs) + torch.onnx.export(self, input_sample, file_path, **kwargs) + self.train(mode) + @torch.no_grad() def to_torchscript( - self, file_path: Optional[str] = None, method: Optional[str] = 'script', - example_inputs: Optional[Union[torch.Tensor, Tuple[torch.Tensor]]] = None, **kwargs + self, + file_path: Optional[Union[str, Path]] = None, + method: Optional[str] = 'script', + example_inputs: Optional[Any] = None, + **kwargs, ) -> Union[ScriptModule, Dict[str, ScriptModule]]: """ By default compiles the whole model to a :class:`~torch.jit.ScriptModule`. @@ -1590,7 +1604,7 @@ def to_torchscript( Args: file_path: Path where to save the torchscript. Default: None (no file saved). method: Whether to use TorchScript's script or trace method. Default: 'script' - example_inputs: Tensor to be used to do tracing when method is set to 'trace'. + example_inputs: An input to be used to do tracing when method is set to 'trace'. Default: None (Use self.example_input_array) **kwargs: Additional arguments that will be passed to the :func:`torch.jit.script` or :func:`torch.jit.trace` function. @@ -1624,21 +1638,27 @@ def to_torchscript( This LightningModule as a torchscript, regardless of whether file_path is defined or not. """ - mode = self.training - with torch.no_grad(): - if method == 'script': - torchscript_module = torch.jit.script(self.eval(), **kwargs) - elif method == 'trace': - # if no example inputs are provided, try to see if model has example_input_array set - if example_inputs is None: - example_inputs = self.example_input_array - # automatically send example inputs to the right device and use trace - example_inputs = self.transfer_batch_to_device(example_inputs, device=self.device) - torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs) - else: - raise ValueError(f"The 'method' parameter only supports 'script' or 'trace', but value given was:" - f"{method}") + + if method == 'script': + torchscript_module = torch.jit.script(self.eval(), **kwargs) + elif method == 'trace': + # if no example inputs are provided, try to see if model has example_input_array set + if example_inputs is None: + if self.example_input_array is None: + raise ValueError( + 'Choosing method=`trace` requires either `example_inputs`' + ' or `model.example_input_array` to be defined' + ) + example_inputs = self.example_input_array + + # automatically send example inputs to the right device and use trace + example_inputs = self.transfer_batch_to_device(example_inputs) + torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs) + else: + raise ValueError("The 'method' parameter only supports 'script' or 'trace'," + f" but value given was: {method}") + self.train(mode) if file_path is not None: diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 142fe9048cb0ea..b6112a68b4e9b8 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -15,15 +15,15 @@ """[Train, Eval]Result for easier logging, checkpointing, early stopping, epoch-wise reduction.""" import numbers +import os from copy import copy 
-from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any, List, Tuple, Iterable +from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence, Tuple, Union import torch from torch import Tensor -import os -from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.metrics import Metric +from pytorch_lightning.utilities.distributed import sync_ddp_if_available class Result(Dict): @@ -128,6 +128,7 @@ def log( sync_dist_group: Optional[Any] = None, sync_fn: Callable = None, dataloader_idx: Optional[int] = None, + device: torch.device = None, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): @@ -138,7 +139,10 @@ def log( if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized() # TODO: Find a way to make the reduction only once, so we don't need to clone. - value = value.clone() if is_dist_initialized else value + if is_dist_initialized and isinstance(value, torch.Tensor): + value = value.clone() + else: + value = torch.tensor(value, device=device, dtype=torch.float) value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: @@ -367,7 +371,10 @@ def get_forked_metrics(self, add_dataloader_idx=False): dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) if options['forked']: - result[dl_key] = self[k] + if isinstance(self[k], Metric): + result[dl_key] = self[k].compute().detach() + else: + result[dl_key] = self[k] return result diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py index d6147b00463b35..fadfd000ebbe1e 100755 --- a/pytorch_lightning/metrics/classification/f_beta.py +++ b/pytorch_lightning/metrics/classification/f_beta.py @@ -52,11 +52,11 @@ class FBeta(Metric): Threshold value for binary or multi-label logits. default: 0.5 average: - * `'micro'` computes metric globally - * `'macro'` computes metric for each class and uniformly averages them - * `'weighted'` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - * `None` computes and returns the metric per class + - ``'micro'`` computes metric globally + - ``'macro'`` computes metric for each class and uniformly averages them + - ``'weighted'`` computes metric for each class and does a weighted-average, + where each class is weighted by their support (accounts for class imbalance) + - ``'none'`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. compute_on_step: @@ -185,11 +185,11 @@ class F1(FBeta): Threshold value for binary or multi-label logits. 
default: 0.5 average: - * `'micro'` computes metric globally - * `'macro'` computes metric for each class and uniformly averages them - * `'weighted'` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - * `None` computes and returns the metric per class + - ``'micro'`` computes metric globally + - ``'macro'`` computes metric for each class and uniformly averages them + - ``'weighted'`` computes metric for each class and does a weighted-average, + where each class is weighted by their support (accounts for class imbalance) + - ``'none'`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. compute_on_step: @@ -212,7 +212,6 @@ class F1(FBeta): def __init__( self, num_classes: int = 1, - beta: float = 1.0, threshold: float = 0.5, average: str = "micro", multilabel: bool = False, diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py index 3f0a7a04493257..2b0ba194d56f02 100755 --- a/pytorch_lightning/metrics/functional/f_beta.py +++ b/pytorch_lightning/metrics/functional/f_beta.py @@ -83,11 +83,11 @@ def fbeta( Threshold value for binary or multi-label logits. default: 0.5 average: - * `'micro'` computes metric globally - * `'macro'` computes metric for each class and uniformly averages them - * `'weighted'` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - * `None` computes and returns the metric per class + - ``'micro'`` computes metric globally + - ``'macro'`` computes metric for each class and uniformly averages them + - ``'weighted'`` computes metric for each class and does a weighted-average, + where each class is weighted by their support (accounts for class imbalance) + - ``'none'`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. @@ -110,7 +110,6 @@ def f1( preds: torch.Tensor, target: torch.Tensor, num_classes: int, - beta: float = 1.0, threshold: float = 0.5, average: str = "micro", multilabel: bool = False @@ -136,11 +135,11 @@ def f1( Threshold value for binary or multi-label logits. default: 0.5 average: - * `'micro'` computes metric globally - * `'macro'` computes metric for each class and uniformly averages them - * `'weighted'` computes metric for each class and does a weighted-average, - where each class is weighted by their support (accounts for class imbalance) - * `None` computes and returns the metric per class + - ``'micro'`` computes metric globally + - ``'macro'`` computes metric for each class and uniformly averages them + - ``'weighted'`` computes metric for each class and does a weighted-average, + where each class is weighted by their support (accounts for class imbalance) + - ``'none'`` computes and returns the metric per class multilabel: If predictions are from multilabel classification. diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py index 3842bbe50cfc5d..29ac3b814b3c2e 100644 --- a/pytorch_lightning/setup_tools.py +++ b/pytorch_lightning/setup_tools.py @@ -14,12 +14,12 @@ # limitations under the License. 
import os import re -import warnings from typing import Iterable, List from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen +import warnings -from pytorch_lightning import PROJECT_ROOT, __homepage__, __version__ +from pytorch_lightning import __homepage__, __version__, PROJECT_ROOT _PATH_BADGES = os.path.join('.', 'docs', 'source', '_images', 'badges') # badge to download diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 2311cc767de2d4..429bddd88b77e9 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -13,14 +13,16 @@ # limitations under the License. import os +from pathlib import Path import re +from typing import Union, Optional import torch import pytorch_lightning from pytorch_lightning import _logger as log from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import APEX_AVAILABLE, AMPType, OMEGACONF_AVAILABLE, rank_zero_warn +from pytorch_lightning.utilities import APEX_AVAILABLE, AMPType, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS @@ -52,16 +54,17 @@ def restore_weights(self, model: LightningModule): if self.trainer.on_gpu: torch.cuda.empty_cache() - # if script called from hpc resubmit, load weights - did_restore_hpc_weights = self.restore_hpc_weights_if_needed(model) + # 1. Attempt to restore states from HPC checkpoint + dir_path_hpc = str(self.trainer.weights_save_path) + max_suffix = self.max_ckpt_in_folder(dir_path_hpc, "hpc_ckpt_") + if max_suffix is not None: + checkpoint_path = f'{dir_path_hpc}/hpc_ckpt_{max_suffix}.ckpt' + self.hpc_load(checkpoint_path, self.trainer.on_gpu) + rank_zero_info(f'restored hpc model from: {checkpoint_path}') - # clear cache after restore - if self.trainer.on_gpu: - torch.cuda.empty_cache() - - if not did_restore_hpc_weights: - if self.trainer.resume_from_checkpoint is not None: - self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu) + # 2. Attempt to restore states from `resume_from_checkpoint` file + elif self.trainer.resume_from_checkpoint is not None: + self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu) # wait for all to catch up self.trainer.accelerator_backend.barrier('TrainerIOMixin.restore_weights') @@ -72,24 +75,14 @@ def restore_weights(self, model: LightningModule): def restore(self, checkpoint_path: str, on_gpu: bool): """ - Load model/training states from the checkpoint file through file-read and state-restore. - Also restores all training state like: - - epoch - - callbacks - - schedulers - - optimizer - In detail, check return value description of `dump_checkpoint` + Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore. + All restored states are listed in return value description of `dump_checkpoint`. 
""" - # if on_gpu: - # checkpoint = torch.load(checkpoint_path) - # else: - # load on CPU first - # read a checkpoint dictionary object from the checkpoint file at `checkpoint_path` + # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path` checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) - # restore states from the checkpoint dictionary object - # load model state + # acquire the model model = self.trainer.get_model() # restore model and datamodule state @@ -106,14 +99,14 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None: Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object """ - # give the datamodule a chance to load something + # restore datamodule states if self.trainer.datamodule is not None: self.trainer.datamodule.on_load_checkpoint(checkpoint) - # give model a chance to restore something + # hook: give user access to checkpoint if needed. model.on_load_checkpoint(checkpoint) - # restore the state_dict on the model + # restore model state_dict model.load_state_dict(checkpoint['state_dict']) def restore_training_state(self, checkpoint): @@ -187,23 +180,6 @@ def restore_training_state(self, checkpoint): for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers): scheduler['scheduler'].load_state_dict(lrs_state) - def restore_hpc_weights_if_needed(self, model: LightningModule): - """If there is a set of hpc weights, use as signal to restore model.""" - did_restore = False - - # look for hpc weights - folderpath = str(self.trainer.weights_save_path) - fs = get_filesystem(folderpath) - if fs.exists(folderpath): - files = [os.path.basename(f['name']) for f in fs.listdir(folderpath)] - hpc_weight_paths = [x for x in files if 'hpc_ckpt' in x] - - # if hpc weights exist restore model - if len(hpc_weight_paths) > 0: - self.hpc_load(folderpath, self.trainer.on_gpu) - did_restore = True - return did_restore - # ---------------------------------- # PRIVATE OPS # ---------------------------------- @@ -216,7 +192,8 @@ def hpc_save(self, folderpath: str, logger): # save logger to make sure we get all the metrics logger.save() - ckpt_number = self.max_ckpt_in_folder(folderpath) + 1 + max_suffix = self.max_ckpt_in_folder(folderpath) + ckpt_number = (max_suffix if max_suffix is not None else 0) + 1 fs.makedirs(folderpath, exist_ok=True) filepath = os.path.join(folderpath, f'hpc_ckpt_{ckpt_number}.ckpt') @@ -333,36 +310,52 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: return checkpoint - def hpc_load(self, folderpath, on_gpu): - filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath)) + def hpc_load(self, checkpoint_path: str, on_gpu: bool): + """ + Load model/training states from a 'PyTorch-Lightning checkpoint' file for hpc. + All restored states are listed in return value description of `dump_checkpoint`. 
+ """ - # load on CPU first - checkpoint = pl_load(filepath, map_location=lambda storage, loc: storage) + # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path` + checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) - # load model state + # acquire the model model = self.trainer.get_model() - # restore states from 'PyTorch-Lightning checkpoint' dictionary object + # restore model and datamodule state self.restore_model_state(model, checkpoint) if self.trainer.root_gpu is not None: model.cuda(self.trainer.root_gpu) - # load training state (affects trainer only) + # restore training state self.restore_training_state(checkpoint) - # call model hook + # call hpc specific hook model.on_hpc_load(checkpoint) - log.info(f'restored hpc model from: {filepath}') + def max_ckpt_in_folder(self, dir_path: Union[str, Path], name_key: str = 'ckpt_') -> Optional[int]: + """List up files in `dir_path` with name_key, then yield maximum suffix number. + + Args: + dir_path: path of directory which may contain files whose name include `name_key` + + Returns: + None if no-corresponding-file else maximum suffix number + """ + + # check directory existence + fs = get_filesystem(dir_path) + if not fs.exists(dir_path): + return None - def max_ckpt_in_folder(self, path, name_key='ckpt_'): - fs = get_filesystem(path) - files = [os.path.basename(f["name"]) for f in fs.listdir(path)] + # check corresponding file existence + files = [os.path.basename(f["name"]) for f in fs.listdir(dir_path)] files = [x for x in files if name_key in x] if len(files) == 0: - return 0 + return None + # extract suffix number ckpt_vs = [] for name in files: name = name.split(name_key)[-1] @@ -371,6 +364,13 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'): return max(ckpt_vs) + def get_max_ckpt_path_from_folder(self, folder_path: Union[str, Path]) -> str: + """Get path of maximum-epoch checkpoint in the folder.""" + + max_suffix = self.max_ckpt_in_folder(folder_path) + ckpt_number = max_suffix if max_suffix is not None else 0 + return f'{folder_path}/hpc_ckpt_{ckpt_number}.ckpt' + def save_checkpoint(self, filepath, weights_only: bool = False): """Save model/training states as a checkpoint file through state-dump and file-write. 
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 28025859814cc2..6d206f3dd929ed 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -91,11 +91,13 @@ def check_dataloader_idx(self, result: Result) -> bool: random_key = list(result.keys())[-1] return result["meta"][random_key]["dataloader_idx"] is not None - def get_latest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict: + def get_latest_from_func_name(self, latest_result_opt, func_name: str, *args, **kwargs) -> Dict: results = {} - add_dataloader_idx = self.check_dataloader_idx(latest_result) - func = getattr(latest_result, func_name) - results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) + for opt_idx in latest_result_opt: + latest_result = latest_result_opt[opt_idx] + add_dataloader_idx = self.check_dataloader_idx(latest_result) + func = getattr(latest_result, func_name) + results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) return results def run_latest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]: @@ -156,6 +158,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio assert isinstance(result, Result) if dataloader_idx is None: dataloader_idx = 0 + if extra_info is None: extra_info = {} @@ -166,6 +169,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio if dataloader_idx not in self._internals: self._internals[dataloader_idx] = {} self._internals_reduced[dataloader_idx] = defaultdict(dict) + self._latest_ref[dataloader_idx] = {} # extract infos opt_idx = extra_info["opt_idx"] @@ -173,7 +177,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio self._append_to_structure(self._internals[dataloader_idx], opt_idx, batch_idx, result) - self._latest_ref[dataloader_idx] = result + self._latest_ref[dataloader_idx][opt_idx] = result # [dataloader_idx] is a list else: @@ -181,7 +185,11 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio self._internals.setdefault(dataloader_idx, []) self._internals[dataloader_idx].append(result) - self._latest_ref[dataloader_idx] = result + if dataloader_idx not in self._latest_ref: + self._latest_ref[dataloader_idx] = {} + self._latest_ref[dataloader_idx][0] = {} + + self._latest_ref[dataloader_idx][0] = result def auto_reduce_results_on_epoch_end(self) -> None: """ @@ -206,13 +214,9 @@ def auto_reduce_results_on_epoch_end(self) -> None: # TODO: How to start training in middle of epoch opt_outputs = epoch_metrics[opt_idx] - num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1 - assert num_batch_idx >= 0 - batch_indexes = self._internals[dl_idx][num_opt_idx].keys() - # reduce across time first time_reduced_outputs = [] - for batch_idx in batch_indexes: + for batch_idx in opt_outputs.keys(): tbptt_outs = opt_outputs[batch_idx] tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) if len(tbptt_outs) > 1: diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py index 6f3ba80bd0734f..479d4017202611 100644 --- a/pytorch_lightning/trainer/optimizers.py +++ b/pytorch_lightning/trainer/optimizers.py @@ -94,6 +94,7 @@ def configure_schedulers(self, schedulers: list, monitor: Optional[str] = 
None): lr_schedulers = [] default_config = { 'scheduler': None, + 'name': None, # no custom name 'interval': 'epoch', # after epoch is over 'frequency': 1, # every epoch/batch 'reduce_on_plateau': False, # most often not ReduceLROnPlateau scheduler diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index 57747be0d51fb5..b2ba92846b241a 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -50,7 +50,7 @@ def __init__(self, window_length: int): def reset(self) -> None: """Empty the accumulator.""" - self = TensorRunningAccum(self.window_length) + self.__init__(self.window_length) def last(self): """Get the last added element.""" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 35da90625adefe..5a837956bc4ce2 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -133,7 +133,7 @@ def __init__( distributed_backend: Optional[str] = None, automatic_optimization: Optional[bool] = None, move_metrics_to_cpu: bool = False, - enable_pl_optimizer: bool = True, + enable_pl_optimizer: bool = False, ): r""" Customize every aspect of training via flags diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index 9724f05247c009..c315c6633b6fb3 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -15,14 +15,14 @@ import os import warnings from functools import wraps +from typing import Any, Optional, Union import torch + from pytorch_lightning import _logger as log -from typing import Union, Optional, Any if torch.distributed.is_available(): - from torch.distributed import ReduceOp - from torch.distributed import group + from torch.distributed import ReduceOp, group else: class ReduceOp: SUM = None @@ -145,15 +145,14 @@ def sync_ddp( if group is None: group = torch.distributed.group.WORLD - if reduce_op is None: - reduce_op = torch.distributed.ReduceOp.SUM - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = torch.distributed.ReduceOp.SUM + op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM + + if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"): divide_by_world_size = True # sync all processes before reduction torch.distributed.barrier(group=group) - torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) + torch.distributed.all_reduce(result, op=op, group=group, async_op=False) if divide_by_world_size: result = result / torch.distributed.get_world_size(group) diff --git a/tests/__init__.py b/tests/__init__.py index 981d685430da99..1bb81c466e6eb7 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import os import numpy as np diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py index d8847d592e1de6..6f6b5f858ff175 100644 --- a/tests/base/deterministic_model.py +++ b/tests/base/deterministic_model.py @@ -15,7 +15,6 @@ from torch import nn from torch.utils.data import Dataset, DataLoader -from pytorch_lightning.core.step_result import TrainResult, EvalResult from pytorch_lightning.core.lightning import LightningModule @@ -111,235 +110,6 @@ def training_epoch_end_scalar(self, outputs): assert batch_out.grad_fn is None assert isinstance(batch_out, torch.Tensor) - def training_step_no_default_callbacks_for_train_loop(self, batch, batch_idx): - """ - Early stop and checkpoint only on these values - """ - acc = self.step(batch, batch_idx) - result = TrainResult(minimize=acc) - assert 'early_step_on' not in result - assert 'checkpoint_on' in result - return result - - def training_step_no_callbacks_result_obj(self, batch, batch_idx): - """ - Early stop and checkpoint only on these values - """ - acc = self.step(batch, batch_idx) - result = TrainResult(minimize=acc, checkpoint_on=False) - assert 'early_step_on' not in result - assert 'checkpoint_on' not in result - return result - - def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx): - """ - Early stop and checkpoint only on these values - """ - acc = self.step(batch, batch_idx) - - self.assert_backward = False - losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20] - idx = self.current_epoch - loss = acc + losses[idx] - result = TrainResult(minimize=loss, early_stop_on=loss, checkpoint_on=loss) - return result - - def training_step_result_log_step_only(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - result = TrainResult(minimize=acc) - - # step only metrics - result.log(f'step_log_and_pbar_acc1_b{batch_idx}', torch.tensor(11).type_as(acc), prog_bar=True) - result.log(f'step_log_acc2_b{batch_idx}', torch.tensor(12).type_as(acc)) - result.log(f'step_pbar_acc3_b{batch_idx}', torch.tensor(13).type_as(acc), logger=False, prog_bar=True) - - self.training_step_called = True - return result - - def training_step_result_log_epoch_only(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - result = TrainResult(minimize=acc) - - result.log(f'epoch_log_and_pbar_acc1_e{self.current_epoch}', torch.tensor(14).type_as(acc), - on_epoch=True, prog_bar=True, on_step=False) - result.log(f'epoch_log_acc2_e{self.current_epoch}', torch.tensor(15).type_as(acc), - on_epoch=True, on_step=False) - result.log(f'epoch_pbar_acc3_e{self.current_epoch}', torch.tensor(16).type_as(acc), - on_epoch=True, logger=False, prog_bar=True, on_step=False) - - self.training_step_called = True - return result - - def training_step_result_log_epoch_and_step(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - result = TrainResult(minimize=acc) - - val_1 = (5 + batch_idx) * (self.current_epoch + 1) - val_2 = (6 + batch_idx) * (self.current_epoch + 1) - val_3 = (7 + batch_idx) * (self.current_epoch + 1) - result.log('step_epoch_log_and_pbar_acc1', torch.tensor(val_1).type_as(acc), - on_epoch=True, prog_bar=True) - result.log('step_epoch_log_acc2', torch.tensor(val_2).type_as(acc), - on_epoch=True) - result.log('step_epoch_pbar_acc3', torch.tensor(val_3).type_as(acc), - on_epoch=True, logger=False, prog_bar=True) - - self.training_step_called = True - return result - - def training_epoch_end_return_for_log_epoch_and_step(self, result): - """ - There should be an array of scalars without graphs that 
are all 171 (4 of them) - """ - self.training_epoch_end_called = True - - if self.use_dp or self.use_ddp2: - pass - else: - # only saw 4 batches - assert isinstance(result, TrainResult) - - result.step_epoch_log_acc2 = result.step_epoch_log_acc2_step.prod() - result.step_epoch_pbar_acc3 = result.step_epoch_pbar_acc3_step.prod() - result.step_epoch_log_and_pbar_acc1 = result.step_epoch_log_and_pbar_acc1_step.prod() - result.minimize = result.minimize.mean() - result.checkpoint_on = result.checkpoint_on.mean() - - result.step_epoch_log_and_pbar_acc1_step = result.step_epoch_log_and_pbar_acc1_step.prod() - result.step_epoch_log_and_pbar_acc1_epoch = result.step_epoch_log_and_pbar_acc1_epoch.prod() - result.step_epoch_log_acc2_step = result.step_epoch_log_acc2_step.prod() - result.step_epoch_log_acc2_epoch = result.step_epoch_log_acc2_epoch.prod() - result.step_epoch_pbar_acc3_step = result.step_epoch_pbar_acc3_step.prod() - result.step_epoch_pbar_acc3_epoch = result.step_epoch_pbar_acc3_epoch.prod() - result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.step_epoch_log_acc2_epoch), - logger=True, on_epoch=True) - result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.step_epoch_log_acc2_epoch), - logger=False, prog_bar=True, on_epoch=True) - result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.step_epoch_log_acc2_epoch), - logger=True, prog_bar=True, on_epoch=True) - return result - - # -------------------------- - # EvalResults - # -------------------------- - def validation_step_result_callbacks(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - - self.assert_backward = False - losses = [20, 19, 20, 21, 22, 23] - idx = self.current_epoch - loss = acc + losses[idx] - result = EvalResult(early_stop_on=loss, checkpoint_on=loss) - - self.validation_step_called = True - return result - - def validation_step_result_no_callbacks(self, batch, batch_idx): - acc = self.step(batch, batch_idx) - - self.assert_backward = False - losses = [20, 19, 20, 21, 22, 23, 50, 50, 50, 50, 50, 50] - idx = self.current_epoch - loss = acc + losses[idx] - - result = EvalResult(checkpoint_on=loss) - - self.validation_step_called = True - return result - - def validation_step_result_only_epoch_metrics(self, batch, batch_idx): - """ - Only track epoch level metrics - """ - acc = self.step(batch, batch_idx) - result = EvalResult(checkpoint_on=acc, early_stop_on=acc) - - # step only metrics - result.log('no_val_no_pbar', torch.tensor(11 + batch_idx).type_as(acc), prog_bar=False, logger=False) - result.log('val_step_log_acc', torch.tensor(11 + batch_idx).type_as(acc), prog_bar=False, logger=True) - result.log('val_step_log_pbar_acc', torch.tensor(12 + batch_idx).type_as(acc), prog_bar=True, logger=True) - result.log('val_step_pbar_acc', torch.tensor(13 + batch_idx).type_as(acc), prog_bar=True, logger=False) - - self.validation_step_called = True - return result - - def validation_step_result_only_step_metrics(self, batch, batch_idx): - """ - Only track epoch level metrics - """ - acc = self.step(batch, batch_idx) - result = EvalResult(checkpoint_on=acc, early_stop_on=acc) - - # step only metrics - result.log('no_val_no_pbar', torch.tensor(11 + batch_idx).type_as(acc), - prog_bar=False, logger=False, on_epoch=False, on_step=True) - result.log('val_step_log_acc', torch.tensor(11 + batch_idx).type_as(acc), - prog_bar=False, logger=True, on_epoch=False, on_step=True) - result.log('val_step_log_pbar_acc', torch.tensor(12 + batch_idx).type_as(acc), - prog_bar=True, 
logger=True, on_epoch=False, on_step=True) - result.log('val_step_pbar_acc', torch.tensor(13 + batch_idx).type_as(acc), - prog_bar=True, logger=False, on_epoch=False, on_step=True) - result.log('val_step_batch_idx', torch.tensor(batch_idx).type_as(acc), - prog_bar=True, logger=True, on_epoch=False, on_step=True) - - self.validation_step_called = True - return result - - def validation_step_result_epoch_step_metrics(self, batch, batch_idx): - """ - Only track epoch level metrics - """ - acc = self.step(batch, batch_idx) - result = EvalResult(checkpoint_on=acc, early_stop_on=acc) - - # step only metrics - result.log('no_val_no_pbar', torch.tensor(11 + batch_idx).type_as(acc), - prog_bar=False, logger=False, on_epoch=True, on_step=True) - result.log('val_step_log_acc', torch.tensor(11 + batch_idx).type_as(acc), - prog_bar=False, logger=True, on_epoch=True, on_step=True) - result.log('val_step_log_pbar_acc', torch.tensor(12 + batch_idx).type_as(acc), - prog_bar=True, logger=True, on_epoch=True, on_step=True) - result.log('val_step_pbar_acc', torch.tensor(13 + batch_idx).type_as(acc), - prog_bar=True, logger=False, on_epoch=True, on_step=True) - result.log('val_step_batch_idx', torch.tensor(batch_idx).type_as(acc), - prog_bar=True, logger=True, on_epoch=True, on_step=True) - - self.validation_step_called = True - return result - - def validation_step_for_epoch_end_result(self, batch, batch_idx): - """ - EvalResult flows to epoch end (without step_end) - """ - acc = self.step(batch, batch_idx) - result = EvalResult(checkpoint_on=acc, early_stop_on=acc) - - # step only metrics - result.log('val_step_metric', torch.tensor(batch_idx).type_as(acc), - prog_bar=True, logger=True, on_epoch=True, on_step=False) - result.log('batch_idx', torch.tensor(batch_idx).type_as(acc), - prog_bar=True, logger=True, on_epoch=True, on_step=False) - - self.validation_step_called = True - return result - - def validation_epoch_end_result(self, result): - self.validation_epoch_end_called = True - - if self.trainer.running_sanity_check: - assert len(result.batch_idx) == 2 - else: - assert len(result.batch_idx) == self.trainer.limit_val_batches - - expected_val = result.val_step_metric.sum() / len(result.batch_idx) - result.val_step_metric = result.val_step_metric.mean() - result.batch_idx = result.batch_idx.mean() - assert result.val_step_metric == expected_val - - result.log('val_epoch_end_metric', torch.tensor(189).type_as(result.val_step_metric), prog_bar=True) - - return result - # -------------------------- # dictionary returns # -------------------------- diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py index 18bb0c4d72715b..24535dc67da8e8 100644 --- a/tests/base/develop_pipelines.py +++ b/tests/base/develop_pipelines.py @@ -86,9 +86,11 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \ trainer.init_optimizers(pretrained_model) - # test HPC loading / saving + # test HPC saving trainer.checkpoint_connector.hpc_save(save_dir, logger) - trainer.checkpoint_connector.hpc_load(save_dir, on_gpu=on_gpu) + # test HPC loading + checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(save_dir) + trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu) def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50): diff --git a/tests/base/model_test_steps.py b/tests/base/model_test_steps.py index 0010dcdf14a197..440ec4c4c35b47 100644 --- 
a/tests/base/model_test_steps.py +++ b/tests/base/model_test_steps.py @@ -59,38 +59,6 @@ def test_step(self, batch, batch_idx, *args, **kwargs): 'test_dic': {'test_loss_a': loss_test}}) return output - def test_step_result_obj(self, batch, batch_idx, *args, **kwargs): - """ - Default, baseline test_step - :param batch: - :return: - """ - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - - loss_test = self.loss(y, y_hat) - - # acc - labels_hat = torch.argmax(y_hat, dim=1) - test_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - test_acc = torch.tensor(test_acc) - - test_acc = test_acc.type_as(x) - - result = EvalResult() - # alternate possible outputs to test - if batch_idx % 1 == 0: - result.log_dict({'test_loss': loss_test, 'test_acc': test_acc}) - return result - if batch_idx % 2 == 0: - return test_acc - - if batch_idx % 3 == 0: - result.log_dict({'test_loss': loss_test, 'test_acc': test_acc}) - result.test_dic = {'test_loss_a': loss_test} - return result - def test_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs): """ Default, baseline test_step diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py index caec6db9aaa10a..0590f5b7b5cccf 100644 --- a/tests/base/model_train_steps.py +++ b/tests/base/model_train_steps.py @@ -53,25 +53,6 @@ def training_step(self, batch, batch_idx, optimizer_idx=None): ) return output - def training_step_result_obj(self, batch, batch_idx, optimizer_idx=None): - # forward pass - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - - # calculate loss - loss_val = self.loss(y, y_hat) - log_val = loss_val - - # alternate between tensors and scalars for "log" and "progress_bar" - if batch_idx % 2 == 0: - log_val = log_val.item() - - result = TrainResult(loss_val) - result.log('some_val', log_val * log_val, prog_bar=True, logger=False) - result.log('train_some_val', log_val * log_val) - return result - def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None): output = self.training_step(batch, batch_idx, optimizer_idx) if batch_idx == self.test_step_inf_loss: @@ -81,19 +62,6 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None): output /= 0 return output - def training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None): - """ - Full loop flow train step (result obj + dp) - """ - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x.to(self.device)) - loss_val = y_hat.sum() - result = TrainResult(minimize=loss_val) - result.log('train_step_metric', loss_val + 1) - self.training_step_called = True - return result - def training_step_result_obj_dp(self, batch, batch_idx, optimizer_idx=None): # forward pass x, y = batch @@ -136,23 +104,6 @@ def training_epoch_end_full_loop_result_obj_dp(self, result): return result - def eval_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None): - """ - Full loop flow train step (result obj + dp) - """ - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x.to(self.device)) - loss_val = y_hat.sum() - result = EvalResult(checkpoint_on=loss_val, early_stop_on=loss_val) - - eval_name = 'validation' if not self.trainer.testing else 'test' - result.log(f'{eval_name}_step_metric', loss_val + 1, on_step=True) - - setattr(self, f'{eval_name}_step_called', True) - - return result - def eval_step_end_full_loop_result_obj_dp(self, result): """ Full loop flow train step (result obj + dp) @@ -198,20 +149,3 @@ def eval_epoch_end_full_loop_result_obj_dp(self, result): setattr(result, 
f'{eval_name}_step_metric', reduced) return result - - def training_step__using_metrics(self, batch, batch_idx, optimizer_idx=None): - """Lightning calls this inside the training loop""" - # forward pass - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - - # calculate loss - loss_val = self.loss(y, y_hat) - - # call metric - val = self.metric(x, y) - - result = TrainResult(minimize=loss_val) - result.log('metric_val', val) - return result diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py index e23e62dccdaba6..a008a6cecf1102 100644 --- a/tests/base/model_valid_steps.py +++ b/tests/base/model_valid_steps.py @@ -71,25 +71,6 @@ def validation_step_no_monitor(self, batch, batch_idx, *args, **kwargs): }) return output - def validation_step_result_obj(self, batch, batch_idx, *args, **kwargs): - x, y = batch - x = x.view(x.size(0), -1) - y_hat = self(x) - - loss_val = self.loss(y, y_hat) - - # acc - labels_hat = torch.argmax(y_hat, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - val_acc = torch.tensor(val_acc).type_as(x) - - result = EvalResult(checkpoint_on=loss_val, early_stop_on=loss_val) - result.log_dict({ - 'val_loss': loss_val, - 'val_acc': val_acc, - }) - return result - def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs): x, y = batch x = x.view(x.size(0), -1) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index c00c712bb3b13c..070bb4e9f6989b 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -33,6 +33,8 @@ def test_trainer_callback_system(torch_save): limit_train_batches=3, limit_test_batches=2, progress_bar_refresh_rate=0, + # todo: enabled since internally we wrap the model for optimizer step, this should be fixed + enable_pl_optimizer=True ) # no call yet diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index a6783435ed3e27..d29f254df67d0d 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -13,11 +13,11 @@ # limitations under the License. 
import pytest +import tests.base.develop_utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import LearningRateMonitor from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate -import tests.base.develop_utils as tutils +from tests.base import BoringModel, EvalModelTemplate def test_lr_monitor_single_lr(tmpdir): @@ -43,7 +43,7 @@ def test_lr_monitor_single_lr(tmpdir): 'Momentum should not be logged by default' assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of lr schedulers' - assert all([k in ['lr-Adam'] for k in lr_monitor.lrs.keys()]), \ + assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['lr-Adam'], \ 'Names of learning rates not set correctly' @@ -134,7 +134,7 @@ def test_lr_monitor_multi_lrs(tmpdir, logging_interval): assert lr_monitor.lrs, 'No learning rates logged' assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of lr schedulers' - assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_monitor.lrs.keys()]), \ + assert lr_monitor.lr_sch_names == ['lr-Adam', 'lr-Adam-1'], \ 'Names of learning rates not set correctly' if logging_interval == 'step': @@ -167,5 +167,27 @@ def test_lr_monitor_param_groups(tmpdir): assert lr_monitor.lrs, 'No learning rates logged' assert len(lr_monitor.lrs) == 2 * len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of param groups' - assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_monitor.lrs.keys()]), \ + assert lr_monitor.lr_sch_names == ['lr-Adam'] + assert list(lr_monitor.lrs.keys()) == ['lr-Adam/pg1', 'lr-Adam/pg2'], \ 'Names of learning rates not set correctly' + + +def test_lr_monitor_custom_name(tmpdir): + class TestModel(BoringModel): + def configure_optimizers(self): + optimizer, [scheduler] = super().configure_optimizers() + lr_scheduler = {'scheduler': scheduler, 'name': 'my_logging_name'} + return optimizer, [lr_scheduler] + + lr_monitor = LearningRateMonitor() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_val_batches=0.1, + limit_train_batches=0.5, + callbacks=[lr_monitor], + progress_bar_refresh_rate=0, + weights_summary=None, + ) + trainer.fit(TestModel()) + assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name'] diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 31154eac1bf0d6..106c34030051e7 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -import os.path as osp import pickle import platform import re from argparse import Namespace -from distutils.version import LooseVersion from pathlib import Path from unittest import mock -from unittest.mock import MagicMock, Mock +from unittest.mock import Mock import cloudpickle import pytest @@ -641,20 +639,17 @@ def validation_epoch_end(self, outputs): @pytest.mark.parametrize("enable_pl_optimizer", [False, True]) def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir): """ - This test validates that the checkpoint can be called when provided to callacks list + This test validates that the checkpoint can be called when provided to callbacks list """ - checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=tmpdir, filename="{epoch:02d}") class ExtendedBoringModel(BoringModel): - def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"val_loss": loss} model = ExtendedBoringModel() - model.validation_step_end = None model.validation_epoch_end = None trainer = Trainer( max_epochs=1, @@ -663,92 +658,30 @@ def validation_step(self, batch, batch_idx): limit_test_batches=2, callbacks=[checkpoint_callback], enable_pl_optimizer=enable_pl_optimizer, + weights_summary=None, + progress_bar_refresh_rate=0, ) - trainer.fit(model) assert os.listdir(tmpdir) == ['epoch=00.ckpt'] - def get_last_checkpoint(): - ckpts = os.listdir(tmpdir) - ckpts_map = {int(x.split("=")[1].split('.')[0]): osp.join(tmpdir, x) for x in ckpts if "epoch" in x} - num_ckpts = len(ckpts_map) - 1 - return ckpts_map[num_ckpts] - - for idx in range(1, 5): + for idx in range(4): # load from checkpoint - chk = get_last_checkpoint() - model = BoringModel.load_from_checkpoint(chk) - trainer = pl.Trainer( - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - resume_from_checkpoint=chk, - enable_pl_optimizer=enable_pl_optimizer) - trainer.fit(model) - trainer.test(model) - - assert str(os.listdir(tmpdir)) == "['epoch=00.ckpt']" - - -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.parametrize("enable_pl_optimizer", [False, True]) -def test_checkpoint_repeated_strategy_tmpdir(enable_pl_optimizer, tmpdir): - """ - This test validates that the checkpoint can be called when provided to callacks list - """ - - checkpoint_callback = ModelCheckpoint(monitor='val_loss', filepath=os.path.join(tmpdir, "{epoch:02d}")) - - class ExtendedBoringModel(BoringModel): - - def validation_step(self, batch, batch_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"val_loss": loss} - - model = ExtendedBoringModel() - model.validation_step_end = None - model.validation_epoch_end = None - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - callbacks=[checkpoint_callback], - enable_pl_optimizer=enable_pl_optimizer, - ) - - trainer.fit(model) - assert sorted(os.listdir(tmpdir)) == sorted(['epoch=00.ckpt', 'lightning_logs']) - path_to_lightning_logs = osp.join(tmpdir, 'lightning_logs') - assert sorted(os.listdir(path_to_lightning_logs)) == sorted(['version_0']) - - def get_last_checkpoint(): - ckpts = os.listdir(tmpdir) - ckpts_map = {int(x.split("=")[1].split('.')[0]): osp.join(tmpdir, x) for x in ckpts if "epoch" in x} - num_ckpts = len(ckpts_map) - 1 - return ckpts_map[num_ckpts] - - for idx in range(1, 5): - - # load from checkpoint - chk = get_last_checkpoint() - model = 
LogInTwoMethods.load_from_checkpoint(chk) + model = LogInTwoMethods.load_from_checkpoint(checkpoint_callback.best_model_path) trainer = pl.Trainer( default_root_dir=tmpdir, max_epochs=1, limit_train_batches=2, limit_val_batches=2, limit_test_batches=2, - resume_from_checkpoint=chk, - enable_pl_optimizer=enable_pl_optimizer) - + resume_from_checkpoint=checkpoint_callback.best_model_path, + enable_pl_optimizer=enable_pl_optimizer, + weights_summary=None, + progress_bar_refresh_rate=0, + ) trainer.fit(model) - trainer.test(model) - assert sorted(os.listdir(tmpdir)) == sorted(['epoch=00.ckpt', 'lightning_logs']) - assert sorted(os.listdir(path_to_lightning_logs)) == sorted([f'version_{i}' for i in range(idx + 1)]) + trainer.test(model, verbose=False) + assert set(os.listdir(tmpdir)) == {'epoch=00.ckpt', 'lightning_logs'} + assert set(os.listdir(tmpdir.join("lightning_logs"))) == {f'version_{i}' for i in range(4)} @mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) @@ -760,21 +693,22 @@ def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir): """ class ExtendedBoringModel(BoringModel): - def validation_step(self, batch, batch_idx): output = self.layer(batch) loss = self.loss(batch, output) return {"val_loss": loss} + def validation_epoch_end(self, *_): + ... + def assert_trainer_init(trainer): assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == 0 assert trainer.current_epoch == 0 def get_last_checkpoint(ckpt_dir): - ckpts = os.listdir(ckpt_dir) - ckpts.sort() - return osp.join(ckpt_dir, ckpts[-1]) + last = ckpt_dir.listdir(sort=True)[-1] + return str(last) def assert_checkpoint_content(ckpt_dir): chk = pl_load(get_last_checkpoint(ckpt_dir)) @@ -782,23 +716,15 @@ def assert_checkpoint_content(ckpt_dir): assert chk["global_step"] == 4 def assert_checkpoint_log_dir(idx): - lightning_logs_path = osp.join(tmpdir, 'lightning_logs') - assert sorted(os.listdir(lightning_logs_path)) == [f'version_{i}' for i in range(idx + 1)] - assert len(os.listdir(ckpt_dir)) == epochs - - def get_model(): - model = ExtendedBoringModel() - model.validation_step_end = None - model.validation_epoch_end = None - return model + lightning_logs = tmpdir / 'lightning_logs' + actual = [d.basename for d in lightning_logs.listdir(sort=True)] + assert actual == [f'version_{i}' for i in range(idx + 1)] + assert len(ckpt_dir.listdir()) == epochs - ckpt_dir = osp.join(tmpdir, 'checkpoints') + ckpt_dir = tmpdir / 'checkpoints' checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1) epochs = 2 limit_train_batches = 2 - - model = get_model() - trainer_config = dict( default_root_dir=tmpdir, max_epochs=epochs, @@ -806,40 +732,32 @@ def get_model(): limit_val_batches=3, limit_test_batches=4, enable_pl_optimizer=enable_pl_optimizer, - ) - - trainer = pl.Trainer( - **trainer_config, callbacks=[checkpoint_cb], ) + trainer = pl.Trainer(**trainer_config) assert_trainer_init(trainer) + model = ExtendedBoringModel() trainer.fit(model) assert trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches assert trainer.current_epoch == epochs - 1 assert_checkpoint_log_dir(0) + assert_checkpoint_content(ckpt_dir) trainer.test(model) assert trainer.current_epoch == epochs - 1 - assert_checkpoint_content(ckpt_dir) - for idx in range(1, 5): chk = get_last_checkpoint(ckpt_dir) assert_checkpoint_content(ckpt_dir) - checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1) - model = get_model() - # load from checkpoint - trainer = pl.Trainer( - 
**trainer_config, - resume_from_checkpoint=chk, - callbacks=[checkpoint_cb], - ) + trainer_config["callbacks"] = [ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1)] + trainer = pl.Trainer(**trainer_config, resume_from_checkpoint=chk) assert_trainer_init(trainer) + model = ExtendedBoringModel() trainer.test(model) assert not trainer.checkpoint_connector.has_trained assert trainer.global_step == epochs * limit_train_batches @@ -1020,3 +938,42 @@ def __init__(self, hparams): else: # make sure it's not AttributeDict assert type(ckpt[model.CHECKPOINT_HYPER_PARAMS_KEY]) == hparams_type + + +@pytest.mark.parametrize('max_epochs', [3, 4]) +@pytest.mark.parametrize( + 'save_top_k, expected', + [ + (1, ['curr_epoch.ckpt']), + (2, ['curr_epoch.ckpt', 'curr_epoch-v0.ckpt']), + ] +) +def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected): + """ + Test that version is added to filename if required and it already exists in dirpath. + """ + model_checkpoint = ModelCheckpoint( + dirpath=tmpdir, + filename='curr_epoch', + save_top_k=save_top_k, + monitor='epoch', + mode='max', + ) + trainer = Trainer( + default_root_dir=tmpdir, + callbacks=[model_checkpoint], + max_epochs=max_epochs, + limit_train_batches=2, + limit_val_batches=2, + logger=None, + weights_summary=None, + progress_bar_refresh_rate=0, + ) + + model = BoringModel() + trainer.fit(model) + ckpt_files = os.listdir(tmpdir) + assert set(ckpt_files) == set(expected) + + epochs_in_ckpt_files = [pl_load(os.path.join(tmpdir, f))['epoch'] - 1 for f in ckpt_files] + assert sorted(epochs_in_ckpt_files) == list(range(max_epochs - save_top_k, max_epochs)) diff --git a/tests/collect_env_details.py b/tests/collect_env_details.py index 1d443795d28767..2b8c4b3fafeed3 100644 --- a/tests/collect_env_details.py +++ b/tests/collect_env_details.py @@ -1,3 +1,16 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """Diagnose your system and show basic information This server mainly to get detail info for better bug reporting. diff --git a/tests/conftest.py b/tests/conftest.py index ad4b7169456a89..c6a14a99b24789 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,21 @@ -import sys -import threading +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from functools import partial, wraps from http.server import SimpleHTTPRequestHandler +import sys +import threading import pytest import torch.multiprocessing as mp diff --git a/tests/core/test_results.py b/tests/core/test_results.py index f4486ce6ae4194..797004b7f21ffa 100644 --- a/tests/core/test_results.py +++ b/tests/core/test_results.py @@ -18,7 +18,7 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp -from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning import Trainer from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult import tests.base.develop_utils as tutils diff --git a/tests/deprecated_api/__init__.py b/tests/deprecated_api/__init__.py new file mode 100644 index 00000000000000..99e21d1ed6b229 --- /dev/null +++ b/tests/deprecated_api/__init__.py @@ -0,0 +1,21 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test deprecated functionality which will be removed in vX.Y.Z""" +import sys + + +def _soft_unimport_module(str_module): + # once the module is imported e.g with parsing with pytest it lives in memory + if str_module in sys.modules: + del sys.modules[str_module] diff --git a/tests/deprecated_api/test_remove_1-2.py b/tests/deprecated_api/test_remove_1-2.py new file mode 100644 index 00000000000000..331208d56df103 --- /dev/null +++ b/tests/deprecated_api/test_remove_1-2.py @@ -0,0 +1,45 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Test deprecated functionality which will be removed in vX.Y.Z""" + +import pytest +import torch + +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +def test_tbd_remove_in_v1_2_0(): + with pytest.deprecated_call(match='will be removed in v1.2'): + ModelCheckpoint(filepath='..') + + with pytest.deprecated_call(match='will be removed in v1.2'): + ModelCheckpoint('..') + + with pytest.raises(MisconfigurationException, match='inputs which are not feasible'): + ModelCheckpoint(filepath='..', dirpath='.') + + +def test_tbd_remove_in_v1_2_0_metrics(): + from pytorch_lightning.metrics.classification import Fbeta + from pytorch_lightning.metrics.functional.classification import f1_score, fbeta_score + + with pytest.deprecated_call(match='will be removed in v1.2'): + Fbeta(2) + + with pytest.deprecated_call(match='will be removed in v1.2'): + fbeta_score(torch.tensor([0, 1, 2, 3]), torch.tensor([0, 1, 2, 1]), 0.2) + + with pytest.deprecated_call(match='will be removed in v1.2'): + f1_score(torch.tensor([0, 1, 0, 1]), torch.tensor([0, 1, 0, 0])) diff --git a/tests/test_deprecated.py b/tests/deprecated_api/test_remove_1-3.py similarity index 60% rename from tests/test_deprecated.py rename to tests/deprecated_api/test_remove_1-3.py index 59c6728009b6f0..7ec69796b1e46e 100644 --- a/tests/test_deprecated.py +++ b/tests/deprecated_api/test_remove_1-3.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Test deprecated functionality which will be removed in vX.Y.Z""" -import sys from argparse import ArgumentParser from unittest import mock @@ -21,10 +20,8 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint -from pytorch_lightning.metrics.functional.classification import auc from pytorch_lightning.profiler.profilers import PassThroughProfiler, SimpleProfiler from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests.base import EvalModelTemplate def test_tbd_remove_in_v1_3_0(tmpdir): @@ -52,27 +49,27 @@ def __init__(self, hparams): def test_tbd_remove_in_v1_3_0_metrics(): + from pytorch_lightning.metrics.functional.classification import to_onehot with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import to_onehot to_onehot(torch.tensor([1, 2, 3])) + from pytorch_lightning.metrics.functional.classification import to_categorical with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import to_categorical to_categorical(torch.tensor([[0.2, 0.5], [0.9, 0.1]])) + from pytorch_lightning.metrics.functional.classification import get_num_classes with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import get_num_classes get_num_classes(pred=torch.tensor([0, 1]), target=torch.tensor([1, 1])) x_binary = torch.tensor([0, 1, 2, 3]) y_binary = torch.tensor([0, 1, 2, 3]) + from pytorch_lightning.metrics.functional.classification import roc with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import roc roc(pred=x_binary, target=y_binary) + from pytorch_lightning.metrics.functional.classification import _roc with pytest.deprecated_call(match='will be removed in v1.3'): - from 
pytorch_lightning.metrics.functional.classification import _roc _roc(pred=x_binary, target=y_binary) x_multy = torch.tensor([[0.85, 0.05, 0.05, 0.05], @@ -81,64 +78,40 @@ def test_tbd_remove_in_v1_3_0_metrics(): [0.05, 0.05, 0.05, 0.85]]) y_multy = torch.tensor([0, 1, 3, 2]) + from pytorch_lightning.metrics.functional.classification import multiclass_roc with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import multiclass_roc multiclass_roc(pred=x_multy, target=y_multy) + from pytorch_lightning.metrics.functional.classification import average_precision with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import average_precision average_precision(pred=x_binary, target=y_binary) + from pytorch_lightning.metrics.functional.classification import precision_recall_curve with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import precision_recall_curve precision_recall_curve(pred=x_binary, target=y_binary) + from pytorch_lightning.metrics.functional.classification import multiclass_precision_recall_curve with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.classification import multiclass_precision_recall_curve multiclass_precision_recall_curve(pred=x_multy, target=y_multy) + from pytorch_lightning.metrics.functional.reduction import reduce with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.reduction import reduce reduce(torch.tensor([0, 1, 1, 0]), 'sum') + from pytorch_lightning.metrics.functional.reduction import class_reduce with pytest.deprecated_call(match='will be removed in v1.3'): - from pytorch_lightning.metrics.functional.reduction import class_reduce class_reduce(torch.randint(1, 10, (50,)).float(), torch.randint(10, 20, (50,)).float(), torch.randint(1, 100, (50,)).float()) -def test_tbd_remove_in_v1_2_0(): - with pytest.deprecated_call(match='will be removed in v1.2'): - checkpoint_cb = ModelCheckpoint(filepath='.') - - with pytest.deprecated_call(match='will be removed in v1.2'): - checkpoint_cb = ModelCheckpoint('.') - - with pytest.raises(MisconfigurationException, match='inputs which are not feasible'): - checkpoint_cb = ModelCheckpoint(filepath='.', dirpath='.') - - -def test_tbd_remove_in_v1_2_0_metrics(): - from pytorch_lightning.metrics.classification import Fbeta - from pytorch_lightning.metrics.functional.classification import f1_score, fbeta_score - - with pytest.deprecated_call(match='will be removed in v1.2'): - Fbeta(2) - - with pytest.deprecated_call(match='will be removed in v1.2'): - fbeta_score(torch.tensor([0, 1, 2, 3]), torch.tensor([0, 1, 2, 1]), 0.2) - - with pytest.deprecated_call(match='will be removed in v1.2'): - f1_score(torch.tensor([0, 1, 0, 1]), torch.tensor([0, 1, 0, 0])) - - # TODO: remove bool from Trainer.profiler param in v1.3.0, update profiler_connector.py @pytest.mark.parametrize(['profiler', 'expected'], [ (True, SimpleProfiler), (False, PassThroughProfiler), ]) def test_trainer_profiler_remove_in_v1_3_0(profiler, expected): + # remove bool from Trainer.profiler param in v1.3.0, update profiler_connector.py with pytest.deprecated_call(match='will be removed in v1.3'): trainer = Trainer(profiler=profiler) assert isinstance(trainer.profiler, expected) @@ -162,47 +135,3 @@ def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, 
expected_parsed_arg, ex assert getattr(args, "profiler") == expected_parsed_arg trainer = Trainer.from_argparse_args(args) assert isinstance(trainer.profiler, expected_profiler) - - -def _soft_unimport_module(str_module): - # once the module is imported e.g with parsing with pytest it lives in memory - if str_module in sys.modules: - del sys.modules[str_module] - - -class ModelVer0_6(EvalModelTemplate): - - # todo: this shall not be needed while evaluate asks for dataloader explicitly - def val_dataloader(self): - return self.dataloader(train=False) - - def validation_step(self, batch, batch_idx, *args, **kwargs): - return {'val_loss': torch.tensor(0.6)} - - def validation_end(self, outputs): - return {'val_loss': torch.tensor(0.6)} - - def test_dataloader(self): - return self.dataloader(train=False) - - def test_end(self, outputs): - return {'test_loss': torch.tensor(0.6)} - - -class ModelVer0_7(EvalModelTemplate): - - # todo: this shall not be needed while evaluate asks for dataloader explicitly - def val_dataloader(self): - return self.dataloader(train=False) - - def validation_step(self, batch, batch_idx, *args, **kwargs): - return {'val_loss': torch.tensor(0.7)} - - def validation_end(self, outputs): - return {'val_loss': torch.tensor(0.7)} - - def test_dataloader(self): - return self.dataloader(train=False) - - def test_end(self, outputs): - return {'test_loss': torch.tensor(0.7)} diff --git a/tests/metrics/regression/test_ssim.py b/tests/metrics/regression/test_ssim.py index f581188e89fce5..8bb304850e3f22 100644 --- a/tests/metrics/regression/test_ssim.py +++ b/tests/metrics/regression/test_ssim.py @@ -53,9 +53,7 @@ def _sk_metric(preds, target, data_range, multichannel): class TestSSIM(MetricTester): atol = 6e-5 - # TODO: for some reason this test hangs with ddp=True - # @pytest.mark.parametrize("ddp", [True, False]) - @pytest.mark.parametrize("ddp", [False]) + @pytest.mark.parametrize("ddp", [True, False]) @pytest.mark.parametrize("dist_sync_on_step", [True, False]) def test_ssim(self, preds, target, multichannel, ddp, dist_sync_on_step): self.run_class_metric_test( diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index c607a466b20683..4bd6608ce3fcf4 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -11,6 +11,11 @@ from pytorch_lightning.metrics import Metric +try: + set_start_method("spawn") +except RuntimeError: + pass + NUM_PROCESSES = 2 NUM_BATCHES = 10 BATCH_SIZE = 32 @@ -165,10 +170,7 @@ def setup_class(self): """Setup the metric class. 
This will spawn the pool of workers that are used for metric testing and setup_ddp """ - try: - set_start_method("spawn") - except RuntimeError: - pass + self.poolSize = NUM_PROCESSES self.pool = Pool(processes=self.poolSize) self.pool.starmap(setup_ddp, [(rank, self.poolSize) for rank in range(self.poolSize)]) diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py index f10753491d447c..f41bf59bb4f4c2 100644 --- a/tests/models/data/horovod/train_default_model.py +++ b/tests/models/data/horovod/train_default_model.py @@ -74,9 +74,11 @@ def run_test_from_config(trainer_options): for dataloader in test_loaders: run_prediction(dataloader, pretrained_model) - # test HPC loading / saving + # test HPC saving trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger) - trainer.checkpoint_connector.hpc_load(ckpt_path, on_gpu=args.on_gpu) + # test HPC loading + checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path) + trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=args.on_gpu) if args.on_gpu: trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1) diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py index a3919a6a8a7ddd..82727d37479b68 100644 --- a/tests/models/test_onnx.py +++ b/tests/models/test_onnx.py @@ -21,44 +21,44 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer -from tests.base import EvalModelTemplate +from tests.base import BoringModel, EvalModelTemplate def test_model_saves_with_input_sample(tmpdir): """Test that ONNX model saves with input sample and size is greater than 3 MB""" - model = EvalModelTemplate() + model = BoringModel() trainer = Trainer(max_epochs=1) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") - input_sample = torch.randn((1, 28 * 28)) + input_sample = torch.randn((1, 32)) model.to_onnx(file_path, input_sample) assert os.path.isfile(file_path) - assert os.path.getsize(file_path) > 3e+06 + assert os.path.getsize(file_path) > 4e2 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_model_saves_on_gpu(tmpdir): """Test that model saves on gpu""" - model = EvalModelTemplate() + model = BoringModel() trainer = Trainer(gpus=1, max_epochs=1) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") - input_sample = torch.randn((1, 28 * 28)) + input_sample = torch.randn((1, 32)) model.to_onnx(file_path, input_sample) assert os.path.isfile(file_path) - assert os.path.getsize(file_path) > 3e+06 + assert os.path.getsize(file_path) > 4e2 def test_model_saves_with_example_output(tmpdir): """Test that ONNX model saves when provided with example output""" - model = EvalModelTemplate() + model = BoringModel() trainer = Trainer(max_epochs=1) trainer.fit(model) file_path = os.path.join(tmpdir, "model.onnx") - input_sample = torch.randn((1, 28 * 28)) + input_sample = torch.randn((1, 32)) model.eval() example_outputs = model.forward(input_sample) model.to_onnx(file_path, input_sample, example_outputs=example_outputs) @@ -67,11 +67,13 @@ def test_model_saves_with_example_output(tmpdir): def test_model_saves_with_example_input_array(tmpdir): """Test that ONNX model saves with_example_input_array and size is greater than 3 MB""" - model = EvalModelTemplate() + model = BoringModel() + model.example_input_array = torch.randn(5, 32) + file_path = os.path.join(tmpdir, "model.onnx") model.to_onnx(file_path) assert 
os.path.exists(file_path) is True - assert os.path.getsize(file_path) > 3e+06 + assert os.path.getsize(file_path) > 4e2 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -100,7 +102,9 @@ def test_model_saves_on_multi_gpu(tmpdir): def test_verbose_param(tmpdir, capsys): """Test that output is present when verbose parameter is set""" - model = EvalModelTemplate() + model = BoringModel() + model.example_input_array = torch.randn(5, 32) + file_path = os.path.join(tmpdir, "model.onnx") model.to_onnx(file_path, verbose=True) captured = capsys.readouterr() @@ -108,8 +112,8 @@ def test_verbose_param(tmpdir, capsys): def test_error_if_no_input(tmpdir): - """Test that an exception is thrown when there is no input tensor""" - model = EvalModelTemplate() + """Test that an error is thrown when there is no input tensor""" + model = BoringModel() model.example_input_array = None file_path = os.path.join(tmpdir, "model.onnx") with pytest.raises(ValueError, match=r'Could not export to ONNX since neither `input_sample` nor' @@ -117,21 +121,12 @@ def test_error_if_no_input(tmpdir): model.to_onnx(file_path) -def test_error_if_input_sample_is_not_tensor(tmpdir): - """Test that an exception is thrown when there is no input tensor""" - model = EvalModelTemplate() - model.example_input_array = None - file_path = os.path.join(tmpdir, "model.onnx") - input_sample = np.random.randn(1, 28 * 28) - with pytest.raises(ValueError, match=f'Received `input_sample` of type {type(input_sample)}. Expected type is ' - f'`Tensor`'): - model.to_onnx(file_path, input_sample) - - def test_if_inference_output_is_valid(tmpdir): """Test that the output inferred from ONNX model is same as from PyTorch""" - model = EvalModelTemplate() - trainer = Trainer(max_epochs=5) + model = BoringModel() + model.example_input_array = torch.randn(5, 32) + + trainer = Trainer(max_epochs=2) trainer.fit(model) model.eval() diff --git a/tests/models/test_torchscript.py b/tests/models/test_torchscript.py index bf2c34b8bfef5b..3c43b201f52e4c 100644 --- a/tests/models/test_torchscript.py +++ b/tests/models/test_torchscript.py @@ -16,43 +16,72 @@ import pytest import torch -from tests.base import EvalModelTemplate +from tests.base import BoringModel from tests.base.datamodules import TrialMNISTDataModule from tests.base.models import ParityModuleRNN, BasicGAN @pytest.mark.parametrize("modelclass", [ - EvalModelTemplate, + BoringModel, ParityModuleRNN, BasicGAN, ]) def test_torchscript_input_output(modelclass): """ Test that scripted LightningModule forward works. """ model = modelclass() + + if isinstance(model, BoringModel): + model.example_input_array = torch.randn(5, 32) + script = model.to_torchscript() assert isinstance(script, torch.jit.ScriptModule) + model.eval() - model_output = model(model.example_input_array) + with torch.no_grad(): + model_output = model(model.example_input_array) + script_output = script(model.example_input_array) assert torch.allclose(script_output, model_output) @pytest.mark.parametrize("modelclass", [ - EvalModelTemplate, + BoringModel, ParityModuleRNN, BasicGAN, ]) -def test_torchscript_input_output_trace(modelclass): - """ Test that traced LightningModule forward works. 
""" +def test_torchscript_example_input_output_trace(modelclass): + """ Test that traced LightningModule forward works with example_input_array """ model = modelclass() + + if isinstance(model, BoringModel): + model.example_input_array = torch.randn(5, 32) + script = model.to_torchscript(method='trace') assert isinstance(script, torch.jit.ScriptModule) + model.eval() - model_output = model(model.example_input_array) + with torch.no_grad(): + model_output = model(model.example_input_array) + script_output = script(model.example_input_array) assert torch.allclose(script_output, model_output) +def test_torchscript_input_output_trace(): + """ Test that traced LightningModule forward works with example_inputs """ + model = BoringModel() + example_inputs = torch.randn(1, 32) + script = model.to_torchscript(example_inputs=example_inputs, method='trace') + assert isinstance(script, torch.jit.ScriptModule) + + model.eval() + with torch.no_grad(): + model_output = model(example_inputs) + + script_output = script(example_inputs) + assert torch.allclose(script_output, model_output) + + @pytest.mark.parametrize("device", [ torch.device("cpu"), torch.device("cuda", 0) @@ -60,7 +89,9 @@ def test_torchscript_input_output_trace(modelclass): @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine") def test_torchscript_device(device): """ Test that scripted module is on the correct device. """ - model = EvalModelTemplate().to(device) + model = BoringModel().to(device) + model.example_input_array = torch.randn(5, 32) + script = model.to_torchscript() assert next(script.parameters()).device == device script_output = script(model.example_input_array.to(device)) @@ -69,7 +100,7 @@ def test_torchscript_device(device): def test_torchscript_retain_training_state(): """ Test that torchscript export does not alter the training mode of original model. """ - model = EvalModelTemplate() + model = BoringModel() model.train(True) script = model.to_torchscript() assert model.training @@ -81,7 +112,7 @@ def test_torchscript_retain_training_state(): @pytest.mark.parametrize("modelclass", [ - EvalModelTemplate, + BoringModel, ParityModuleRNN, BasicGAN, ]) @@ -100,7 +131,7 @@ def test_torchscript_properties(modelclass): @pytest.mark.parametrize("modelclass", [ - EvalModelTemplate, + BoringModel, ParityModuleRNN, BasicGAN, ]) @@ -109,9 +140,27 @@ def test_torchscript_properties(modelclass): reason="torch.save/load has bug loading script modules on torch <= 1.4", ) def test_torchscript_save_load(tmpdir, modelclass): - """ Test that scripted LightningModules is correctly saved and can be loaded. """ + """ Test that scripted LightningModule is correctly saved and can be loaded. 
""" model = modelclass() output_file = str(tmpdir / "model.pt") script = model.to_torchscript(file_path=output_file) loaded_script = torch.jit.load(output_file) assert torch.allclose(next(script.parameters()), next(loaded_script.parameters())) + + +def test_torchcript_invalid_method(tmpdir): + """Test that an error is thrown with invalid torchscript method""" + model = BoringModel() + model.train(True) + + with pytest.raises(ValueError, match="only supports 'script' or 'trace'"): + model.to_torchscript(method='temp') + + +def test_torchscript_with_no_input(tmpdir): + """Test that an error is thrown when there is no input tensor""" + model = BoringModel() + model.example_input_array = None + + with pytest.raises(ValueError, match='requires either `example_inputs` or `model.example_input_array`'): + model.to_torchscript(method='trace') diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index e838dc60d81b31..37ab774bc83421 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -244,39 +244,6 @@ def test_distributed_backend_set_when_using_tpu(tmpdir, tpu_cores): assert Trainer(tpu_cores=tpu_cores).distributed_backend == "tpu" -@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"}) -@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -@pl_multi_process_test -def test_result_obj_on_tpu(tmpdir): - seed_everything(1234) - - batches = 5 - epochs = 2 - - model = EvalModelTemplate() - model.training_step = model.training_step_result_obj - model.training_step_end = None - model.training_epoch_end = None - model.validation_step = model.validation_step_result_obj - model.validation_step_end = None - model.validation_epoch_end = None - model.test_step = model.test_step_result_obj - model.test_step_end = None - model.test_epoch_end = None - - trainer_options = dict( - default_root_dir=tmpdir, - max_epochs=epochs, - callbacks=[EarlyStopping()], - log_every_n_steps=2, - limit_train_batches=batches, - weights_summary=None, - tpu_cores=8 - ) - - tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - - @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_broadcast_on_tpu(): diff --git a/tests/special_tests.sh b/tests/special_tests.sh index f7cb5819517839..950e3776bbc7fa 100644 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -19,4 +19,4 @@ python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic -# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance +python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 3bce379c1115c2..91a8631a732870 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -1,6 +1,20 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os -import time from pathlib import Path +import time import numpy as np import pytest diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 0c27d8909d760f..51b9c2ac69496d 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -18,6 +18,7 @@ import collections import itertools import os +import platform from unittest import mock import numpy as np @@ -26,8 +27,8 @@ from torch.utils.data import Dataset import pytorch_lightning as pl -from pytorch_lightning import Trainer, callbacks -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning import callbacks, Trainer +from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset from tests.base.deterministic_model import DeterministicModel @@ -685,6 +686,7 @@ class TestModel(BoringModel): def training_step(self, batch, batch_idx): acc = self.step(batch[0]) self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') return acc def validation_step(self, batch, batch_idx): @@ -704,9 +706,46 @@ def validation_step(self, batch, batch_idx): trainer.fit(model) assert trainer.logged_metrics['foo'] == fake_result + assert trainer.logged_metrics['foo_2'] == 2 assert trainer.logged_metrics['bar'] == fake_result +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +def test_logging_sync_dist_true_ddp(tmpdir): + """ + Tests to ensure that the sync_dist flag works with ddp + """ + class TestLoggingSyncDistModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM') + return acc + + def validation_step(self, batch, batch_idx): + self.training_step_called = True + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='AVG') + return {"x": loss} + + model = TestLoggingSyncDistModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + weights_summary=None, + accelerator="ddp", + gpus=2, + ) + trainer.fit(model) + + assert trainer.logged_metrics['foo'] == 2 + assert trainer.logged_metrics['bar'] == 2 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_logging_sync_dist_true_gpu(tmpdir): """ @@ -771,3 +810,48 @@ def on_train_epoch_end(self, *_): trainer.fit(model) assert model.epoch_end_called assert model.on_train_epoch_end_called + + +@pytest.mark.skipif(not torch.cuda.is_available(), 
reason="requires GPU machine") +def test_metric_are_properly_reduced(tmpdir): + class TestingModel(BoringModel): + def __init__(self, *args, **kwargs): + super().__init__() + self.train_acc = pl.metrics.Accuracy() + self.val_acc = pl.metrics.Accuracy() + + def training_step(self, batch, batch_idx): + self.train_acc(torch.rand(1, 3, device=self.device), torch.randint(0, 2, (1,), device=self.device)) + self.log('train_acc', self.train_acc, on_step=True, on_epoch=True) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch, batch_idx): + preds = torch.tensor(0, device=self.device) + targets = torch.tensor(1, device=self.device) + if batch_idx < 8: + targets = preds + self.val_acc(preds, targets) + self.log('val_acc', self.val_acc, on_step=True, on_epoch=True) + return super().validation_step(batch, batch_idx) + + early_stop = EarlyStopping(monitor='val_acc', mode='max') + + checkpoint = ModelCheckpoint( + monitor='val_acc', + save_last=True, + save_top_k=2, + mode='max', + ) + + model = TestingModel() + trainer = Trainer( + default_root_dir=tmpdir, + gpus=1, + max_epochs=2, + limit_train_batches=5, + limit_val_batches=32, + callbacks=[early_stop, checkpoint]) + trainer.fit(model) + + assert trainer.callback_metrics["val_acc"] == 8 / 32. + assert "train_acc" in trainer.callback_metrics diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py new file mode 100644 index 00000000000000..78b6f8f7ff84a3 --- /dev/null +++ b/tests/trainer/optimization/test_multiple_optimizers.py @@ -0,0 +1,63 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Tests to ensure that the behaviours related to multiple optimizers works +""" +import torch + +import pytorch_lightning as pl +from tests.base.boring_model import BoringModel + + +def test_unbalanced_logging_with_multiple_optimizers(tmpdir): + """ + This tests ensures reduction works in un-balanced logging settings + """ + class TestModel(BoringModel): + + loss_1 = [] + loss_2 = [] + + def training_step(self, batch, batch_idx, optimizer_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + if optimizer_idx == 0 and self.trainer.global_step > 10: + self.log("loss_1", loss, on_epoch=True, prog_bar=True) + self.loss_1.append(loss.detach().clone()) + elif optimizer_idx == 1: + self.log("loss_2", loss, on_epoch=True, prog_bar=True) + self.loss_2.append(loss.detach().clone()) + return {"loss": loss} + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001) + optimizer2 = torch.optim.SGD(self.layer.parameters(), lr=0.001) + return [optimizer, optimizer2] + + model = TestModel() + model.training_epoch_end = None + + # Initialize a trainer + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=1, + ) + + trainer.fit(model) + + assert torch.equal(trainer.callback_metrics["loss_2_step"], model.loss_2[-1]) + assert torch.equal(trainer.callback_metrics["loss_1_step"], model.loss_1[-1]) + # test loss are properly reduced + assert torch.abs(trainer.callback_metrics["loss_2_epoch"] - torch.FloatTensor(model.loss_2).mean()) < 1e-6 + assert torch.abs(trainer.callback_metrics["loss_1_epoch"] - torch.FloatTensor(model.loss_1).mean()) < 1e-6 diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py index 2e76192836740c..52e085b2b7b8cb 100644 --- a/tests/trainer/test_optimizers.py +++ b/tests/trainer/test_optimizers.py @@ -15,7 +15,6 @@ import torch from pytorch_lightning import Callback, Trainer -from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate from tests.base.boring_model import BoringModel @@ -177,6 +176,7 @@ def test_reducelronplateau_scheduling(tmpdir): frequency=1, reduce_on_plateau=True, strict=True, + name=None, ), 'lr scheduler was not correctly converted to dict' @@ -215,7 +215,13 @@ def test_optimizer_return_options(enable_pl_optimizer): assert len(freq) == 0 assert optim[0] == opt_a assert lr_sched[0] == dict( - scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True + scheduler=scheduler_a, + interval='epoch', + frequency=1, + reduce_on_plateau=False, + monitor=None, + strict=True, + name=None, ) # opt tuple of 1 list @@ -225,7 +231,13 @@ def test_optimizer_return_options(enable_pl_optimizer): assert len(freq) == 0 assert optim[0] == opt_a assert lr_sched[0] == dict( - scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True + scheduler=scheduler_a, + interval='epoch', + frequency=1, + reduce_on_plateau=False, + monitor=None, + strict=True, + name=None, ) # opt single dictionary @@ -235,7 +247,13 @@ def test_optimizer_return_options(enable_pl_optimizer): assert len(freq) == 0 assert optim[0] == opt_a assert lr_sched[0] == dict( - scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True + scheduler=scheduler_a, + interval='epoch', + frequency=1, + reduce_on_plateau=False, + monitor=None, + strict=True, + name=None, ) # opt 
multiple dictionaries with frequencies @@ -247,7 +265,13 @@ def test_optimizer_return_options(enable_pl_optimizer): assert len(optim) == len(lr_sched) == len(freq) == 2 assert optim[0] == opt_a assert lr_sched[0] == dict( - scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True + scheduler=scheduler_a, + interval='epoch', + frequency=1, + reduce_on_plateau=False, + monitor=None, + strict=True, + name=None, ) assert freq == [1, 5] diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py new file mode 100644 index 00000000000000..b8a0e066cdef89 --- /dev/null +++ b/tests/trainer/test_supporters.py @@ -0,0 +1,38 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch + +from pytorch_lightning.trainer.supporters import TensorRunningAccum + + +def test_tensor_running_accum_reset(): + """ Test that reset would set all attributes to the initialization state """ + + window_length = 10 + + accum = TensorRunningAccum(window_length=window_length) + assert accum.last() is None + assert accum.mean() is None + + accum.append(torch.tensor(1.5)) + assert accum.last() == torch.tensor(1.5) + assert accum.mean() == torch.tensor(1.5) + + accum.reset() + assert accum.window_length == window_length + assert accum.memory is None + assert accum.current_idx == 0 + assert accum.last_idx is None + assert not accum.rotated diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9b29d6ec2b1dd6..9e5ceccf9b646b 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -11,12 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import glob import math import os import pickle import sys -import types from argparse import Namespace from copy import deepcopy from pathlib import Path @@ -34,6 +32,7 @@ from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler from pytorch_lightning.trainer.logging import TrainerLoggingMixin +from pytorch_lightning.trainer.states import TrainerState from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -61,6 +60,7 @@ def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt): result = trainer.fit(model) # training complete assert result == 1, "amp + ddp model failed to complete" + assert trainer.state == TrainerState.FINISHED # save model new_weights_path = os.path.join(tmpdir, "save_test.ckpt") @@ -107,6 +107,7 @@ def test_no_val_end_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt): # traning complete assert result == 1, "amp + ddp model failed to complete" + assert trainer.state == TrainerState.FINISHED # save model new_weights_path = os.path.join(tmpdir, "save_test.ckpt") @@ -151,6 +152,7 @@ def test_strict_model_load(monkeypatch, tmpdir, tmpdir_server, url_ckpt): # traning complete assert result == 1 + assert trainer.state == TrainerState.FINISHED # save model new_weights_path = os.path.join(tmpdir, "save_test.ckpt") @@ -468,6 +470,7 @@ def test_model_checkpoint_only_weights(tmpdir): result = trainer.fit(model) # training complete assert result == 1, "training failed to complete" + assert trainer.state == TrainerState.FINISHED checkpoint_path = list(trainer.checkpoint_callback.best_k_models.keys())[0] @@ -507,35 +510,23 @@ def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_serve # set $TORCH_HOME, which determines torch hub's cache path, to tmpdir monkeypatch.setenv("TORCH_HOME", tmpdir) - hparams = EvalModelTemplate.get_default_hparams() - - def _new_model(): - # Create a model that tracks epochs and batches seen - model = EvalModelTemplate(**hparams) - model.num_epochs_seen = 0 - model.num_batches_seen = 0 - model.num_on_load_checkpoint_called = 0 + class TestModel(BoringModel): + # Model that tracks epochs and batches seen + num_epochs_seen = 0 + num_batches_seen = 0 + num_on_load_checkpoint_called = 0 - def increment_epoch(self): + def on_epoch_end(self): self.num_epochs_seen += 1 - def increment_batch(self, batch, batch_idx, dataloader_idx): + def on_train_batch_start(self, *_): self.num_batches_seen += 1 - def increment_on_load_checkpoint(self, _): + def on_load_checkpoint(self, _): self.num_on_load_checkpoint_called += 1 - # Bind methods to keep track of epoch numbers, batch numbers it has seen - # as well as number of times it has called on_load_checkpoint() - model.on_epoch_end = types.MethodType(increment_epoch, model) - model.on_train_batch_start = types.MethodType(increment_batch, model) - model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model) - return model - - model = _new_model() - - trainer_options = dict( - progress_bar_refresh_rate=0, + model = TestModel() + trainer = Trainer( max_epochs=2, limit_train_batches=0.65, limit_val_batches=1, @@ -543,144 +534,125 @@ def increment_on_load_checkpoint(self, _): default_root_dir=tmpdir, val_check_interval=1.0, enable_pl_optimizer=enable_pl_optimizer, + progress_bar_refresh_rate=0, + logger=False, + 
weights_summary=None, ) - - trainer = Trainer(**trainer_options) - # fit model trainer.fit(model) - training_batches = trainer.num_training_batches - assert model.num_epochs_seen == 2 - assert model.num_batches_seen == training_batches * 2 + assert model.num_batches_seen == trainer.num_training_batches * 2 assert model.num_on_load_checkpoint_called == 0 # Other checkpoints can be uncommented if/when resuming mid-epoch is supported - checkpoints = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt"))) + checkpoints = Path(trainer.checkpoint_callback.dirpath).glob("*.ckpt") if url_ckpt: # transform local paths into url checkpoints ip, port = tmpdir_server - checkpoints = [f"http://{ip}:{port}/" + os.path.basename(check) for check in checkpoints] + checkpoints = [f"http://{ip}:{port}/" + ckpt.name for ckpt in checkpoints] - for check in checkpoints: - next_model = _new_model() - state = pl_load(check) + for ckpt in checkpoints: + next_model = TestModel() + state = pl_load(ckpt) # Resume training - trainer_options["max_epochs"] = 2 - new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check) + new_trainer = Trainer(resume_from_checkpoint=ckpt, max_epochs=2) new_trainer.fit(next_model) - assert state["global_step"] + next_model.num_batches_seen == training_batches * trainer_options["max_epochs"] + assert state["global_step"] + next_model.num_batches_seen == trainer.num_training_batches * trainer.max_epochs assert next_model.num_on_load_checkpoint_called == 1 -def _init_steps_model(): - """private method for initializing a model with 5% train epochs""" - model = EvalModelTemplate() - - # define train epoch to 5% of data - train_percent = 0.5 - # get number of samples in 1 epoch - num_train_samples = math.floor(len(model.train_dataloader()) * train_percent) - - trainer_options = dict( - limit_train_batches=train_percent, - ) - return model, trainer_options, num_train_samples - - def test_trainer_max_steps_and_epochs(tmpdir): """Verify model trains according to specified max steps""" - model, trainer_options, num_train_samples = _init_steps_model() + model = BoringModel() + num_train_samples = math.floor(len(model.train_dataloader()) * 0.5) # define less train steps than epochs - trainer_options.update( - default_root_dir=tmpdir, - max_epochs=3, - max_steps=num_train_samples + 10, - ) - - # fit model - trainer = Trainer(**trainer_options) + trainer_kwargs = { + 'limit_train_batches': 0.5, + 'default_root_dir': tmpdir, + 'max_epochs': 3, + 'max_steps': num_train_samples + 10, + 'logger': False, + 'weights_summary': None, + 'progress_bar_refresh_rate': 0, + } + trainer = Trainer(**trainer_kwargs) result = trainer.fit(model) - assert result == 1, "Training did not complete" - # check training stopped at max_steps + assert result == 1, "Training did not complete" + assert trainer.state == TrainerState.FINISHED assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps" # define less train epochs than steps - trainer_options.update( - max_epochs=2, - max_steps=trainer_options["max_epochs"] * 2 * num_train_samples, - ) - - # fit model - trainer = Trainer(**trainer_options) + trainer_kwargs['max_epochs'] = 2 + trainer_kwargs['max_steps'] = 3 * 2 * num_train_samples + trainer = Trainer(**trainer_kwargs) result = trainer.fit(model) - assert result == 1, "Training did not complete" - # check training stopped at max_epochs + assert result == 1, "Training did not complete" + assert trainer.state == TrainerState.FINISHED assert trainer.global_step == 
num_train_samples * trainer.max_epochs assert trainer.current_epoch == trainer.max_epochs - 1, "Model did not stop at max_epochs" def test_trainer_min_steps_and_epochs(tmpdir): """Verify model trains according to specified min steps""" - model, trainer_options, num_train_samples = _init_steps_model() - - # define callback for stopping the model and default epochs - trainer_options.update( - default_root_dir=tmpdir, - callbacks=[EarlyStopping(monitor="early_stop_on", min_delta=1.0)], - val_check_interval=2, - min_epochs=1, - max_epochs=7, - ) - - # define less min steps than 1 epoch - trainer_options["min_steps"] = math.floor(num_train_samples / 2) - - # fit model - trainer = Trainer(**trainer_options) + model = EvalModelTemplate() + num_train_samples = math.floor(len(model.train_dataloader()) * 0.5) + + trainer_kwargs = { + 'limit_train_batches': 0.5, + 'default_root_dir': tmpdir, + # define callback for stopping the model + 'callbacks': [EarlyStopping(monitor="early_stop_on", min_delta=1.0)], + 'val_check_interval': 2, + 'min_epochs': 1, + 'max_epochs': 7, + # define less min steps than 1 epoch + 'min_steps': num_train_samples // 2, + 'logger': False, + 'weights_summary': None, + 'progress_bar_refresh_rate': 0, + } + trainer = Trainer(**trainer_kwargs) result = trainer.fit(model) - assert result == 1, "Training did not complete" - # check model ran for at least min_epochs - assert ( - trainer.global_step >= num_train_samples and trainer.current_epoch > 0 - ), "Model did not train for at least min_epochs" + assert result == 1, "Training did not complete" + assert trainer.state == TrainerState.FINISHED + assert trainer.current_epoch > 0 + assert trainer.global_step >= num_train_samples, "Model did not train for at least min_epochs" # define less epochs than min_steps - trainer_options["min_steps"] = math.floor(num_train_samples * 1.5) - - # fit model - trainer = Trainer(**trainer_options) + trainer_kwargs["min_steps"] = math.floor(num_train_samples * 1.5) + trainer = Trainer(**trainer_kwargs) result = trainer.fit(model) - assert result == 1, "Training did not complete" - # check model ran for at least num_train_samples*1.5 - assert ( - trainer.global_step >= math.floor(num_train_samples * 1.5) and trainer.current_epoch > 0 - ), "Model did not train for at least min_steps" + assert result == 1, "Training did not complete" + assert trainer.state == TrainerState.FINISHED + assert trainer.current_epoch > 0 + assert trainer.global_step >= math.floor(num_train_samples * 1.5), "Model did not train for at least min_steps" def test_trainer_max_steps_accumulate_batches(tmpdir): """Verify model trains according to specified max steps with grad accumulated batches""" - model, trainer_options, num_train_samples = _init_steps_model() + model = BoringModel() + num_train_samples = math.floor(len(model.train_dataloader()) * 0.5) # define less train steps than epochs - trainer_options.update( + trainer = Trainer( + limit_train_batches=0.5, default_root_dir=tmpdir, - max_steps=(num_train_samples + 10), + max_steps=num_train_samples + 10, accumulate_grad_batches=10, + logger=False, + weights_summary=None, + progress_bar_refresh_rate=0, ) - - # fit model - trainer = Trainer(**trainer_options) result = trainer.fit(model) - assert result == 1, "Training did not complete" - # check training stopped at max_steps + assert result == 1, "Training did not complete" + assert trainer.state == TrainerState.FINISHED assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps" @@ -703,6 +675,7 @@ def 
test_benchmark_option(tmpdir): # verify training completed assert result == 1 + assert trainer.state == TrainerState.FINISHED # verify torch.backends.cudnn.benchmark is not turned off assert torch.backends.cudnn.benchmark @@ -788,6 +761,7 @@ def training_epoch_end(self, *args, **kwargs): # check that limit_train_batches=0 turns off training assert result == 1, "training failed to complete" + assert trainer.state == TrainerState.FINISHED assert trainer.current_epoch == 0 assert not model.training_step_invoked, "`training_step` should not run when `limit_train_batches=0`" assert not model.training_epoch_end_invoked, "`training_epoch_end` should not run when `limit_train_batches=0`" @@ -806,6 +780,7 @@ def training_epoch_end(self, *args, **kwargs): assert not torch.all(torch.eq(before_state_dict[key], after_state_dict[key])) assert result == 1, "training failed to complete" + assert trainer.state == TrainerState.FINISHED assert trainer.current_epoch == 0 assert model.training_step_invoked, "did not run `training_step` with `fast_dev_run=True`" assert model.training_epoch_end_invoked, "did not run `training_epoch_end` with `fast_dev_run=True`" @@ -844,6 +819,7 @@ def validation_epoch_end(self, *args, **kwargs): # check that limit_val_batches=0 turns off validation assert result == 1, "training failed to complete" + assert trainer.state == TrainerState.FINISHED assert trainer.current_epoch == 1 assert not model.validation_step_invoked, "`validation_step` should not run when `limit_val_batches=0`" assert not model.validation_epoch_end_invoked, "`validation_epoch_end` should not run when `limit_val_batches=0`" @@ -855,6 +831,7 @@ def validation_epoch_end(self, *args, **kwargs): result = trainer.fit(model) assert result == 1, "training failed to complete" + assert trainer.state == TrainerState.FINISHED assert trainer.current_epoch == 0 assert model.validation_step_invoked, "did not run `validation_step` with `fast_dev_run=True`" assert model.validation_epoch_end_invoked, "did not run `validation_epoch_end` with `fast_dev_run=True`" @@ -958,6 +935,7 @@ def test_gradient_clipping(tmpdir): """ Test gradient clipping """ + tutils.reset_seed() model = EvalModelTemplate() @@ -995,6 +973,7 @@ def test_gradient_clipping_fp16(tmpdir): """ Test gradient clipping with fp16 """ + tutils.reset_seed() model = EvalModelTemplate() @@ -1117,7 +1096,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): @pytest.mark.parametrize( "trainer_kwargs,expected", [ - pytest.param( + ( dict(accelerator=None, gpus=None), dict( use_dp=False, @@ -1129,7 +1108,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=1, ), ), - pytest.param( + ( dict(accelerator="dp", gpus=None), dict( use_dp=False, @@ -1141,7 +1120,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=1, ), ), - pytest.param( + ( dict(accelerator="dp", gpus=None), dict( use_dp=False, @@ -1153,7 +1132,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=1, ), ), - pytest.param( + ( dict(accelerator="ddp", gpus=None), dict( use_dp=False, @@ -1165,7 +1144,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=1, ), ), - pytest.param( + ( dict(accelerator="ddp", num_processes=2, gpus=None), dict( use_dp=False, @@ -1177,7 +1156,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=2, ), ), - pytest.param( + ( dict(accelerator="ddp", num_nodes=2, gpus=None), dict( use_dp=False, @@ -1189,7 +1168,7 @@ 
def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=1, ), ), - pytest.param( + ( dict(accelerator="ddp_cpu", num_processes=2, gpus=None), dict( use_dp=False, @@ -1201,7 +1180,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=2, ), ), - pytest.param( + ( dict(accelerator="ddp2", gpus=None), dict( use_dp=False, @@ -1213,7 +1192,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): num_processes=1, ), ), - pytest.param( + ( dict(accelerator=None, gpus=1), dict( use_dp=False, @@ -1224,9 +1203,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=True, num_processes=1, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")], ), - pytest.param( + ( dict(accelerator="dp", gpus=1), dict( use_dp=True, @@ -1237,9 +1215,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=True, num_processes=1, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")], ), - pytest.param( + ( dict(accelerator="ddp", gpus=1), dict( use_dp=False, @@ -1250,9 +1227,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=True, num_processes=1, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")], ), - pytest.param( + ( dict(accelerator="ddp_cpu", num_processes=2, gpus=1), dict( use_dp=False, @@ -1263,9 +1239,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=False, num_processes=2, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")], ), - pytest.param( + ( dict(accelerator="ddp2", gpus=1), dict( use_dp=False, @@ -1276,9 +1251,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=False, num_processes=1, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")], ), - pytest.param( + ( dict(accelerator=None, gpus=2), dict( use_dp=False, @@ -1289,9 +1263,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=False, num_processes=2, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")], ), - pytest.param( + ( dict(accelerator="dp", gpus=2), dict( use_dp=True, @@ -1302,9 +1275,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=False, num_processes=1, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")], ), - pytest.param( + ( dict(accelerator="ddp", gpus=2), dict( use_dp=False, @@ -1315,9 +1287,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=False, num_processes=2, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")], ), - pytest.param( + ( dict(accelerator="ddp2", gpus=2), dict( use_dp=False, @@ -1328,21 +1299,17 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches): use_single_gpu=False, num_processes=1, ), - marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")], ), ], ) -# Todo: mock nb Gpus so all these tests can run on any device -# todo: think about simplification, that the the expected will be just a list use_xxx which shall be true... 
-def test_trainer_config(trainer_kwargs, expected): +def test_trainer_config(trainer_kwargs, expected, monkeypatch): + if trainer_kwargs["gpus"] is not None: + monkeypatch.setattr(torch.cuda, "is_available", lambda: True) + monkeypatch.setattr(torch.cuda, "device_count", lambda: trainer_kwargs["gpus"]) trainer = Trainer(**trainer_kwargs) - assert trainer.use_dp is expected["use_dp"], 'for input: %s' % trainer_kwargs - assert trainer.use_ddp is expected["use_ddp"], 'for input: %s' % trainer_kwargs - assert trainer.use_ddp2 is expected["use_ddp2"], 'for input: %s' % trainer_kwargs - assert trainer.num_gpus == expected["num_gpus"], 'for input: %s' % trainer_kwargs - assert trainer.on_gpu is expected["on_gpu"], 'for input: %s' % trainer_kwargs - assert trainer.use_single_gpu is expected["use_single_gpu"], 'for input: %s' % trainer_kwargs - assert trainer.num_processes == expected["num_processes"], 'for input: %s' % trainer_kwargs + assert len(expected) == 7 + for k, v in expected.items(): + assert getattr(trainer, k) == v, f"Failed {k}: {v}" def test_trainer_subclassing(): @@ -1358,6 +1325,7 @@ def __init__(self, custom_arg, *args, custom_kwarg="test", **kwargs): trainer = TrainerSubclass(123, custom_kwarg="custom", fast_dev_run=True) result = trainer.fit(model) assert result == 1 + assert trainer.state == TrainerState.FINISHED assert trainer.custom_arg == 123 assert trainer.custom_kwarg == "custom" assert trainer.fast_dev_run @@ -1373,6 +1341,7 @@ def __init__(self, **kwargs): trainer = TrainerSubclass(custom_kwarg="custom", fast_dev_run=True) result = trainer.fit(model) assert result == 1 + assert trainer.state == TrainerState.FINISHED assert trainer.custom_kwarg == "custom" assert trainer.fast_dev_run