diff --git a/.mergify.yml b/.mergify.yml
index 44c48f2ddced5e..cb5ef3ec7519a8 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -12,59 +12,59 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-pull_request_rules:
-
- - name: Automatic merge on approval
- conditions:
- - base=master
- # number of review approvals
- - "#approved-reviews-by>=3"
- # no waiting or assigned review
- - "#review-requested=0"
- # no requested chnages from any reviewer
- - "#changes-requested-reviews-by=0"
- # this serves as ALL check has to pass as we have actually around 40 tests in total
- - "#status-success>=54"
- # this is just in case since we rely on GPU tests (note: redundand to the above)
- - status-success=continuous-integration/drone/pr
- - "status-success=ci/circleci: TPU-tests"
- # this is patter-like, unofrunatly serves as `any(...)` (note: redundand to the above)
- #- "status-success~=^ci/circleci:"
- # no conflict with master branch
- - -conflict
- # was not closed yet
- - -closed
- # filter-out GH draft PRs
- - -draft
- actions:
- delete_head_branch: {}
- merge:
- # https://doc.mergify.io/merge-action.html#strict-merge
- # (on head branch) $ git merge --no-ff base
- # (on head branch) # Wait for CI to go green
- # (on head branch) # Squash all commits
- # (on base branch) $ git merge --ff head
- strict: true
- method: squash
- comment:
- message: Great job! =)
-
- - name: warn on conflicts
- conditions:
- - conflict
- # filter-out GH draft PRs
- - -draft
- actions:
- comment:
- message: This pull request is now in conflict... :(
-
- - name: add core reviewer
- conditions:
- # filter-out GH draft PRs
- - -draft
- # number of review approvals
- - "#approved-reviews-by<3"
- actions:
- request_reviews:
- teams:
- - core-contributors
+#pull_request_rules:
+#
+# - name: Automatic merge on approval
+# conditions:
+# - base=master
+# # number of review approvals
+# - "#approved-reviews-by>=3"
+# # no waiting or assigned review
+# - "#review-requested=0"
+# # no requested changes from any reviewer
+# - "#changes-requested-reviews-by=0"
+# # this serves as an "ALL checks have to pass" requirement, as we have around 40 tests in total
+# - "#status-success>=54"
+# # this is just in case since we rely on GPU tests (note: redundant to the above)
+# - status-success=continuous-integration/drone/pr
+# - "status-success=ci/circleci: TPU-tests"
+# # this is pattern-like, unfortunately it serves as `any(...)` (note: redundant to the above)
+# #- "status-success~=^ci/circleci:"
+# # no conflict with master branch
+# - -conflict
+# # was not closed yet
+# - -closed
+# # filter-out GH draft PRs
+# - -draft
+# actions:
+# delete_head_branch: {}
+# merge:
+# # https://doc.mergify.io/merge-action.html#strict-merge
+# # (on head branch) $ git merge --no-ff base
+# # (on head branch) # Wait for CI to go green
+# # (on head branch) # Squash all commits
+# # (on base branch) $ git merge --ff head
+# strict: true
+# method: squash
+# comment:
+# message: Great job! =)
+#
+# - name: warn on conflicts
+# conditions:
+# - conflict
+# # filter-out GH draft PRs
+# - -draft
+# actions:
+# comment:
+# message: This pull request is now in conflict... :(
+#
+# - name: add core reviewer
+# conditions:
+# # filter-out GH draft PRs
+# - -draft
+# # number of review approvals
+# - "#approved-reviews-by<3"
+# actions:
+# request_reviews:
+# teams:
+# - core-contributors
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f078349ef3665d..051fe5fae09e5e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
-## [unreleased.Features] - YYYY-MM-DD
+## [unreleased.BugFix] - YYYY-MM-DD
### Added
@@ -22,28 +22,39 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
-
-## [unreleased.BugFix] - YYYY-MM-DD
+## [1.1.1] - 2020-12-15
### Added
+- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using ResNet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818))
### Changed
-
-### Deprecated
+- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015))
+- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593))
+- Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861))
### Removed
+- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014))
+- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076))
### Fixed
- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915))
-
-
-- Fixed `LightningOptimizer` exposes optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095))
-
+- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095))
+- Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057))
+- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981))
+- Extend `LightningOptimizer` to expose underlying Optimizer attributes + update doc ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095))
+- Add deprecated metric utility functions back to functional (
+ [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067),
+ [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068))
+- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378))
+
+- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157))
## [1.1.0] - 2020-12-09
diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py
index 41bba9533e10d7..3508d5a3c28acc 100644
--- a/benchmarks/test_parity.py
+++ b/benchmarks/test_parity.py
@@ -4,8 +4,8 @@
import pytest
import torch
+from pytorch_lightning import seed_everything, Trainer
import tests.base.develop_utils as tutils
-from pytorch_lightning import Trainer, seed_everything
from tests.base.models import ParityModuleMNIST, ParityModuleRNN
diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py
index 9fe49764421785..2e52613462621c 100644
--- a/benchmarks/test_sharded_parity.py
+++ b/benchmarks/test_sharded_parity.py
@@ -6,7 +6,7 @@
import pytest
import torch
-from pytorch_lightning import Trainer, seed_everything
+from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
from pytorch_lightning.plugins.sharded_plugin import DDPShardedPlugin
from pytorch_lightning.utilities import FAIRSCALE_AVAILABLE, NATIVE_AMP_AVAILABLE
diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile
index 8eb093295c37bb..5dfeac8c9e86ea 100644
--- a/dockers/base-xla/Dockerfile
+++ b/dockers/base-xla/Dockerfile
@@ -97,6 +97,8 @@ RUN \
python -c "fname = 'requirements.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torch')] ; open(fname, 'w').writelines(lines)" && \
# drop Horovod as it is not needed
python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+ # drop fairscale as it is not needed
+ python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \
# drop TorchVision as it was installed with XLA
python -c "fname = 'requirements/examples.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('torchvision')] ; open(fname, 'w').writelines(lines)" && \
pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed && \
diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile
index a514b1c3d35fed..464f7fd8f309eb 100644
--- a/dockers/tpu-tests/Dockerfile
+++ b/dockers/tpu-tests/Dockerfile
@@ -27,8 +27,10 @@ COPY ./ ./pytorch-lightning/
RUN \
# Install pytorch-lightning at the current PR, plus dependencies.
#pip install -r pytorch-lightning/requirements.txt --no-cache-dir && \
- # drop Horovod
+ # drop Horovod as it is not needed
python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
+ # drop fairscale as it is not needed
+ python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)" && \
pip install -r pytorch-lightning/requirements/devel.txt --no-cache-dir --upgrade-strategy only-if-needed
#RUN python -c "import pytorch_lightning as pl; print(pl.__version__)"
diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst
index d6d082e2ed779b..d4cf578e10bda2 100644
--- a/docs/source/introduction_guide.rst
+++ b/docs/source/introduction_guide.rst
@@ -601,8 +601,8 @@ In this method we do all the preparation we need to do once (instead of on every
def setup(self, stage):
# transform
transform=transforms.Compose([transforms.ToTensor()])
- MNIST(os.getcwd(), train=True, download=False, transform=transform)
- MNIST(os.getcwd(), train=False, download=False, transform=transform)
+ mnist_train = MNIST(os.getcwd(), train=True, download=False, transform=transform)
+ mnist_test = MNIST(os.getcwd(), train=False, download=False, transform=transform)
# train/val split
mnist_train, mnist_val = random_split(mnist_train, [55000, 5000])
diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index def47810504d69..b3e0b905f27f43 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -663,7 +663,7 @@ It is highly recommended to use Sharded Training in multi-GPU environments where
A technical note: as batch size scales, storing activations for the backwards pass becomes the bottleneck in training. As a result, sharding optimizer state and gradients becomes less impactful.
Work within the future will bring optional sharding to activations and model parameters to reduce memory further, but come with a speed cost.
-To use Sharded Training, you need to first install FairScale using the command below or install all extras using ``pip install pytorch-lightning["extra"]``.
+To use Sharded Training, you need to first install FairScale using the command below.
.. code-block:: bash
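To make the reworded sentence above concrete, here is a minimal sketch of how sharded training is typically enabled once FairScale is installed. The `ddp_sharded` plugin alias and the 2-GPU setup are assumptions based on the 1.1-era API, not an excerpt from the edited page:

# Hedged sketch: enabling sharded training after installing FairScale.
# The 'ddp_sharded' alias and gpus=2 are illustrative assumptions.
from pytorch_lightning import Trainer

trainer = Trainer(
    gpus=2,
    accelerator='ddp',       # sharded training builds on DDP
    plugins='ddp_sharded',   # requires fairscale to be installed
)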
diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb
index 037b24e4ddd9dc..d52af84a76d975 100644
--- a/notebooks/04-transformers-text-classification.ipynb
+++ b/notebooks/04-transformers-text-classification.ipynb
@@ -1,5 +1,12 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb
index 6413e8239bb2e5..da044a9c9b5c6e 100644
--- a/notebooks/05-trainer-flags-overview.ipynb
+++ b/notebooks/05-trainer-flags-overview.ipynb
@@ -1,5 +1,12 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ ""
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
diff --git a/pyproject.toml b/pyproject.toml
index 760421a56ece8c..01e416aa51d8b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ exclude = "(.eggs|.git|.hg|.mypy_cache|.nox|.tox|.venv|.svn|_build|buck-out|buil
[tool.isort]
known_first_party = [
- "bencharmks",
+ "benchmarks",
"docs",
"pl_examples",
"pytorch_lightning",
@@ -52,3 +52,5 @@ skip_glob = [
]
profile = "black"
line_length = 120
+force_sort_within_sections = "True"
+order_by_type = "False"
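The two options added above drive the import reorderings seen throughout this diff. A small illustration under those settings, using modules taken from changes elsewhere in this PR:

# order_by_type = false: names inside a from-import are sorted alphabetically rather
# than grouped as constants/classes/functions, hence this diff's reordering:
from pytorch_lightning import seed_everything, Trainer  # was: Trainer, seed_everything

# force_sort_within_sections = true: plain imports and from-imports are interleaved
# alphabetically within a section instead of being kept in two separate groups:
import os
from pathlib import Path
import re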
diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py
index 408d95a72dc470..222263ea2d3853 100644
--- a/pytorch_lightning/__init__.py
+++ b/pytorch_lightning/__init__.py
@@ -1,6 +1,6 @@
"""Root package info."""
-__version__ = '1.1.1rc0'
+__version__ = '1.1.1'
__author__ = 'William Falcon et al.'
__author_email__ = 'waf2107@columbia.edu'
__license__ = 'Apache-2.0'
diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
index a0545a4604aece..b9a71ed2717441 100644
--- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py
@@ -48,3 +48,6 @@ def model_to_device(self, model, process_idx):
def get_device_ids(self):
device_ids = None
return device_ids
+
+ def init_device(self, process_idx):
+ pass
diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
index ec4c087998614e..b257884e34aef5 100644
--- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
+++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py
@@ -126,6 +126,7 @@ def ddp_train(self, process_idx, model):
"""
# determine which process we are and world size
self.set_world_ranks(process_idx)
+ self.init_device(process_idx)
# toggle prog bar
if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None:
diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 88f1881643c9aa..4125a924cb2c59 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -19,6 +19,7 @@
Monitor a metric and stop training when it stops improving.
"""
+import numbers
import os
import numpy as np
@@ -26,7 +27,8 @@
from pytorch_lightning import _logger as log
from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.utilities import rank_zero_info, rank_zero_warn, TPU_AVAILABLE
+from pytorch_lightning.metrics.metric import Metric
+from pytorch_lightning.utilities import TPU_AVAILABLE, rank_zero_info, rank_zero_warn
class EarlyStopping(Callback):
@@ -201,8 +203,11 @@ def _run_early_stopping_check(self, trainer, pl_module):
# when in dev debugging
trainer.dev_debugger.track_early_stopping_history(self, current)
- if not isinstance(current, torch.Tensor):
- current = torch.tensor(current, device=pl_module.device)
+ if current is not None:
+ if isinstance(current, Metric):
+ current = current.compute()
+ elif isinstance(current, numbers.Number):
+ current = torch.tensor(current, device=pl_module.device, dtype=torch.float)
if trainer.use_tpu and TPU_AVAILABLE:
current = current.cpu()
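The branch above lets the monitored value be a `Metric` instance (resolved via `.compute()`) or a plain number, not only a tensor. A minimal sketch of the setup this affects; the `val_loss` key and the patience value are illustrative:

# Illustrative EarlyStopping wiring; the monitored key is whatever the module logs.
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, mode='min')
trainer = Trainer(callbacks=[early_stop])
# inside the LightningModule's validation_step:
#     self.log('val_loss', loss)   # a tensor, a float, or a Metric-backed value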
diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py
index 081aec45067cf1..9799e0d3298d35 100755
--- a/pytorch_lightning/callbacks/lr_monitor.py
+++ b/pytorch_lightning/callbacks/lr_monitor.py
@@ -157,7 +157,7 @@ def _find_names(self, lr_schedulers) -> List[str]:
names = []
for scheduler in lr_schedulers:
sch = scheduler['scheduler']
- if 'name' in scheduler:
+ if scheduler['name'] is not None:
name = scheduler['name']
else:
opt_name = 'lr-' + sch.optimizer.__class__.__name__
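`scheduler['name']` can now be checked directly because `'name': None` is added to the default lr_scheduler config (see `trainer/optimizers.py` and the `configure_optimizers` docstring further down in this diff). A minimal sketch of a user-supplied name; the optimizer and scheduler choices are illustrative:

# Illustrative configure_optimizers with the new optional 'name' key, which
# LearningRateMonitor uses as the logging key instead of an auto-generated one.
import torch
from pytorch_lightning import LightningModule

class NamedSchedulerModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler, 'name': 'my-lr'},
        }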
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 1354f7f5056b39..82df32ce3996c2 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -20,6 +20,7 @@
"""
+import numbers
import os
import re
from copy import deepcopy
@@ -32,8 +33,9 @@
from pytorch_lightning import _logger as log
from pytorch_lightning.callbacks.base import Callback
-from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn
+from pytorch_lightning.metrics.metric import Metric
from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
+from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities.cloud_io import get_filesystem
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -240,17 +242,14 @@ def save_checkpoint(self, trainer, pl_module):
# what can be monitored
monitor_candidates = self._monitor_candidates(trainer)
- # ie: path/val_loss=0.5.ckpt
- filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step)
-
# callback supports multiple simultaneous modes
# here we call each mode sequentially
# Mode 1: save all checkpoints OR only the top k
if self.save_top_k:
- self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath)
+ self._save_top_k_checkpoints(trainer, pl_module, monitor_candidates)
# Mode 2: save the last checkpoint
- self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath)
+ self._save_last_checkpoint(trainer, pl_module, monitor_candidates)
def __validate_init_configuration(self):
if self.save_top_k is not None and self.save_top_k < -1:
@@ -444,6 +443,7 @@ def format_checkpoint_name(
)
if ver is not None:
filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}"))
+
ckpt_name = f"{filename}{self.FILE_EXTENSION}"
return os.path.join(self.dirpath, ckpt_name) if self.dirpath else ckpt_name
@@ -515,13 +515,20 @@ def _validate_monitor_key(self, trainer):
)
raise MisconfigurationException(m)
- def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: Dict[str, Any], epoch: int, step: int):
+ def _get_metric_interpolated_filepath_name(
+ self,
+ ckpt_name_metrics: Dict[str, Any],
+ epoch: int,
+ step: int,
+ del_filepath: Optional[str] = None
+ ) -> str:
filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics)
+
version_cnt = 0
- while self._fs.exists(filepath):
+ while self._fs.exists(filepath) and filepath != del_filepath:
filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt)
- # this epoch called before
version_cnt += 1
+
return filepath
def _monitor_candidates(self, trainer):
@@ -531,13 +538,11 @@ def _monitor_candidates(self, trainer):
ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch})
return ckpt_name_metrics
- def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath):
+ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics):
should_save_last = self.monitor is None or self.save_last
if not should_save_last:
return
- last_filepath = filepath
-
# when user ALSO asked for the 'last.ckpt' change the name
if self.save_last:
last_filepath = self._format_checkpoint_name(
@@ -548,6 +553,10 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath)
prefix=self.prefix
)
last_filepath = os.path.join(self.dirpath, f"{last_filepath}{self.FILE_EXTENSION}")
+ else:
+ last_filepath = self._get_metric_interpolated_filepath_name(
+ ckpt_name_metrics, trainer.current_epoch, trainer.global_step
+ )
accelerator_backend = trainer.accelerator_backend
@@ -568,16 +577,19 @@ def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath)
if self.monitor is None:
self.best_model_path = self.last_model_path
- def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath):
+ def _save_top_k_checkpoints(self, trainer, pl_module, metrics):
current = metrics.get(self.monitor)
epoch = metrics.get("epoch")
step = metrics.get("step")
- if not isinstance(current, torch.Tensor) and current is not None:
- current = torch.tensor(current, device=pl_module.device)
+ if current is not None:
+ if isinstance(current, Metric):
+ current = current.compute()
+ elif isinstance(current, numbers.Number):
+ current = torch.tensor(current, device=pl_module.device, dtype=torch.float)
if self.check_monitor_top_k(current):
- self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module)
+ self._update_best_and_save(current, epoch, step, trainer, pl_module, metrics)
elif self.verbose:
rank_zero_info(
f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}"
@@ -588,25 +600,26 @@ def _is_valid_monitor_key(self, metrics):
def _update_best_and_save(
self,
- filepath: str,
current: torch.Tensor,
epoch: int,
step: int,
trainer,
pl_module,
+ ckpt_name_metrics
):
k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k
- del_list = []
+ del_filepath = None
if len(self.best_k_models) == k and k > 0:
- delpath = self.kth_best_model_path
- self.best_k_models.pop(self.kth_best_model_path)
- del_list.append(delpath)
+ del_filepath = self.kth_best_model_path
+ self.best_k_models.pop(del_filepath)
# do not save nan, replace with +/- inf
if torch.isnan(current):
current = torch.tensor(float('inf' if self.mode == "min" else '-inf'))
+ filepath = self._get_metric_interpolated_filepath_name(ckpt_name_metrics, epoch, step, del_filepath)
+
# save the current score
self.current_score = current
self.best_k_models[filepath] = current
@@ -630,9 +643,8 @@ def _update_best_and_save(
)
self._save_model(filepath, trainer, pl_module)
- for cur_path in del_list:
- if cur_path != filepath:
- self._del_model(cur_path)
+ if del_filepath is not None and filepath != del_filepath:
+ self._del_model(del_filepath)
def to_yaml(self, filepath: Optional[Union[str, Path]] = None):
"""
diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py
index 57979b73f2cb6d..f24a4ce8beb8ac 100644
--- a/pytorch_lightning/core/hooks.py
+++ b/pytorch_lightning/core/hooks.py
@@ -14,7 +14,7 @@
"""Various hooks to be used in the Lightning code."""
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
import torch
from pytorch_lightning.utilities import move_data_to_device, rank_zero_warn
@@ -501,7 +501,7 @@ def val_dataloader(self):
will have an argument ``dataloader_idx`` which matches the order here.
"""
- def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any:
+ def transfer_batch_to_device(self, batch: Any, device: Optional[torch.device] = None) -> Any:
"""
Override this hook if your :class:`~torch.utils.data.DataLoader` returns tensors
wrapped in a custom data structure.
@@ -549,6 +549,7 @@ def transfer_batch_to_device(self, batch, device)
- :func:`~pytorch_lightning.utilities.apply_func.move_data_to_device`
- :func:`~pytorch_lightning.utilities.apply_func.apply_to_collection`
"""
+ device = device or self.device
return move_data_to_device(batch, device)
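With `device` now optional and defaulting to `self.device`, overrides and internal callers (such as the `to_onnx`/`to_torchscript` changes below) can omit it. A minimal sketch of an override under that signature; `CustomBatch` is a hypothetical container:

# Hypothetical custom batch container, used only to illustrate the optional-device hook.
import torch
from pytorch_lightning import LightningModule
from pytorch_lightning.utilities import move_data_to_device

class CustomBatch:
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

class CustomTransferModel(LightningModule):
    def transfer_batch_to_device(self, batch, device=None):
        device = device or self.device  # mirrors the new fallback above
        if isinstance(batch, CustomBatch):
            batch.inputs = batch.inputs.to(device)
            batch.targets = batch.targets.to(device)
            return batch
        return move_data_to_device(batch, device)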
diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py
index ef05ce69c1828b..ab66435a2935db 100644
--- a/pytorch_lightning/core/lightning.py
+++ b/pytorch_lightning/core/lightning.py
@@ -22,6 +22,7 @@
import tempfile
from abc import ABC
from argparse import Namespace
+from pathlib import Path
from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
import torch
@@ -278,6 +279,7 @@ def log(
sync_dist_group,
accelerator.sync_tensor,
self._current_dataloader_idx,
+ self.device,
)
def log_dict(
@@ -989,7 +991,7 @@ def configure_optimizers(
- List or Tuple - List of optimizers.
- Two lists - The first list has multiple optimizers, the second a list of LR schedulers (or lr_dict).
- Dictionary, with an 'optimizer' key, and (optionally) a 'lr_scheduler'
- key which value is a single LR scheduler or lr_dict.
+ key whose value is a single LR scheduler or lr_dict.
- Tuple of dictionaries as described, with an optional 'frequency' key.
- None - Fit will run without any optimizer.
@@ -1001,21 +1003,22 @@ def configure_optimizers(
In the former case, all optimizers will operate on the given batch in each optimization step.
In the latter, only one optimizer will operate on the given batch at every step.
- The lr_dict is a dictionary which contains scheduler and its associated configuration.
- It has five keys. The default configuration is shown below.
+ The lr_dict is a dictionary which contains the scheduler and its associated configuration.
+ The default configuration is shown below.
.. code-block:: python
{
- 'scheduler': lr_scheduler, # The LR schduler
+ 'scheduler': lr_scheduler, # The LR scheduler instance (required)
'interval': 'epoch', # The unit of the scheduler's step size
'frequency': 1, # The frequency of the scheduler
'reduce_on_plateau': False, # For ReduceLROnPlateau scheduler
'monitor': 'val_loss', # Metric for ReduceLROnPlateau to monitor
- 'strict': True # Whether to crash the training if `monitor` is not found
+ 'strict': True, # Whether to crash the training if `monitor` is not found
+ 'name': None, # Custom name for LearningRateMonitor to use
}
- If user only provides LR schedulers, then their configuration will set to default as shown above.
+ Only the ``scheduler`` key is required, the rest will be set to the defaults above.
Examples:
.. code-block:: python
@@ -1390,12 +1393,15 @@ def get_progress_bar_dict(self):
"""
# call .item() only once but store elements without graphs
running_train_loss = self.trainer.train_loop.running_loss.mean()
- avg_training_loss = (
- running_train_loss.cpu().item()
- if running_train_loss is not None
- else float("NaN")
- )
- tqdm_dict = {"loss": "{:.3g}".format(avg_training_loss)}
+ avg_training_loss = None
+ if running_train_loss is not None:
+ avg_training_loss = running_train_loss.cpu().item()
+ elif self.trainer.train_loop.automatic_optimization:
+ avg_training_loss = float('NaN')
+
+ tqdm_dict = {}
+ if avg_training_loss is not None:
+ tqdm_dict["loss"] = f"{avg_training_loss:.3g}"
if self.trainer.truncated_bptt_steps is not None:
tqdm_dict["split_idx"] = self.trainer.split_idx
@@ -1530,12 +1536,19 @@ def _set_hparams(self, hp: Union[dict, Namespace, str]) -> None:
else:
self._hparams = hp
- def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwargs):
- """Saves the model in ONNX format
+ @torch.no_grad()
+ def to_onnx(
+ self,
+ file_path: Union[str, Path],
+ input_sample: Optional[Any] = None,
+ **kwargs,
+ ):
+ """
+ Saves the model in ONNX format
Args:
- file_path: The path of the file the model should be saved to.
- input_sample: A sample of an input tensor for tracing.
+ file_path: The path of the file the onnx model should be saved to.
+ input_sample: An input for tracing. Default: None (Use self.example_input_array)
**kwargs: Will be passed to torch.onnx.export function.
Example:
@@ -1554,31 +1567,32 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg
... os.path.isfile(tmpfile.name)
True
"""
+ mode = self.training
- if isinstance(input_sample, Tensor):
- input_data = input_sample
- elif self.example_input_array is not None:
- input_data = self.example_input_array
- else:
- if input_sample is not None:
+ if input_sample is None:
+ if self.example_input_array is None:
raise ValueError(
- f"Received `input_sample` of type {type(input_sample)}. Expected type is `Tensor`"
+ "Could not export to ONNX since neither `input_sample` nor"
+ " `model.example_input_array` attribute is set."
)
- raise ValueError(
- "Could not export to ONNX since neither `input_sample` nor"
- " `model.example_input_array` attribute is set."
- )
- input_data = input_data.to(self.device)
+ input_sample = self.example_input_array
+
+ input_sample = self.transfer_batch_to_device(input_sample)
+
if "example_outputs" not in kwargs:
self.eval()
- with torch.no_grad():
- kwargs["example_outputs"] = self(input_data)
+ kwargs["example_outputs"] = self(input_sample)
- torch.onnx.export(self, input_data, file_path, **kwargs)
+ torch.onnx.export(self, input_sample, file_path, **kwargs)
+ self.train(mode)
+ @torch.no_grad()
def to_torchscript(
- self, file_path: Optional[str] = None, method: Optional[str] = 'script',
- example_inputs: Optional[Union[torch.Tensor, Tuple[torch.Tensor]]] = None, **kwargs
+ self,
+ file_path: Optional[Union[str, Path]] = None,
+ method: Optional[str] = 'script',
+ example_inputs: Optional[Any] = None,
+ **kwargs,
) -> Union[ScriptModule, Dict[str, ScriptModule]]:
"""
By default compiles the whole model to a :class:`~torch.jit.ScriptModule`.
@@ -1590,7 +1604,7 @@ def to_torchscript(
Args:
file_path: Path where to save the torchscript. Default: None (no file saved).
method: Whether to use TorchScript's script or trace method. Default: 'script'
- example_inputs: Tensor to be used to do tracing when method is set to 'trace'.
+ example_inputs: An input to be used to do tracing when method is set to 'trace'.
Default: None (Use self.example_input_array)
**kwargs: Additional arguments that will be passed to the :func:`torch.jit.script` or
:func:`torch.jit.trace` function.
@@ -1624,21 +1638,27 @@ def to_torchscript(
This LightningModule as a torchscript, regardless of whether file_path is
defined or not.
"""
-
mode = self.training
- with torch.no_grad():
- if method == 'script':
- torchscript_module = torch.jit.script(self.eval(), **kwargs)
- elif method == 'trace':
- # if no example inputs are provided, try to see if model has example_input_array set
- if example_inputs is None:
- example_inputs = self.example_input_array
- # automatically send example inputs to the right device and use trace
- example_inputs = self.transfer_batch_to_device(example_inputs, device=self.device)
- torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs)
- else:
- raise ValueError(f"The 'method' parameter only supports 'script' or 'trace', but value given was:"
- f"{method}")
+
+ if method == 'script':
+ torchscript_module = torch.jit.script(self.eval(), **kwargs)
+ elif method == 'trace':
+ # if no example inputs are provided, try to see if model has example_input_array set
+ if example_inputs is None:
+ if self.example_input_array is None:
+ raise ValueError(
+ 'Choosing method=`trace` requires either `example_inputs`'
+ ' or `model.example_input_array` to be defined'
+ )
+ example_inputs = self.example_input_array
+
+ # automatically send example inputs to the right device and use trace
+ example_inputs = self.transfer_batch_to_device(example_inputs)
+ torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs)
+ else:
+ raise ValueError("The 'method' parameter only supports 'script' or 'trace',"
+ f" but value given was: {method}")
+
self.train(mode)
if file_path is not None:
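With the reworked exporters above, both `to_onnx` and `to_torchscript` accept arbitrary inputs (routed through `transfer_batch_to_device`) and fall back to `example_input_array` when no sample is given. A small usage sketch; the model, file names, and input shape are illustrative:

# Illustrative export calls; TinyModel, the file names and the input shape are examples.
import torch
from pytorch_lightning import LightningModule

class TinyModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)
        self.example_input_array = torch.randn(1, 4)

    def forward(self, x):
        return self.layer(x)

model = TinyModel()
model.to_onnx('model.onnx')                        # falls back to example_input_array
model.to_torchscript('model.pt', method='trace')   # traces with example_input_array
scripted = model.to_torchscript()                  # default: scripting, no file written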
diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py
index 142fe9048cb0ea..b6112a68b4e9b8 100644
--- a/pytorch_lightning/core/step_result.py
+++ b/pytorch_lightning/core/step_result.py
@@ -15,15 +15,15 @@
"""[Train, Eval]Result for easier logging, checkpointing, early stopping, epoch-wise reduction."""
import numbers
+import os
from copy import copy
-from typing import Optional, Dict, Union, Sequence, Callable, MutableMapping, Any, List, Tuple, Iterable
+from typing import Any, Callable, Dict, Iterable, List, MutableMapping, Optional, Sequence, Tuple, Union
import torch
from torch import Tensor
-import os
-from pytorch_lightning.utilities.distributed import sync_ddp_if_available
from pytorch_lightning.metrics import Metric
+from pytorch_lightning.utilities.distributed import sync_ddp_if_available
class Result(Dict):
@@ -128,6 +128,7 @@ def log(
sync_dist_group: Optional[Any] = None,
sync_fn: Callable = None,
dataloader_idx: Optional[int] = None,
+ device: torch.device = None,
):
# no metrics should be logged with graphs
if not enable_graph and isinstance(value, torch.Tensor):
@@ -138,7 +139,10 @@ def log(
if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)):
is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized()
# TODO: Find a way to make the reduction only once, so we don't need to clone.
- value = value.clone() if is_dist_initialized else value
+ if is_dist_initialized and isinstance(value, torch.Tensor):
+ value = value.clone()
+ else:
+ value = torch.tensor(value, device=device, dtype=torch.float)
value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op)
if 'meta' not in self:
@@ -367,7 +371,10 @@ def get_forked_metrics(self, add_dataloader_idx=False):
dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx)
if options['forked']:
- result[dl_key] = self[k]
+ if isinstance(self[k], Metric):
+ result[dl_key] = self[k].compute().detach()
+ else:
+ result[dl_key] = self[k]
return result
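The sync branch above now wraps plain numbers in a tensor on the caller's device before reduction, and forked `Metric` entries are computed when collected. From user code this path is reached via `self.log(..., sync_dist=True)`; a minimal sketch, with `val_loss` as an example key:

# Illustrative logging that exercises the patched sync/reduction path above.
import torch
from pytorch_lightning import LightningModule

class SyncLoggingModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def validation_step(self, batch, batch_idx):
        loss = self.layer(batch).mean()
        # with sync_dist=True, a plain number would now be tensor-ified on self.device
        self.log('val_loss', loss, sync_dist=True)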
diff --git a/pytorch_lightning/metrics/classification/f_beta.py b/pytorch_lightning/metrics/classification/f_beta.py
index d6147b00463b35..fadfd000ebbe1e 100755
--- a/pytorch_lightning/metrics/classification/f_beta.py
+++ b/pytorch_lightning/metrics/classification/f_beta.py
@@ -52,11 +52,11 @@ class FBeta(Metric):
Threshold value for binary or multi-label logits. default: 0.5
average:
- * `'micro'` computes metric globally
- * `'macro'` computes metric for each class and uniformly averages them
- * `'weighted'` computes metric for each class and does a weighted-average,
- where each class is weighted by their support (accounts for class imbalance)
- * `None` computes and returns the metric per class
+ - ``'micro'`` computes metric globally
+ - ``'macro'`` computes metric for each class and uniformly averages them
+ - ``'weighted'`` computes metric for each class and does a weighted-average,
+ where each class is weighted by their support (accounts for class imbalance)
+ - ``'none'`` computes and returns the metric per class
multilabel: If predictions are from multilabel classification.
compute_on_step:
@@ -185,11 +185,11 @@ class F1(FBeta):
Threshold value for binary or multi-label logits. default: 0.5
average:
- * `'micro'` computes metric globally
- * `'macro'` computes metric for each class and uniformly averages them
- * `'weighted'` computes metric for each class and does a weighted-average,
- where each class is weighted by their support (accounts for class imbalance)
- * `None` computes and returns the metric per class
+ - ``'micro'`` computes metric globally
+ - ``'macro'`` computes metric for each class and uniformly averages them
+ - ``'weighted'`` computes metric for each class and does a weighted-average,
+ where each class is weighted by their support (accounts for class imbalance)
+ - ``'none'`` computes and returns the metric per class
multilabel: If predictions are from multilabel classification.
compute_on_step:
@@ -212,7 +212,6 @@ class F1(FBeta):
def __init__(
self,
num_classes: int = 1,
- beta: float = 1.0,
threshold: float = 0.5,
average: str = "micro",
multilabel: bool = False,
diff --git a/pytorch_lightning/metrics/functional/f_beta.py b/pytorch_lightning/metrics/functional/f_beta.py
index 3f0a7a04493257..2b0ba194d56f02 100755
--- a/pytorch_lightning/metrics/functional/f_beta.py
+++ b/pytorch_lightning/metrics/functional/f_beta.py
@@ -83,11 +83,11 @@ def fbeta(
Threshold value for binary or multi-label logits. default: 0.5
average:
- * `'micro'` computes metric globally
- * `'macro'` computes metric for each class and uniformly averages them
- * `'weighted'` computes metric for each class and does a weighted-average,
- where each class is weighted by their support (accounts for class imbalance)
- * `None` computes and returns the metric per class
+ - ``'micro'`` computes metric globally
+ - ``'macro'`` computes metric for each class and uniformly averages them
+ - ``'weighted'`` computes metric for each class and does a weighted-average,
+ where each class is weighted by their support (accounts for class imbalance)
+ - ``'none'`` computes and returns the metric per class
multilabel: If predictions are from multilabel classification.
@@ -110,7 +110,6 @@ def f1(
preds: torch.Tensor,
target: torch.Tensor,
num_classes: int,
- beta: float = 1.0,
threshold: float = 0.5,
average: str = "micro",
multilabel: bool = False
@@ -136,11 +135,11 @@ def f1(
Threshold value for binary or multi-label logits. default: 0.5
average:
- * `'micro'` computes metric globally
- * `'macro'` computes metric for each class and uniformly averages them
- * `'weighted'` computes metric for each class and does a weighted-average,
- where each class is weighted by their support (accounts for class imbalance)
- * `None` computes and returns the metric per class
+ - ``'micro'`` computes metric globally
+ - ``'macro'`` computes metric for each class and uniformly averages them
+ - ``'weighted'`` computes metric for each class and does a weighted-average,
+ where each class is weighted by their support (accounts for class imbalance)
+ - ``'none'`` computes and returns the metric per class
multilabel: If predictions are from multilabel classification.
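With the unused `beta` argument dropped from `f1`, the functional calls read as below; the tensors are toy values and the `average` choices follow the docstring above:

# Toy example of the functional metrics after removing `beta` from `f1`.
import torch
from pytorch_lightning.metrics.functional.f_beta import f1, fbeta

preds = torch.tensor([0, 1, 2, 2])
target = torch.tensor([0, 1, 1, 2])

f1_macro = f1(preds, target, num_classes=3, average='macro')
f2_micro = fbeta(preds, target, num_classes=3, beta=2.0, average='micro')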
diff --git a/pytorch_lightning/setup_tools.py b/pytorch_lightning/setup_tools.py
index 3842bbe50cfc5d..29ac3b814b3c2e 100644
--- a/pytorch_lightning/setup_tools.py
+++ b/pytorch_lightning/setup_tools.py
@@ -14,12 +14,12 @@
# limitations under the License.
import os
import re
-import warnings
from typing import Iterable, List
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
+import warnings
-from pytorch_lightning import PROJECT_ROOT, __homepage__, __version__
+from pytorch_lightning import __homepage__, __version__, PROJECT_ROOT
_PATH_BADGES = os.path.join('.', 'docs', 'source', '_images', 'badges')
# badge to download
diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
index 2311cc767de2d4..429bddd88b77e9 100644
--- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py
+++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -13,14 +13,16 @@
# limitations under the License.
import os
+from pathlib import Path
import re
+from typing import Optional, Union
import torch
import pytorch_lightning
from pytorch_lightning import _logger as log
from pytorch_lightning.core.lightning import LightningModule
-from pytorch_lightning.utilities import APEX_AVAILABLE, AMPType, OMEGACONF_AVAILABLE, rank_zero_warn
+from pytorch_lightning.utilities import APEX_AVAILABLE, AMPType, OMEGACONF_AVAILABLE, rank_zero_info, rank_zero_warn
from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS
@@ -52,16 +54,17 @@ def restore_weights(self, model: LightningModule):
if self.trainer.on_gpu:
torch.cuda.empty_cache()
- # if script called from hpc resubmit, load weights
- did_restore_hpc_weights = self.restore_hpc_weights_if_needed(model)
+ # 1. Attempt to restore states from HPC checkpoint
+ dir_path_hpc = str(self.trainer.weights_save_path)
+ max_suffix = self.max_ckpt_in_folder(dir_path_hpc, "hpc_ckpt_")
+ if max_suffix is not None:
+ checkpoint_path = f'{dir_path_hpc}/hpc_ckpt_{max_suffix}.ckpt'
+ self.hpc_load(checkpoint_path, self.trainer.on_gpu)
+ rank_zero_info(f'restored hpc model from: {checkpoint_path}')
- # clear cache after restore
- if self.trainer.on_gpu:
- torch.cuda.empty_cache()
-
- if not did_restore_hpc_weights:
- if self.trainer.resume_from_checkpoint is not None:
- self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu)
+ # 2. Attempt to restore states from `resume_from_checkpoint` file
+ elif self.trainer.resume_from_checkpoint is not None:
+ self.restore(self.trainer.resume_from_checkpoint, on_gpu=self.trainer.on_gpu)
# wait for all to catch up
self.trainer.accelerator_backend.barrier('TrainerIOMixin.restore_weights')
@@ -72,24 +75,14 @@ def restore_weights(self, model: LightningModule):
def restore(self, checkpoint_path: str, on_gpu: bool):
"""
- Load model/training states from the checkpoint file through file-read and state-restore.
- Also restores all training state like:
- - epoch
- - callbacks
- - schedulers
- - optimizer
- In detail, check return value description of `dump_checkpoint`
+ Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore.
+ All restored states are listed in return value description of `dump_checkpoint`.
"""
- # if on_gpu:
- # checkpoint = torch.load(checkpoint_path)
- # else:
- # load on CPU first
- # read a checkpoint dictionary object from the checkpoint file at `checkpoint_path`
+ # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
- # restore states from the checkpoint dictionary object
- # load model state
+ # acquire the model
model = self.trainer.get_model()
# restore model and datamodule state
@@ -106,14 +99,14 @@ def restore_model_state(self, model: LightningModule, checkpoint) -> None:
Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object
"""
- # give the datamodule a chance to load something
+ # restore datamodule states
if self.trainer.datamodule is not None:
self.trainer.datamodule.on_load_checkpoint(checkpoint)
- # give model a chance to restore something
+ # hook: give user access to checkpoint if needed.
model.on_load_checkpoint(checkpoint)
- # restore the state_dict on the model
+ # restore model state_dict
model.load_state_dict(checkpoint['state_dict'])
def restore_training_state(self, checkpoint):
@@ -187,23 +180,6 @@ def restore_training_state(self, checkpoint):
for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers):
scheduler['scheduler'].load_state_dict(lrs_state)
- def restore_hpc_weights_if_needed(self, model: LightningModule):
- """If there is a set of hpc weights, use as signal to restore model."""
- did_restore = False
-
- # look for hpc weights
- folderpath = str(self.trainer.weights_save_path)
- fs = get_filesystem(folderpath)
- if fs.exists(folderpath):
- files = [os.path.basename(f['name']) for f in fs.listdir(folderpath)]
- hpc_weight_paths = [x for x in files if 'hpc_ckpt' in x]
-
- # if hpc weights exist restore model
- if len(hpc_weight_paths) > 0:
- self.hpc_load(folderpath, self.trainer.on_gpu)
- did_restore = True
- return did_restore
-
# ----------------------------------
# PRIVATE OPS
# ----------------------------------
@@ -216,7 +192,8 @@ def hpc_save(self, folderpath: str, logger):
# save logger to make sure we get all the metrics
logger.save()
- ckpt_number = self.max_ckpt_in_folder(folderpath) + 1
+ max_suffix = self.max_ckpt_in_folder(folderpath)
+ ckpt_number = (max_suffix if max_suffix is not None else 0) + 1
fs.makedirs(folderpath, exist_ok=True)
filepath = os.path.join(folderpath, f'hpc_ckpt_{ckpt_number}.ckpt')
@@ -333,36 +310,52 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
return checkpoint
- def hpc_load(self, folderpath, on_gpu):
- filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath))
+ def hpc_load(self, checkpoint_path: str, on_gpu: bool):
+ """
+ Load model/training states from a 'PyTorch-Lightning checkpoint' file for hpc.
+ All restored states are listed in return value description of `dump_checkpoint`.
+ """
- # load on CPU first
- checkpoint = pl_load(filepath, map_location=lambda storage, loc: storage)
+ # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
+ checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
- # load model state
+ # acquire the model
model = self.trainer.get_model()
- # restore states from 'PyTorch-Lightning checkpoint' dictionary object
+ # restore model and datamodule state
self.restore_model_state(model, checkpoint)
if self.trainer.root_gpu is not None:
model.cuda(self.trainer.root_gpu)
- # load training state (affects trainer only)
+ # restore training state
self.restore_training_state(checkpoint)
- # call model hook
+ # call hpc specific hook
model.on_hpc_load(checkpoint)
- log.info(f'restored hpc model from: {filepath}')
+ def max_ckpt_in_folder(self, dir_path: Union[str, Path], name_key: str = 'ckpt_') -> Optional[int]:
+ """List files in `dir_path` whose names contain `name_key` and return the maximum suffix number.
+
+ Args:
+ dir_path: path of a directory which may contain files whose names include `name_key`
+
+ Returns:
+ None if there is no matching file, otherwise the maximum suffix number
+ """
+
+ # check directory existence
+ fs = get_filesystem(dir_path)
+ if not fs.exists(dir_path):
+ return None
- def max_ckpt_in_folder(self, path, name_key='ckpt_'):
- fs = get_filesystem(path)
- files = [os.path.basename(f["name"]) for f in fs.listdir(path)]
+ # check corresponding file existence
+ files = [os.path.basename(f["name"]) for f in fs.listdir(dir_path)]
files = [x for x in files if name_key in x]
if len(files) == 0:
- return 0
+ return None
+ # extract suffix number
ckpt_vs = []
for name in files:
name = name.split(name_key)[-1]
@@ -371,6 +364,13 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'):
return max(ckpt_vs)
+ def get_max_ckpt_path_from_folder(self, folder_path: Union[str, Path]) -> str:
+ """Get path of maximum-epoch checkpoint in the folder."""
+
+ max_suffix = self.max_ckpt_in_folder(folder_path)
+ ckpt_number = max_suffix if max_suffix is not None else 0
+ return f'{folder_path}/hpc_ckpt_{ckpt_number}.ckpt'
+
def save_checkpoint(self, filepath, weights_only: bool = False):
"""Save model/training states as a checkpoint file through state-dump and file-write.
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
index 28025859814cc2..6d206f3dd929ed 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
@@ -91,11 +91,13 @@ def check_dataloader_idx(self, result: Result) -> bool:
random_key = list(result.keys())[-1]
return result["meta"][random_key]["dataloader_idx"] is not None
- def get_latest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict:
+ def get_latest_from_func_name(self, latest_result_opt, func_name: str, *args, **kwargs) -> Dict:
results = {}
- add_dataloader_idx = self.check_dataloader_idx(latest_result)
- func = getattr(latest_result, func_name)
- results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs))
+ for opt_idx in latest_result_opt:
+ latest_result = latest_result_opt[opt_idx]
+ add_dataloader_idx = self.check_dataloader_idx(latest_result)
+ func = getattr(latest_result, func_name)
+ results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs))
return results
def run_latest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]:
@@ -156,6 +158,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
assert isinstance(result, Result)
if dataloader_idx is None:
dataloader_idx = 0
+
if extra_info is None:
extra_info = {}
@@ -166,6 +169,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
if dataloader_idx not in self._internals:
self._internals[dataloader_idx] = {}
self._internals_reduced[dataloader_idx] = defaultdict(dict)
+ self._latest_ref[dataloader_idx] = {}
# extract infos
opt_idx = extra_info["opt_idx"]
@@ -173,7 +177,7 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
self._append_to_structure(self._internals[dataloader_idx], opt_idx, batch_idx, result)
- self._latest_ref[dataloader_idx] = result
+ self._latest_ref[dataloader_idx][opt_idx] = result
# [dataloader_idx] is a list
else:
@@ -181,7 +185,11 @@ def append(self, result, dataloader_idx: Optional[int] = None, extra_info: Optio
self._internals.setdefault(dataloader_idx, [])
self._internals[dataloader_idx].append(result)
- self._latest_ref[dataloader_idx] = result
+ if dataloader_idx not in self._latest_ref:
+ self._latest_ref[dataloader_idx] = {}
+ self._latest_ref[dataloader_idx][0] = {}
+
+ self._latest_ref[dataloader_idx][0] = result
def auto_reduce_results_on_epoch_end(self) -> None:
"""
@@ -206,13 +214,9 @@ def auto_reduce_results_on_epoch_end(self) -> None:
# TODO: How to start training in middle of epoch
opt_outputs = epoch_metrics[opt_idx]
- num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1
- assert num_batch_idx >= 0
- batch_indexes = self._internals[dl_idx][num_opt_idx].keys()
-
# reduce across time first
time_reduced_outputs = []
- for batch_idx in batch_indexes:
+ for batch_idx in opt_outputs.keys():
tbptt_outs = opt_outputs[batch_idx]
tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs)
if len(tbptt_outs) > 1:
diff --git a/pytorch_lightning/trainer/optimizers.py b/pytorch_lightning/trainer/optimizers.py
index 6f3ba80bd0734f..479d4017202611 100644
--- a/pytorch_lightning/trainer/optimizers.py
+++ b/pytorch_lightning/trainer/optimizers.py
@@ -94,6 +94,7 @@ def configure_schedulers(self, schedulers: list, monitor: Optional[str] = None):
lr_schedulers = []
default_config = {
'scheduler': None,
+ 'name': None, # no custom name
'interval': 'epoch', # after epoch is over
'frequency': 1, # every epoch/batch
'reduce_on_plateau': False, # most often not ReduceLROnPlateau scheduler
diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py
index 57747be0d51fb5..b2ba92846b241a 100644
--- a/pytorch_lightning/trainer/supporters.py
+++ b/pytorch_lightning/trainer/supporters.py
@@ -50,7 +50,7 @@ def __init__(self, window_length: int):
def reset(self) -> None:
"""Empty the accumulator."""
- self = TensorRunningAccum(self.window_length)
+ self.__init__(self.window_length)
def last(self):
"""Get the last added element."""
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 35da90625adefe..5a837956bc4ce2 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -133,7 +133,7 @@ def __init__(
distributed_backend: Optional[str] = None,
automatic_optimization: Optional[bool] = None,
move_metrics_to_cpu: bool = False,
- enable_pl_optimizer: bool = True,
+ enable_pl_optimizer: bool = False,
):
r"""
Customize every aspect of training via flags
diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py
index 9724f05247c009..c315c6633b6fb3 100644
--- a/pytorch_lightning/utilities/distributed.py
+++ b/pytorch_lightning/utilities/distributed.py
@@ -15,14 +15,14 @@
import os
import warnings
from functools import wraps
+from typing import Any, Optional, Union
import torch
+
from pytorch_lightning import _logger as log
-from typing import Union, Optional, Any
if torch.distributed.is_available():
- from torch.distributed import ReduceOp
- from torch.distributed import group
+ from torch.distributed import ReduceOp, group
else:
class ReduceOp:
SUM = None
@@ -145,15 +145,14 @@ def sync_ddp(
if group is None:
group = torch.distributed.group.WORLD
- if reduce_op is None:
- reduce_op = torch.distributed.ReduceOp.SUM
- elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"):
- reduce_op = torch.distributed.ReduceOp.SUM
+ op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM
+
+ if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"):
divide_by_world_size = True
# sync all processes before reduction
torch.distributed.barrier(group=group)
- torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False)
+ torch.distributed.all_reduce(result, op=op, group=group, async_op=False)
if divide_by_world_size:
result = result / torch.distributed.get_world_size(group)
diff --git a/tests/__init__.py b/tests/__init__.py
index 981d685430da99..1bb81c466e6eb7 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import os
import numpy as np
diff --git a/tests/base/deterministic_model.py b/tests/base/deterministic_model.py
index d8847d592e1de6..6f6b5f858ff175 100644
--- a/tests/base/deterministic_model.py
+++ b/tests/base/deterministic_model.py
@@ -15,7 +15,6 @@
from torch import nn
from torch.utils.data import Dataset, DataLoader
-from pytorch_lightning.core.step_result import TrainResult, EvalResult
from pytorch_lightning.core.lightning import LightningModule
@@ -111,235 +110,6 @@ def training_epoch_end_scalar(self, outputs):
assert batch_out.grad_fn is None
assert isinstance(batch_out, torch.Tensor)
- def training_step_no_default_callbacks_for_train_loop(self, batch, batch_idx):
- """
- Early stop and checkpoint only on these values
- """
- acc = self.step(batch, batch_idx)
- result = TrainResult(minimize=acc)
- assert 'early_step_on' not in result
- assert 'checkpoint_on' in result
- return result
-
- def training_step_no_callbacks_result_obj(self, batch, batch_idx):
- """
- Early stop and checkpoint only on these values
- """
- acc = self.step(batch, batch_idx)
- result = TrainResult(minimize=acc, checkpoint_on=False)
- assert 'early_step_on' not in result
- assert 'checkpoint_on' not in result
- return result
-
- def training_step_result_log_epoch_and_step_for_callbacks(self, batch, batch_idx):
- """
- Early stop and checkpoint only on these values
- """
- acc = self.step(batch, batch_idx)
-
- self.assert_backward = False
- losses = [20, 19, 18, 10, 15, 14, 9, 11, 11, 20]
- idx = self.current_epoch
- loss = acc + losses[idx]
- result = TrainResult(minimize=loss, early_stop_on=loss, checkpoint_on=loss)
- return result
-
- def training_step_result_log_step_only(self, batch, batch_idx):
- acc = self.step(batch, batch_idx)
- result = TrainResult(minimize=acc)
-
- # step only metrics
- result.log(f'step_log_and_pbar_acc1_b{batch_idx}', torch.tensor(11).type_as(acc), prog_bar=True)
- result.log(f'step_log_acc2_b{batch_idx}', torch.tensor(12).type_as(acc))
- result.log(f'step_pbar_acc3_b{batch_idx}', torch.tensor(13).type_as(acc), logger=False, prog_bar=True)
-
- self.training_step_called = True
- return result
-
- def training_step_result_log_epoch_only(self, batch, batch_idx):
- acc = self.step(batch, batch_idx)
- result = TrainResult(minimize=acc)
-
- result.log(f'epoch_log_and_pbar_acc1_e{self.current_epoch}', torch.tensor(14).type_as(acc),
- on_epoch=True, prog_bar=True, on_step=False)
- result.log(f'epoch_log_acc2_e{self.current_epoch}', torch.tensor(15).type_as(acc),
- on_epoch=True, on_step=False)
- result.log(f'epoch_pbar_acc3_e{self.current_epoch}', torch.tensor(16).type_as(acc),
- on_epoch=True, logger=False, prog_bar=True, on_step=False)
-
- self.training_step_called = True
- return result
-
- def training_step_result_log_epoch_and_step(self, batch, batch_idx):
- acc = self.step(batch, batch_idx)
- result = TrainResult(minimize=acc)
-
- val_1 = (5 + batch_idx) * (self.current_epoch + 1)
- val_2 = (6 + batch_idx) * (self.current_epoch + 1)
- val_3 = (7 + batch_idx) * (self.current_epoch + 1)
- result.log('step_epoch_log_and_pbar_acc1', torch.tensor(val_1).type_as(acc),
- on_epoch=True, prog_bar=True)
- result.log('step_epoch_log_acc2', torch.tensor(val_2).type_as(acc),
- on_epoch=True)
- result.log('step_epoch_pbar_acc3', torch.tensor(val_3).type_as(acc),
- on_epoch=True, logger=False, prog_bar=True)
-
- self.training_step_called = True
- return result
-
- def training_epoch_end_return_for_log_epoch_and_step(self, result):
- """
- There should be an array of scalars without graphs that are all 171 (4 of them)
- """
- self.training_epoch_end_called = True
-
- if self.use_dp or self.use_ddp2:
- pass
- else:
- # only saw 4 batches
- assert isinstance(result, TrainResult)
-
- result.step_epoch_log_acc2 = result.step_epoch_log_acc2_step.prod()
- result.step_epoch_pbar_acc3 = result.step_epoch_pbar_acc3_step.prod()
- result.step_epoch_log_and_pbar_acc1 = result.step_epoch_log_and_pbar_acc1_step.prod()
- result.minimize = result.minimize.mean()
- result.checkpoint_on = result.checkpoint_on.mean()
-
- result.step_epoch_log_and_pbar_acc1_step = result.step_epoch_log_and_pbar_acc1_step.prod()
- result.step_epoch_log_and_pbar_acc1_epoch = result.step_epoch_log_and_pbar_acc1_epoch.prod()
- result.step_epoch_log_acc2_step = result.step_epoch_log_acc2_step.prod()
- result.step_epoch_log_acc2_epoch = result.step_epoch_log_acc2_epoch.prod()
- result.step_epoch_pbar_acc3_step = result.step_epoch_pbar_acc3_step.prod()
- result.step_epoch_pbar_acc3_epoch = result.step_epoch_pbar_acc3_epoch.prod()
- result.log('epoch_end_log_acc', torch.tensor(1212).type_as(result.step_epoch_log_acc2_epoch),
- logger=True, on_epoch=True)
- result.log('epoch_end_pbar_acc', torch.tensor(1213).type_as(result.step_epoch_log_acc2_epoch),
- logger=False, prog_bar=True, on_epoch=True)
- result.log('epoch_end_log_pbar_acc', torch.tensor(1214).type_as(result.step_epoch_log_acc2_epoch),
- logger=True, prog_bar=True, on_epoch=True)
- return result
-
- # --------------------------
- # EvalResults
- # --------------------------
- def validation_step_result_callbacks(self, batch, batch_idx):
- acc = self.step(batch, batch_idx)
-
- self.assert_backward = False
- losses = [20, 19, 20, 21, 22, 23]
- idx = self.current_epoch
- loss = acc + losses[idx]
- result = EvalResult(early_stop_on=loss, checkpoint_on=loss)
-
- self.validation_step_called = True
- return result
-
- def validation_step_result_no_callbacks(self, batch, batch_idx):
- acc = self.step(batch, batch_idx)
-
- self.assert_backward = False
- losses = [20, 19, 20, 21, 22, 23, 50, 50, 50, 50, 50, 50]
- idx = self.current_epoch
- loss = acc + losses[idx]
-
- result = EvalResult(checkpoint_on=loss)
-
- self.validation_step_called = True
- return result
-
- def validation_step_result_only_epoch_metrics(self, batch, batch_idx):
- """
- Only track epoch level metrics
- """
- acc = self.step(batch, batch_idx)
- result = EvalResult(checkpoint_on=acc, early_stop_on=acc)
-
- # step only metrics
- result.log('no_val_no_pbar', torch.tensor(11 + batch_idx).type_as(acc), prog_bar=False, logger=False)
- result.log('val_step_log_acc', torch.tensor(11 + batch_idx).type_as(acc), prog_bar=False, logger=True)
- result.log('val_step_log_pbar_acc', torch.tensor(12 + batch_idx).type_as(acc), prog_bar=True, logger=True)
- result.log('val_step_pbar_acc', torch.tensor(13 + batch_idx).type_as(acc), prog_bar=True, logger=False)
-
- self.validation_step_called = True
- return result
-
- def validation_step_result_only_step_metrics(self, batch, batch_idx):
- """
- Only track epoch level metrics
- """
- acc = self.step(batch, batch_idx)
- result = EvalResult(checkpoint_on=acc, early_stop_on=acc)
-
- # step only metrics
- result.log('no_val_no_pbar', torch.tensor(11 + batch_idx).type_as(acc),
- prog_bar=False, logger=False, on_epoch=False, on_step=True)
- result.log('val_step_log_acc', torch.tensor(11 + batch_idx).type_as(acc),
- prog_bar=False, logger=True, on_epoch=False, on_step=True)
- result.log('val_step_log_pbar_acc', torch.tensor(12 + batch_idx).type_as(acc),
- prog_bar=True, logger=True, on_epoch=False, on_step=True)
- result.log('val_step_pbar_acc', torch.tensor(13 + batch_idx).type_as(acc),
- prog_bar=True, logger=False, on_epoch=False, on_step=True)
- result.log('val_step_batch_idx', torch.tensor(batch_idx).type_as(acc),
- prog_bar=True, logger=True, on_epoch=False, on_step=True)
-
- self.validation_step_called = True
- return result
-
- def validation_step_result_epoch_step_metrics(self, batch, batch_idx):
- """
- Only track epoch level metrics
- """
- acc = self.step(batch, batch_idx)
- result = EvalResult(checkpoint_on=acc, early_stop_on=acc)
-
- # step only metrics
- result.log('no_val_no_pbar', torch.tensor(11 + batch_idx).type_as(acc),
- prog_bar=False, logger=False, on_epoch=True, on_step=True)
- result.log('val_step_log_acc', torch.tensor(11 + batch_idx).type_as(acc),
- prog_bar=False, logger=True, on_epoch=True, on_step=True)
- result.log('val_step_log_pbar_acc', torch.tensor(12 + batch_idx).type_as(acc),
- prog_bar=True, logger=True, on_epoch=True, on_step=True)
- result.log('val_step_pbar_acc', torch.tensor(13 + batch_idx).type_as(acc),
- prog_bar=True, logger=False, on_epoch=True, on_step=True)
- result.log('val_step_batch_idx', torch.tensor(batch_idx).type_as(acc),
- prog_bar=True, logger=True, on_epoch=True, on_step=True)
-
- self.validation_step_called = True
- return result
-
- def validation_step_for_epoch_end_result(self, batch, batch_idx):
- """
- EvalResult flows to epoch end (without step_end)
- """
- acc = self.step(batch, batch_idx)
- result = EvalResult(checkpoint_on=acc, early_stop_on=acc)
-
- # step only metrics
- result.log('val_step_metric', torch.tensor(batch_idx).type_as(acc),
- prog_bar=True, logger=True, on_epoch=True, on_step=False)
- result.log('batch_idx', torch.tensor(batch_idx).type_as(acc),
- prog_bar=True, logger=True, on_epoch=True, on_step=False)
-
- self.validation_step_called = True
- return result
-
- def validation_epoch_end_result(self, result):
- self.validation_epoch_end_called = True
-
- if self.trainer.running_sanity_check:
- assert len(result.batch_idx) == 2
- else:
- assert len(result.batch_idx) == self.trainer.limit_val_batches
-
- expected_val = result.val_step_metric.sum() / len(result.batch_idx)
- result.val_step_metric = result.val_step_metric.mean()
- result.batch_idx = result.batch_idx.mean()
- assert result.val_step_metric == expected_val
-
- result.log('val_epoch_end_metric', torch.tensor(189).type_as(result.val_step_metric), prog_bar=True)
-
- return result
-
# --------------------------
# dictionary returns
# --------------------------
diff --git a/tests/base/develop_pipelines.py b/tests/base/develop_pipelines.py
index 18bb0c4d72715b..24535dc67da8e8 100644
--- a/tests/base/develop_pipelines.py
+++ b/tests/base/develop_pipelines.py
@@ -86,9 +86,11 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
trainer.optimizers, trainer.lr_schedulers, trainer.optimizer_frequencies = \
trainer.init_optimizers(pretrained_model)
- # test HPC loading / saving
+ # test HPC saving
trainer.checkpoint_connector.hpc_save(save_dir, logger)
- trainer.checkpoint_connector.hpc_load(save_dir, on_gpu=on_gpu)
+ # test HPC loading
+ checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(save_dir)
+ trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=on_gpu)
def run_prediction(dataloader, trained_model, dp=False, min_acc=0.50):
diff --git a/tests/base/model_test_steps.py b/tests/base/model_test_steps.py
index 0010dcdf14a197..440ec4c4c35b47 100644
--- a/tests/base/model_test_steps.py
+++ b/tests/base/model_test_steps.py
@@ -59,38 +59,6 @@ def test_step(self, batch, batch_idx, *args, **kwargs):
'test_dic': {'test_loss_a': loss_test}})
return output
- def test_step_result_obj(self, batch, batch_idx, *args, **kwargs):
- """
- Default, baseline test_step
- :param batch:
- :return:
- """
- x, y = batch
- x = x.view(x.size(0), -1)
- y_hat = self(x)
-
- loss_test = self.loss(y, y_hat)
-
- # acc
- labels_hat = torch.argmax(y_hat, dim=1)
- test_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
- test_acc = torch.tensor(test_acc)
-
- test_acc = test_acc.type_as(x)
-
- result = EvalResult()
- # alternate possible outputs to test
- if batch_idx % 1 == 0:
- result.log_dict({'test_loss': loss_test, 'test_acc': test_acc})
- return result
- if batch_idx % 2 == 0:
- return test_acc
-
- if batch_idx % 3 == 0:
- result.log_dict({'test_loss': loss_test, 'test_acc': test_acc})
- result.test_dic = {'test_loss_a': loss_test}
- return result
-
def test_step__multiple_dataloaders(self, batch, batch_idx, dataloader_idx, **kwargs):
"""
Default, baseline test_step
diff --git a/tests/base/model_train_steps.py b/tests/base/model_train_steps.py
index caec6db9aaa10a..0590f5b7b5cccf 100644
--- a/tests/base/model_train_steps.py
+++ b/tests/base/model_train_steps.py
@@ -53,25 +53,6 @@ def training_step(self, batch, batch_idx, optimizer_idx=None):
)
return output
- def training_step_result_obj(self, batch, batch_idx, optimizer_idx=None):
- # forward pass
- x, y = batch
- x = x.view(x.size(0), -1)
- y_hat = self(x)
-
- # calculate loss
- loss_val = self.loss(y, y_hat)
- log_val = loss_val
-
- # alternate between tensors and scalars for "log" and "progress_bar"
- if batch_idx % 2 == 0:
- log_val = log_val.item()
-
- result = TrainResult(loss_val)
- result.log('some_val', log_val * log_val, prog_bar=True, logger=False)
- result.log('train_some_val', log_val * log_val)
- return result
-
def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None):
output = self.training_step(batch, batch_idx, optimizer_idx)
if batch_idx == self.test_step_inf_loss:
@@ -81,19 +62,6 @@ def training_step__inf_loss(self, batch, batch_idx, optimizer_idx=None):
output /= 0
return output
- def training_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
- """
- Full loop flow train step (result obj + dp)
- """
- x, y = batch
- x = x.view(x.size(0), -1)
- y_hat = self(x.to(self.device))
- loss_val = y_hat.sum()
- result = TrainResult(minimize=loss_val)
- result.log('train_step_metric', loss_val + 1)
- self.training_step_called = True
- return result
-
def training_step_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
# forward pass
x, y = batch
@@ -136,23 +104,6 @@ def training_epoch_end_full_loop_result_obj_dp(self, result):
return result
- def eval_step_full_loop_result_obj_dp(self, batch, batch_idx, optimizer_idx=None):
- """
- Full loop flow train step (result obj + dp)
- """
- x, y = batch
- x = x.view(x.size(0), -1)
- y_hat = self(x.to(self.device))
- loss_val = y_hat.sum()
- result = EvalResult(checkpoint_on=loss_val, early_stop_on=loss_val)
-
- eval_name = 'validation' if not self.trainer.testing else 'test'
- result.log(f'{eval_name}_step_metric', loss_val + 1, on_step=True)
-
- setattr(self, f'{eval_name}_step_called', True)
-
- return result
-
def eval_step_end_full_loop_result_obj_dp(self, result):
"""
Full loop flow train step (result obj + dp)
@@ -198,20 +149,3 @@ def eval_epoch_end_full_loop_result_obj_dp(self, result):
setattr(result, f'{eval_name}_step_metric', reduced)
return result
-
- def training_step__using_metrics(self, batch, batch_idx, optimizer_idx=None):
- """Lightning calls this inside the training loop"""
- # forward pass
- x, y = batch
- x = x.view(x.size(0), -1)
- y_hat = self(x)
-
- # calculate loss
- loss_val = self.loss(y, y_hat)
-
- # call metric
- val = self.metric(x, y)
-
- result = TrainResult(minimize=loss_val)
- result.log('metric_val', val)
- return result
diff --git a/tests/base/model_valid_steps.py b/tests/base/model_valid_steps.py
index e23e62dccdaba6..a008a6cecf1102 100644
--- a/tests/base/model_valid_steps.py
+++ b/tests/base/model_valid_steps.py
@@ -71,25 +71,6 @@ def validation_step_no_monitor(self, batch, batch_idx, *args, **kwargs):
})
return output
- def validation_step_result_obj(self, batch, batch_idx, *args, **kwargs):
- x, y = batch
- x = x.view(x.size(0), -1)
- y_hat = self(x)
-
- loss_val = self.loss(y, y_hat)
-
- # acc
- labels_hat = torch.argmax(y_hat, dim=1)
- val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0)
- val_acc = torch.tensor(val_acc).type_as(x)
-
- result = EvalResult(checkpoint_on=loss_val, early_stop_on=loss_val)
- result.log_dict({
- 'val_loss': loss_val,
- 'val_acc': val_acc,
- })
- return result
-
def validation_step_result_obj_dp(self, batch, batch_idx, *args, **kwargs):
x, y = batch
x = x.view(x.size(0), -1)
diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index c00c712bb3b13c..070bb4e9f6989b 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -33,6 +33,8 @@ def test_trainer_callback_system(torch_save):
limit_train_batches=3,
limit_test_batches=2,
progress_bar_refresh_rate=0,
+ # todo: enabled because we internally wrap the model for the optimizer step; this should be fixed
+ enable_pl_optimizer=True
)
# no call yet
diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py
index a6783435ed3e27..d29f254df67d0d 100644
--- a/tests/callbacks/test_lr_monitor.py
+++ b/tests/callbacks/test_lr_monitor.py
@@ -13,11 +13,11 @@
# limitations under the License.
import pytest
+import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests.base import EvalModelTemplate
-import tests.base.develop_utils as tutils
+from tests.base import BoringModel, EvalModelTemplate
def test_lr_monitor_single_lr(tmpdir):
@@ -43,7 +43,7 @@ def test_lr_monitor_single_lr(tmpdir):
'Momentum should not be logged by default'
assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \
'Number of learning rates logged does not match number of lr schedulers'
- assert all([k in ['lr-Adam'] for k in lr_monitor.lrs.keys()]), \
+ assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['lr-Adam'], \
'Names of learning rates not set correctly'
@@ -134,7 +134,7 @@ def test_lr_monitor_multi_lrs(tmpdir, logging_interval):
assert lr_monitor.lrs, 'No learning rates logged'
assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \
'Number of learning rates logged does not match number of lr schedulers'
- assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_monitor.lrs.keys()]), \
+ assert lr_monitor.lr_sch_names == ['lr-Adam', 'lr-Adam-1'], \
'Names of learning rates not set correctly'
if logging_interval == 'step':
@@ -167,5 +167,27 @@ def test_lr_monitor_param_groups(tmpdir):
assert lr_monitor.lrs, 'No learning rates logged'
assert len(lr_monitor.lrs) == 2 * len(trainer.lr_schedulers), \
'Number of learning rates logged does not match number of param groups'
- assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_monitor.lrs.keys()]), \
+ assert lr_monitor.lr_sch_names == ['lr-Adam']
+ assert list(lr_monitor.lrs.keys()) == ['lr-Adam/pg1', 'lr-Adam/pg2'], \
'Names of learning rates not set correctly'
+
+
+def test_lr_monitor_custom_name(tmpdir):
+ class TestModel(BoringModel):
+ def configure_optimizers(self):
+ optimizer, [scheduler] = super().configure_optimizers()
+ lr_scheduler = {'scheduler': scheduler, 'name': 'my_logging_name'}
+ return optimizer, [lr_scheduler]
+
+ lr_monitor = LearningRateMonitor()
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ max_epochs=2,
+ limit_val_batches=0.1,
+ limit_train_batches=0.5,
+ callbacks=[lr_monitor],
+ progress_bar_refresh_rate=0,
+ weights_summary=None,
+ )
+ trainer.fit(TestModel())
+ assert lr_monitor.lr_sch_names == list(lr_monitor.lrs.keys()) == ['my_logging_name']
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py
index 31154eac1bf0d6..106c34030051e7 100644
--- a/tests/checkpointing/test_model_checkpoint.py
+++ b/tests/checkpointing/test_model_checkpoint.py
@@ -12,15 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
-import os.path as osp
import pickle
import platform
import re
from argparse import Namespace
-from distutils.version import LooseVersion
from pathlib import Path
from unittest import mock
-from unittest.mock import MagicMock, Mock
+from unittest.mock import Mock
import cloudpickle
import pytest
@@ -641,20 +639,17 @@ def validation_epoch_end(self, outputs):
@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
def test_checkpoint_repeated_strategy(enable_pl_optimizer, tmpdir):
"""
- This test validates that the checkpoint can be called when provided to callacks list
+ This test validates that the checkpoint can be called when provided to callbacks list
"""
-
checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=tmpdir, filename="{epoch:02d}")
class ExtendedBoringModel(BoringModel):
-
def validation_step(self, batch, batch_idx):
output = self.layer(batch)
loss = self.loss(batch, output)
return {"val_loss": loss}
model = ExtendedBoringModel()
- model.validation_step_end = None
model.validation_epoch_end = None
trainer = Trainer(
max_epochs=1,
@@ -663,92 +658,30 @@ def validation_step(self, batch, batch_idx):
limit_test_batches=2,
callbacks=[checkpoint_callback],
enable_pl_optimizer=enable_pl_optimizer,
+ weights_summary=None,
+ progress_bar_refresh_rate=0,
)
-
trainer.fit(model)
assert os.listdir(tmpdir) == ['epoch=00.ckpt']
- def get_last_checkpoint():
- ckpts = os.listdir(tmpdir)
- ckpts_map = {int(x.split("=")[1].split('.')[0]): osp.join(tmpdir, x) for x in ckpts if "epoch" in x}
- num_ckpts = len(ckpts_map) - 1
- return ckpts_map[num_ckpts]
-
- for idx in range(1, 5):
+ for idx in range(4):
# load from checkpoint
- chk = get_last_checkpoint()
- model = BoringModel.load_from_checkpoint(chk)
- trainer = pl.Trainer(
- max_epochs=1,
- limit_train_batches=2,
- limit_val_batches=2,
- limit_test_batches=2,
- resume_from_checkpoint=chk,
- enable_pl_optimizer=enable_pl_optimizer)
- trainer.fit(model)
- trainer.test(model)
-
- assert str(os.listdir(tmpdir)) == "['epoch=00.ckpt']"
-
-
-@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
-@pytest.mark.parametrize("enable_pl_optimizer", [False, True])
-def test_checkpoint_repeated_strategy_tmpdir(enable_pl_optimizer, tmpdir):
- """
- This test validates that the checkpoint can be called when provided to callacks list
- """
-
- checkpoint_callback = ModelCheckpoint(monitor='val_loss', filepath=os.path.join(tmpdir, "{epoch:02d}"))
-
- class ExtendedBoringModel(BoringModel):
-
- def validation_step(self, batch, batch_idx):
- output = self.layer(batch)
- loss = self.loss(batch, output)
- return {"val_loss": loss}
-
- model = ExtendedBoringModel()
- model.validation_step_end = None
- model.validation_epoch_end = None
- trainer = Trainer(
- default_root_dir=tmpdir,
- max_epochs=1,
- limit_train_batches=2,
- limit_val_batches=2,
- limit_test_batches=2,
- callbacks=[checkpoint_callback],
- enable_pl_optimizer=enable_pl_optimizer,
- )
-
- trainer.fit(model)
- assert sorted(os.listdir(tmpdir)) == sorted(['epoch=00.ckpt', 'lightning_logs'])
- path_to_lightning_logs = osp.join(tmpdir, 'lightning_logs')
- assert sorted(os.listdir(path_to_lightning_logs)) == sorted(['version_0'])
-
- def get_last_checkpoint():
- ckpts = os.listdir(tmpdir)
- ckpts_map = {int(x.split("=")[1].split('.')[0]): osp.join(tmpdir, x) for x in ckpts if "epoch" in x}
- num_ckpts = len(ckpts_map) - 1
- return ckpts_map[num_ckpts]
-
- for idx in range(1, 5):
-
- # load from checkpoint
- chk = get_last_checkpoint()
- model = LogInTwoMethods.load_from_checkpoint(chk)
+ model = LogInTwoMethods.load_from_checkpoint(checkpoint_callback.best_model_path)
trainer = pl.Trainer(
default_root_dir=tmpdir,
max_epochs=1,
limit_train_batches=2,
limit_val_batches=2,
limit_test_batches=2,
- resume_from_checkpoint=chk,
- enable_pl_optimizer=enable_pl_optimizer)
-
+ resume_from_checkpoint=checkpoint_callback.best_model_path,
+ enable_pl_optimizer=enable_pl_optimizer,
+ weights_summary=None,
+ progress_bar_refresh_rate=0,
+ )
trainer.fit(model)
- trainer.test(model)
- assert sorted(os.listdir(tmpdir)) == sorted(['epoch=00.ckpt', 'lightning_logs'])
- assert sorted(os.listdir(path_to_lightning_logs)) == sorted([f'version_{i}' for i in range(idx + 1)])
+ trainer.test(model, verbose=False)
+ assert set(os.listdir(tmpdir)) == {'epoch=00.ckpt', 'lightning_logs'}
+ assert set(os.listdir(tmpdir.join("lightning_logs"))) == {f'version_{i}' for i in range(4)}
@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
@@ -760,21 +693,22 @@ def test_checkpoint_repeated_strategy_extended(enable_pl_optimizer, tmpdir):
"""
class ExtendedBoringModel(BoringModel):
-
def validation_step(self, batch, batch_idx):
output = self.layer(batch)
loss = self.loss(batch, output)
return {"val_loss": loss}
+ def validation_epoch_end(self, *_):
+ ...
+
def assert_trainer_init(trainer):
assert not trainer.checkpoint_connector.has_trained
assert trainer.global_step == 0
assert trainer.current_epoch == 0
def get_last_checkpoint(ckpt_dir):
- ckpts = os.listdir(ckpt_dir)
- ckpts.sort()
- return osp.join(ckpt_dir, ckpts[-1])
+ last = ckpt_dir.listdir(sort=True)[-1]
+ return str(last)
def assert_checkpoint_content(ckpt_dir):
chk = pl_load(get_last_checkpoint(ckpt_dir))
@@ -782,23 +716,15 @@ def assert_checkpoint_content(ckpt_dir):
assert chk["global_step"] == 4
def assert_checkpoint_log_dir(idx):
- lightning_logs_path = osp.join(tmpdir, 'lightning_logs')
- assert sorted(os.listdir(lightning_logs_path)) == [f'version_{i}' for i in range(idx + 1)]
- assert len(os.listdir(ckpt_dir)) == epochs
-
- def get_model():
- model = ExtendedBoringModel()
- model.validation_step_end = None
- model.validation_epoch_end = None
- return model
+ lightning_logs = tmpdir / 'lightning_logs'
+ actual = [d.basename for d in lightning_logs.listdir(sort=True)]
+ assert actual == [f'version_{i}' for i in range(idx + 1)]
+ assert len(ckpt_dir.listdir()) == epochs
- ckpt_dir = osp.join(tmpdir, 'checkpoints')
+ ckpt_dir = tmpdir / 'checkpoints'
checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1)
epochs = 2
limit_train_batches = 2
-
- model = get_model()
-
trainer_config = dict(
default_root_dir=tmpdir,
max_epochs=epochs,
@@ -806,40 +732,32 @@ def get_model():
limit_val_batches=3,
limit_test_batches=4,
enable_pl_optimizer=enable_pl_optimizer,
- )
-
- trainer = pl.Trainer(
- **trainer_config,
callbacks=[checkpoint_cb],
)
+ trainer = pl.Trainer(**trainer_config)
assert_trainer_init(trainer)
+ model = ExtendedBoringModel()
trainer.fit(model)
assert trainer.checkpoint_connector.has_trained
assert trainer.global_step == epochs * limit_train_batches
assert trainer.current_epoch == epochs - 1
assert_checkpoint_log_dir(0)
+ assert_checkpoint_content(ckpt_dir)
trainer.test(model)
assert trainer.current_epoch == epochs - 1
- assert_checkpoint_content(ckpt_dir)
-
for idx in range(1, 5):
chk = get_last_checkpoint(ckpt_dir)
assert_checkpoint_content(ckpt_dir)
- checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1)
- model = get_model()
-
# load from checkpoint
- trainer = pl.Trainer(
- **trainer_config,
- resume_from_checkpoint=chk,
- callbacks=[checkpoint_cb],
- )
+ trainer_config["callbacks"] = [ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1)]
+ trainer = pl.Trainer(**trainer_config, resume_from_checkpoint=chk)
assert_trainer_init(trainer)
+ model = ExtendedBoringModel()
trainer.test(model)
assert not trainer.checkpoint_connector.has_trained
assert trainer.global_step == epochs * limit_train_batches
@@ -1020,3 +938,42 @@ def __init__(self, hparams):
else:
# make sure it's not AttributeDict
assert type(ckpt[model.CHECKPOINT_HYPER_PARAMS_KEY]) == hparams_type
+
+
+@pytest.mark.parametrize('max_epochs', [3, 4])
+@pytest.mark.parametrize(
+ 'save_top_k, expected',
+ [
+ (1, ['curr_epoch.ckpt']),
+ (2, ['curr_epoch.ckpt', 'curr_epoch-v0.ckpt']),
+ ]
+)
+def test_model_checkpoint_file_already_exists(tmpdir, max_epochs, save_top_k, expected):
+ """
+ Test that a version suffix is added to the filename if a file with that name already exists in dirpath.
+ """
+ model_checkpoint = ModelCheckpoint(
+ dirpath=tmpdir,
+ filename='curr_epoch',
+ save_top_k=save_top_k,
+ monitor='epoch',
+ mode='max',
+ )
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ callbacks=[model_checkpoint],
+ max_epochs=max_epochs,
+ limit_train_batches=2,
+ limit_val_batches=2,
+ logger=None,
+ weights_summary=None,
+ progress_bar_refresh_rate=0,
+ )
+
+ model = BoringModel()
+ trainer.fit(model)
+ ckpt_files = os.listdir(tmpdir)
+ assert set(ckpt_files) == set(expected)
+
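+ # the checkpoint's stored 'epoch' appears to be offset by one from the trainer's zero-based epoch, hence the `- 1`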
+ epochs_in_ckpt_files = [pl_load(os.path.join(tmpdir, f))['epoch'] - 1 for f in ckpt_files]
+ assert sorted(epochs_in_ckpt_files) == list(range(max_epochs - save_top_k, max_epochs))
diff --git a/tests/collect_env_details.py b/tests/collect_env_details.py
index 1d443795d28767..2b8c4b3fafeed3 100644
--- a/tests/collect_env_details.py
+++ b/tests/collect_env_details.py
@@ -1,3 +1,16 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""Diagnose your system and show basic information
This serves mainly to get detailed info for better bug reporting.
diff --git a/tests/conftest.py b/tests/conftest.py
index ad4b7169456a89..c6a14a99b24789 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,21 @@
-import sys
-import threading
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from functools import partial, wraps
from http.server import SimpleHTTPRequestHandler
+import sys
+import threading
import pytest
import torch.multiprocessing as mp
diff --git a/tests/core/test_results.py b/tests/core/test_results.py
index f4486ce6ae4194..797004b7f21ffa 100644
--- a/tests/core/test_results.py
+++ b/tests/core/test_results.py
@@ -18,7 +18,7 @@
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
-from pytorch_lightning import Trainer, seed_everything
+from pytorch_lightning import Trainer
from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult
import tests.base.develop_utils as tutils
diff --git a/tests/deprecated_api/__init__.py b/tests/deprecated_api/__init__.py
new file mode 100644
index 00000000000000..99e21d1ed6b229
--- /dev/null
+++ b/tests/deprecated_api/__init__.py
@@ -0,0 +1,21 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test deprecated functionality which will be removed in vX.Y.Z"""
+import sys
+
+
+def _soft_unimport_module(str_module):
+ # once a module has been imported (e.g. during pytest collection) it stays cached in sys.modules
+ if str_module in sys.modules:
+ del sys.modules[str_module]
diff --git a/tests/deprecated_api/test_remove_1-2.py b/tests/deprecated_api/test_remove_1-2.py
new file mode 100644
index 00000000000000..331208d56df103
--- /dev/null
+++ b/tests/deprecated_api/test_remove_1-2.py
@@ -0,0 +1,45 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test deprecated functionality which will be removed in vX.Y.Z"""
+
+import pytest
+import torch
+
+from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+def test_tbd_remove_in_v1_2_0():
+ with pytest.deprecated_call(match='will be removed in v1.2'):
+ ModelCheckpoint(filepath='..')
+
+ with pytest.deprecated_call(match='will be removed in v1.2'):
+ ModelCheckpoint('..')
+
+ with pytest.raises(MisconfigurationException, match='inputs which are not feasible'):
+ ModelCheckpoint(filepath='..', dirpath='.')
+
+
+def test_tbd_remove_in_v1_2_0_metrics():
+ from pytorch_lightning.metrics.classification import Fbeta
+ from pytorch_lightning.metrics.functional.classification import f1_score, fbeta_score
+
+ with pytest.deprecated_call(match='will be removed in v1.2'):
+ Fbeta(2)
+
+ with pytest.deprecated_call(match='will be removed in v1.2'):
+ fbeta_score(torch.tensor([0, 1, 2, 3]), torch.tensor([0, 1, 2, 1]), 0.2)
+
+ with pytest.deprecated_call(match='will be removed in v1.2'):
+ f1_score(torch.tensor([0, 1, 0, 1]), torch.tensor([0, 1, 0, 0]))
diff --git a/tests/test_deprecated.py b/tests/deprecated_api/test_remove_1-3.py
similarity index 60%
rename from tests/test_deprecated.py
rename to tests/deprecated_api/test_remove_1-3.py
index 59c6728009b6f0..7ec69796b1e46e 100644
--- a/tests/test_deprecated.py
+++ b/tests/deprecated_api/test_remove_1-3.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test deprecated functionality which will be removed in vX.Y.Z"""
-import sys
from argparse import ArgumentParser
from unittest import mock
@@ -21,10 +20,8 @@
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
-from pytorch_lightning.metrics.functional.classification import auc
from pytorch_lightning.profiler.profilers import PassThroughProfiler, SimpleProfiler
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests.base import EvalModelTemplate
def test_tbd_remove_in_v1_3_0(tmpdir):
@@ -52,27 +49,27 @@ def __init__(self, hparams):
def test_tbd_remove_in_v1_3_0_metrics():
+ from pytorch_lightning.metrics.functional.classification import to_onehot
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import to_onehot
to_onehot(torch.tensor([1, 2, 3]))
+ from pytorch_lightning.metrics.functional.classification import to_categorical
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import to_categorical
to_categorical(torch.tensor([[0.2, 0.5], [0.9, 0.1]]))
+ from pytorch_lightning.metrics.functional.classification import get_num_classes
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import get_num_classes
get_num_classes(pred=torch.tensor([0, 1]), target=torch.tensor([1, 1]))
x_binary = torch.tensor([0, 1, 2, 3])
y_binary = torch.tensor([0, 1, 2, 3])
+ from pytorch_lightning.metrics.functional.classification import roc
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import roc
roc(pred=x_binary, target=y_binary)
+ from pytorch_lightning.metrics.functional.classification import _roc
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import _roc
_roc(pred=x_binary, target=y_binary)
x_multy = torch.tensor([[0.85, 0.05, 0.05, 0.05],
@@ -81,64 +78,40 @@ def test_tbd_remove_in_v1_3_0_metrics():
[0.05, 0.05, 0.05, 0.85]])
y_multy = torch.tensor([0, 1, 3, 2])
+ from pytorch_lightning.metrics.functional.classification import multiclass_roc
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import multiclass_roc
multiclass_roc(pred=x_multy, target=y_multy)
+ from pytorch_lightning.metrics.functional.classification import average_precision
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import average_precision
average_precision(pred=x_binary, target=y_binary)
+ from pytorch_lightning.metrics.functional.classification import precision_recall_curve
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import precision_recall_curve
precision_recall_curve(pred=x_binary, target=y_binary)
+ from pytorch_lightning.metrics.functional.classification import multiclass_precision_recall_curve
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.classification import multiclass_precision_recall_curve
multiclass_precision_recall_curve(pred=x_multy, target=y_multy)
+ from pytorch_lightning.metrics.functional.reduction import reduce
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.reduction import reduce
reduce(torch.tensor([0, 1, 1, 0]), 'sum')
+ from pytorch_lightning.metrics.functional.reduction import class_reduce
with pytest.deprecated_call(match='will be removed in v1.3'):
- from pytorch_lightning.metrics.functional.reduction import class_reduce
class_reduce(torch.randint(1, 10, (50,)).float(),
torch.randint(10, 20, (50,)).float(),
torch.randint(1, 100, (50,)).float())
-def test_tbd_remove_in_v1_2_0():
- with pytest.deprecated_call(match='will be removed in v1.2'):
- checkpoint_cb = ModelCheckpoint(filepath='.')
-
- with pytest.deprecated_call(match='will be removed in v1.2'):
- checkpoint_cb = ModelCheckpoint('.')
-
- with pytest.raises(MisconfigurationException, match='inputs which are not feasible'):
- checkpoint_cb = ModelCheckpoint(filepath='.', dirpath='.')
-
-
-def test_tbd_remove_in_v1_2_0_metrics():
- from pytorch_lightning.metrics.classification import Fbeta
- from pytorch_lightning.metrics.functional.classification import f1_score, fbeta_score
-
- with pytest.deprecated_call(match='will be removed in v1.2'):
- Fbeta(2)
-
- with pytest.deprecated_call(match='will be removed in v1.2'):
- fbeta_score(torch.tensor([0, 1, 2, 3]), torch.tensor([0, 1, 2, 1]), 0.2)
-
- with pytest.deprecated_call(match='will be removed in v1.2'):
- f1_score(torch.tensor([0, 1, 0, 1]), torch.tensor([0, 1, 0, 0]))
-
-
# TODO: remove bool from Trainer.profiler param in v1.3.0, update profiler_connector.py
@pytest.mark.parametrize(['profiler', 'expected'], [
(True, SimpleProfiler),
(False, PassThroughProfiler),
])
def test_trainer_profiler_remove_in_v1_3_0(profiler, expected):
+ # remove bool from Trainer.profiler param in v1.3.0, update profiler_connector.py
with pytest.deprecated_call(match='will be removed in v1.3'):
trainer = Trainer(profiler=profiler)
assert isinstance(trainer.profiler, expected)
@@ -162,47 +135,3 @@ def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, expected_parsed_arg, ex
assert getattr(args, "profiler") == expected_parsed_arg
trainer = Trainer.from_argparse_args(args)
assert isinstance(trainer.profiler, expected_profiler)
-
-
-def _soft_unimport_module(str_module):
- # once the module is imported e.g with parsing with pytest it lives in memory
- if str_module in sys.modules:
- del sys.modules[str_module]
-
-
-class ModelVer0_6(EvalModelTemplate):
-
- # todo: this shall not be needed while evaluate asks for dataloader explicitly
- def val_dataloader(self):
- return self.dataloader(train=False)
-
- def validation_step(self, batch, batch_idx, *args, **kwargs):
- return {'val_loss': torch.tensor(0.6)}
-
- def validation_end(self, outputs):
- return {'val_loss': torch.tensor(0.6)}
-
- def test_dataloader(self):
- return self.dataloader(train=False)
-
- def test_end(self, outputs):
- return {'test_loss': torch.tensor(0.6)}
-
-
-class ModelVer0_7(EvalModelTemplate):
-
- # todo: this shall not be needed while evaluate asks for dataloader explicitly
- def val_dataloader(self):
- return self.dataloader(train=False)
-
- def validation_step(self, batch, batch_idx, *args, **kwargs):
- return {'val_loss': torch.tensor(0.7)}
-
- def validation_end(self, outputs):
- return {'val_loss': torch.tensor(0.7)}
-
- def test_dataloader(self):
- return self.dataloader(train=False)
-
- def test_end(self, outputs):
- return {'test_loss': torch.tensor(0.7)}
diff --git a/tests/metrics/regression/test_ssim.py b/tests/metrics/regression/test_ssim.py
index f581188e89fce5..8bb304850e3f22 100644
--- a/tests/metrics/regression/test_ssim.py
+++ b/tests/metrics/regression/test_ssim.py
@@ -53,9 +53,7 @@ def _sk_metric(preds, target, data_range, multichannel):
class TestSSIM(MetricTester):
atol = 6e-5
- # TODO: for some reason this test hangs with ddp=True
- # @pytest.mark.parametrize("ddp", [True, False])
- @pytest.mark.parametrize("ddp", [False])
+ @pytest.mark.parametrize("ddp", [True, False])
@pytest.mark.parametrize("dist_sync_on_step", [True, False])
def test_ssim(self, preds, target, multichannel, ddp, dist_sync_on_step):
self.run_class_metric_test(
diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py
index c607a466b20683..4bd6608ce3fcf4 100644
--- a/tests/metrics/utils.py
+++ b/tests/metrics/utils.py
@@ -11,6 +11,11 @@
from pytorch_lightning.metrics import Metric
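+# use the "spawn" start method for the worker pool at import time;
+# set_start_method raises RuntimeError if a start method was already set, in which case we keep it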
+try:
+ set_start_method("spawn")
+except RuntimeError:
+ pass
+
NUM_PROCESSES = 2
NUM_BATCHES = 10
BATCH_SIZE = 32
@@ -165,10 +170,7 @@ def setup_class(self):
"""Setup the metric class. This will spawn the pool of workers that are
used for metric testing and setup_ddp
"""
- try:
- set_start_method("spawn")
- except RuntimeError:
- pass
+
self.poolSize = NUM_PROCESSES
self.pool = Pool(processes=self.poolSize)
self.pool.starmap(setup_ddp, [(rank, self.poolSize) for rank in range(self.poolSize)])
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index f10753491d447c..f41bf59bb4f4c2 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -74,9 +74,11 @@ def run_test_from_config(trainer_options):
for dataloader in test_loaders:
run_prediction(dataloader, pretrained_model)
- # test HPC loading / saving
+ # test HPC saving
trainer.checkpoint_connector.hpc_save(ckpt_path, trainer.logger)
- trainer.checkpoint_connector.hpc_load(ckpt_path, on_gpu=args.on_gpu)
+ # test HPC loading
+ checkpoint_path = trainer.checkpoint_connector.get_max_ckpt_path_from_folder(ckpt_path)
+ trainer.checkpoint_connector.hpc_load(checkpoint_path, on_gpu=args.on_gpu)
if args.on_gpu:
trainer = Trainer(gpus=1, accelerator='horovod', max_epochs=1)
diff --git a/tests/models/test_onnx.py b/tests/models/test_onnx.py
index a3919a6a8a7ddd..82727d37479b68 100644
--- a/tests/models/test_onnx.py
+++ b/tests/models/test_onnx.py
@@ -21,44 +21,44 @@
import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
-from tests.base import EvalModelTemplate
+from tests.base import BoringModel, EvalModelTemplate
def test_model_saves_with_input_sample(tmpdir):
"""Test that ONNX model saves with input sample and size is greater than 3 MB"""
- model = EvalModelTemplate()
+ model = BoringModel()
trainer = Trainer(max_epochs=1)
trainer.fit(model)
file_path = os.path.join(tmpdir, "model.onnx")
- input_sample = torch.randn((1, 28 * 28))
+ input_sample = torch.randn((1, 32))
model.to_onnx(file_path, input_sample)
assert os.path.isfile(file_path)
- assert os.path.getsize(file_path) > 3e+06
+ assert os.path.getsize(file_path) > 4e2
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
def test_model_saves_on_gpu(tmpdir):
"""Test that model saves on gpu"""
- model = EvalModelTemplate()
+ model = BoringModel()
trainer = Trainer(gpus=1, max_epochs=1)
trainer.fit(model)
file_path = os.path.join(tmpdir, "model.onnx")
- input_sample = torch.randn((1, 28 * 28))
+ input_sample = torch.randn((1, 32))
model.to_onnx(file_path, input_sample)
assert os.path.isfile(file_path)
- assert os.path.getsize(file_path) > 3e+06
+ assert os.path.getsize(file_path) > 4e2
def test_model_saves_with_example_output(tmpdir):
"""Test that ONNX model saves when provided with example output"""
- model = EvalModelTemplate()
+ model = BoringModel()
trainer = Trainer(max_epochs=1)
trainer.fit(model)
file_path = os.path.join(tmpdir, "model.onnx")
- input_sample = torch.randn((1, 28 * 28))
+ input_sample = torch.randn((1, 32))
model.eval()
example_outputs = model.forward(input_sample)
model.to_onnx(file_path, input_sample, example_outputs=example_outputs)
@@ -67,11 +67,13 @@ def test_model_saves_with_example_output(tmpdir):
def test_model_saves_with_example_input_array(tmpdir):
"""Test that ONNX model saves with_example_input_array and size is greater than 3 MB"""
- model = EvalModelTemplate()
+ model = BoringModel()
+ model.example_input_array = torch.randn(5, 32)
+
file_path = os.path.join(tmpdir, "model.onnx")
model.to_onnx(file_path)
assert os.path.exists(file_path) is True
- assert os.path.getsize(file_path) > 3e+06
+ assert os.path.getsize(file_path) > 4e2
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@@ -100,7 +102,9 @@ def test_model_saves_on_multi_gpu(tmpdir):
def test_verbose_param(tmpdir, capsys):
"""Test that output is present when verbose parameter is set"""
- model = EvalModelTemplate()
+ model = BoringModel()
+ model.example_input_array = torch.randn(5, 32)
+
file_path = os.path.join(tmpdir, "model.onnx")
model.to_onnx(file_path, verbose=True)
captured = capsys.readouterr()
@@ -108,8 +112,8 @@ def test_verbose_param(tmpdir, capsys):
def test_error_if_no_input(tmpdir):
- """Test that an exception is thrown when there is no input tensor"""
- model = EvalModelTemplate()
+ """Test that an error is thrown when there is no input tensor"""
+ model = BoringModel()
model.example_input_array = None
file_path = os.path.join(tmpdir, "model.onnx")
with pytest.raises(ValueError, match=r'Could not export to ONNX since neither `input_sample` nor'
@@ -117,21 +121,12 @@ def test_error_if_no_input(tmpdir):
model.to_onnx(file_path)
-def test_error_if_input_sample_is_not_tensor(tmpdir):
- """Test that an exception is thrown when there is no input tensor"""
- model = EvalModelTemplate()
- model.example_input_array = None
- file_path = os.path.join(tmpdir, "model.onnx")
- input_sample = np.random.randn(1, 28 * 28)
- with pytest.raises(ValueError, match=f'Received `input_sample` of type {type(input_sample)}. Expected type is '
- f'`Tensor`'):
- model.to_onnx(file_path, input_sample)
-
-
def test_if_inference_output_is_valid(tmpdir):
"""Test that the output inferred from ONNX model is same as from PyTorch"""
- model = EvalModelTemplate()
- trainer = Trainer(max_epochs=5)
+ model = BoringModel()
+ model.example_input_array = torch.randn(5, 32)
+
+ trainer = Trainer(max_epochs=2)
trainer.fit(model)
model.eval()
diff --git a/tests/models/test_torchscript.py b/tests/models/test_torchscript.py
index bf2c34b8bfef5b..3c43b201f52e4c 100644
--- a/tests/models/test_torchscript.py
+++ b/tests/models/test_torchscript.py
@@ -16,43 +16,72 @@
import pytest
import torch
-from tests.base import EvalModelTemplate
+from tests.base import BoringModel
from tests.base.datamodules import TrialMNISTDataModule
from tests.base.models import ParityModuleRNN, BasicGAN
@pytest.mark.parametrize("modelclass", [
- EvalModelTemplate,
+ BoringModel,
ParityModuleRNN,
BasicGAN,
])
def test_torchscript_input_output(modelclass):
""" Test that scripted LightningModule forward works. """
model = modelclass()
+
+ if isinstance(model, BoringModel):
+ model.example_input_array = torch.randn(5, 32)
+
script = model.to_torchscript()
assert isinstance(script, torch.jit.ScriptModule)
+
model.eval()
- model_output = model(model.example_input_array)
+ with torch.no_grad():
+ model_output = model(model.example_input_array)
+
script_output = script(model.example_input_array)
assert torch.allclose(script_output, model_output)
@pytest.mark.parametrize("modelclass", [
- EvalModelTemplate,
+ BoringModel,
ParityModuleRNN,
BasicGAN,
])
-def test_torchscript_input_output_trace(modelclass):
- """ Test that traced LightningModule forward works. """
+def test_torchscript_example_input_output_trace(modelclass):
+ """ Test that traced LightningModule forward works with example_input_array """
model = modelclass()
+
+ if isinstance(model, BoringModel):
+ model.example_input_array = torch.randn(5, 32)
+
script = model.to_torchscript(method='trace')
assert isinstance(script, torch.jit.ScriptModule)
+
model.eval()
- model_output = model(model.example_input_array)
+ with torch.no_grad():
+ model_output = model(model.example_input_array)
+
script_output = script(model.example_input_array)
assert torch.allclose(script_output, model_output)
+def test_torchscript_input_output_trace():
+ """ Test that traced LightningModule forward works with example_inputs """
+ model = BoringModel()
+ example_inputs = torch.randn(1, 32)
+ script = model.to_torchscript(example_inputs=example_inputs, method='trace')
+ assert isinstance(script, torch.jit.ScriptModule)
+
+ model.eval()
+ with torch.no_grad():
+ model_output = model(example_inputs)
+
+ script_output = script(example_inputs)
+ assert torch.allclose(script_output, model_output)
+
+
@pytest.mark.parametrize("device", [
torch.device("cpu"),
torch.device("cuda", 0)
@@ -60,7 +89,9 @@ def test_torchscript_input_output_trace(modelclass):
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_torchscript_device(device):
""" Test that scripted module is on the correct device. """
- model = EvalModelTemplate().to(device)
+ model = BoringModel().to(device)
+ model.example_input_array = torch.randn(5, 32)
+
script = model.to_torchscript()
assert next(script.parameters()).device == device
script_output = script(model.example_input_array.to(device))
@@ -69,7 +100,7 @@ def test_torchscript_device(device):
def test_torchscript_retain_training_state():
""" Test that torchscript export does not alter the training mode of original model. """
- model = EvalModelTemplate()
+ model = BoringModel()
model.train(True)
script = model.to_torchscript()
assert model.training
@@ -81,7 +112,7 @@ def test_torchscript_retain_training_state():
@pytest.mark.parametrize("modelclass", [
- EvalModelTemplate,
+ BoringModel,
ParityModuleRNN,
BasicGAN,
])
@@ -100,7 +131,7 @@ def test_torchscript_properties(modelclass):
@pytest.mark.parametrize("modelclass", [
- EvalModelTemplate,
+ BoringModel,
ParityModuleRNN,
BasicGAN,
])
@@ -109,9 +140,27 @@ def test_torchscript_properties(modelclass):
reason="torch.save/load has bug loading script modules on torch <= 1.4",
)
def test_torchscript_save_load(tmpdir, modelclass):
- """ Test that scripted LightningModules is correctly saved and can be loaded. """
+ """ Test that scripted LightningModule is correctly saved and can be loaded. """
model = modelclass()
output_file = str(tmpdir / "model.pt")
script = model.to_torchscript(file_path=output_file)
loaded_script = torch.jit.load(output_file)
assert torch.allclose(next(script.parameters()), next(loaded_script.parameters()))
+
+
+def test_torchcript_invalid_method(tmpdir):
+ """Test that an error is thrown with invalid torchscript method"""
+ model = BoringModel()
+ model.train(True)
+
+ with pytest.raises(ValueError, match="only supports 'script' or 'trace'"):
+ model.to_torchscript(method='temp')
+
+
+def test_torchscript_with_no_input(tmpdir):
+ """Test that an error is thrown when there is no input tensor"""
+ model = BoringModel()
+ model.example_input_array = None
+
+ with pytest.raises(ValueError, match='requires either `example_inputs` or `model.example_input_array`'):
+ model.to_torchscript(method='trace')
diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py
index e838dc60d81b31..37ab774bc83421 100644
--- a/tests/models/test_tpu.py
+++ b/tests/models/test_tpu.py
@@ -244,39 +244,6 @@ def test_distributed_backend_set_when_using_tpu(tmpdir, tpu_cores):
assert Trainer(tpu_cores=tpu_cores).distributed_backend == "tpu"
-@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
-@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pl_multi_process_test
-def test_result_obj_on_tpu(tmpdir):
- seed_everything(1234)
-
- batches = 5
- epochs = 2
-
- model = EvalModelTemplate()
- model.training_step = model.training_step_result_obj
- model.training_step_end = None
- model.training_epoch_end = None
- model.validation_step = model.validation_step_result_obj
- model.validation_step_end = None
- model.validation_epoch_end = None
- model.test_step = model.test_step_result_obj
- model.test_step_end = None
- model.test_epoch_end = None
-
- trainer_options = dict(
- default_root_dir=tmpdir,
- max_epochs=epochs,
- callbacks=[EarlyStopping()],
- log_every_n_steps=2,
- limit_train_batches=batches,
- weights_summary=None,
- tpu_cores=8
- )
-
- tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False)
-
-
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@pl_multi_process_test
def test_broadcast_on_tpu():
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index f7cb5819517839..950e3776bbc7fa 100644
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -19,4 +19,4 @@ python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual
python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp
python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic
-# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance
+python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index 3bce379c1115c2..91a8631a732870 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -1,6 +1,20 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
-import time
from pathlib import Path
+import time
import numpy as np
import pytest
diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
index 0c27d8909d760f..51b9c2ac69496d 100644
--- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
+++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py
@@ -18,6 +18,7 @@
import collections
import itertools
import os
+import platform
from unittest import mock
import numpy as np
@@ -26,8 +27,8 @@
from torch.utils.data import Dataset
import pytorch_lightning as pl
-from pytorch_lightning import Trainer, callbacks
-from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning import callbacks, Trainer
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.core.lightning import LightningModule
from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset
from tests.base.deterministic_model import DeterministicModel
@@ -685,6 +686,7 @@ class TestModel(BoringModel):
def training_step(self, batch, batch_idx):
acc = self.step(batch[0])
self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
+ self.log('foo_2', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum')
return acc
def validation_step(self, batch, batch_idx):
@@ -704,9 +706,46 @@ def validation_step(self, batch, batch_idx):
trainer.fit(model)
assert trainer.logged_metrics['foo'] == fake_result
+ assert trainer.logged_metrics['foo_2'] == 2
assert trainer.logged_metrics['bar'] == fake_result
+@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1',
+ reason="test should be run outside of pytest")
+def test_logging_sync_dist_true_ddp(tmpdir):
+ """
+ Tests to ensure that the sync_dist flag works with DDP
+ """
+ class TestLoggingSyncDistModel(BoringModel):
+ def training_step(self, batch, batch_idx):
+ acc = self.step(batch[0])
+ self.log('foo', 1, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='SUM')
+ return acc
+
+ def validation_step(self, batch, batch_idx):
+ self.training_step_called = True
+ output = self.layer(batch)
+ loss = self.loss(batch, output)
+ self.log('bar', 2, on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='AVG')
+ return {"x": loss}
+
+ model = TestLoggingSyncDistModel()
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ limit_train_batches=1,
+ limit_val_batches=1,
+ max_epochs=2,
+ weights_summary=None,
+ accelerator="ddp",
+ gpus=2,
+ )
+ trainer.fit(model)
+
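+ # 'foo' is reduced with SUM across the 2 DDP processes (1 + 1), 'bar' with AVG (stays 2)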
+ assert trainer.logged_metrics['foo'] == 2
+ assert trainer.logged_metrics['bar'] == 2
+
+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
def test_logging_sync_dist_true_gpu(tmpdir):
"""
@@ -771,3 +810,48 @@ def on_train_epoch_end(self, *_):
trainer.fit(model)
assert model.epoch_end_called
assert model.on_train_epoch_end_called
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
+def test_metric_are_properly_reduced(tmpdir):
+ class TestingModel(BoringModel):
+ def __init__(self, *args, **kwargs):
+ super().__init__()
+ self.train_acc = pl.metrics.Accuracy()
+ self.val_acc = pl.metrics.Accuracy()
+
+ def training_step(self, batch, batch_idx):
+ self.train_acc(torch.rand(1, 3, device=self.device), torch.randint(0, 2, (1,), device=self.device))
+ self.log('train_acc', self.train_acc, on_step=True, on_epoch=True)
+ return super().training_step(batch, batch_idx)
+
+ def validation_step(self, batch, batch_idx):
+ preds = torch.tensor(0, device=self.device)
+ targets = torch.tensor(1, device=self.device)
+ if batch_idx < 8:
+ targets = preds
+ self.val_acc(preds, targets)
+ self.log('val_acc', self.val_acc, on_step=True, on_epoch=True)
+ return super().validation_step(batch, batch_idx)
+
+ early_stop = EarlyStopping(monitor='val_acc', mode='max')
+
+ checkpoint = ModelCheckpoint(
+ monitor='val_acc',
+ save_last=True,
+ save_top_k=2,
+ mode='max',
+ )
+
+ model = TestingModel()
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ gpus=1,
+ max_epochs=2,
+ limit_train_batches=5,
+ limit_val_batches=32,
+ callbacks=[early_stop, checkpoint])
+ trainer.fit(model)
+
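+ # predictions match the targets only for the first 8 of the 32 validation batches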
+ assert trainer.callback_metrics["val_acc"] == 8 / 32.
+ assert "train_acc" in trainer.callback_metrics
diff --git a/tests/trainer/optimization/test_multiple_optimizers.py b/tests/trainer/optimization/test_multiple_optimizers.py
new file mode 100644
index 00000000000000..78b6f8f7ff84a3
--- /dev/null
+++ b/tests/trainer/optimization/test_multiple_optimizers.py
@@ -0,0 +1,63 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Tests to ensure that the behaviours related to multiple optimizers work as expected
+"""
+import torch
+
+import pytorch_lightning as pl
+from tests.base.boring_model import BoringModel
+
+
+def test_unbalanced_logging_with_multiple_optimizers(tmpdir):
+ """
+ This test ensures that reduction works in unbalanced logging settings
+ """
+ class TestModel(BoringModel):
+
+ loss_1 = []
+ loss_2 = []
+
+ def training_step(self, batch, batch_idx, optimizer_idx):
+ output = self.layer(batch)
+ loss = self.loss(batch, output)
+ if optimizer_idx == 0 and self.trainer.global_step > 10:
+ self.log("loss_1", loss, on_epoch=True, prog_bar=True)
+ self.loss_1.append(loss.detach().clone())
+ elif optimizer_idx == 1:
+ self.log("loss_2", loss, on_epoch=True, prog_bar=True)
+ self.loss_2.append(loss.detach().clone())
+ return {"loss": loss}
+
+ def configure_optimizers(self):
+ optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.001)
+ optimizer2 = torch.optim.SGD(self.layer.parameters(), lr=0.001)
+ return [optimizer, optimizer2]
+
+ model = TestModel()
+ model.training_epoch_end = None
+
+ # Initialize a trainer
+ trainer = pl.Trainer(
+ default_root_dir=tmpdir,
+ max_epochs=1,
+ )
+
+ trainer.fit(model)
+
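+ # the *_step entries in callback_metrics should match the last value logged for each optimizer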
+ assert torch.equal(trainer.callback_metrics["loss_2_step"], model.loss_2[-1])
+ assert torch.equal(trainer.callback_metrics["loss_1_step"], model.loss_1[-1])
+ # test that the epoch losses are properly reduced to the mean of all logged values
+ assert torch.abs(trainer.callback_metrics["loss_2_epoch"] - torch.FloatTensor(model.loss_2).mean()) < 1e-6
+ assert torch.abs(trainer.callback_metrics["loss_1_epoch"] - torch.FloatTensor(model.loss_1).mean()) < 1e-6
diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py
index 2e76192836740c..52e085b2b7b8cb 100644
--- a/tests/trainer/test_optimizers.py
+++ b/tests/trainer/test_optimizers.py
@@ -15,7 +15,6 @@
import torch
from pytorch_lightning import Callback, Trainer
-from pytorch_lightning.core.optimizer import LightningOptimizer
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base import EvalModelTemplate
from tests.base.boring_model import BoringModel
@@ -177,6 +176,7 @@ def test_reducelronplateau_scheduling(tmpdir):
frequency=1,
reduce_on_plateau=True,
strict=True,
+ name=None,
), 'lr scheduler was not correctly converted to dict'
@@ -215,7 +215,13 @@ def test_optimizer_return_options(enable_pl_optimizer):
assert len(freq) == 0
assert optim[0] == opt_a
assert lr_sched[0] == dict(
- scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True
+ scheduler=scheduler_a,
+ interval='epoch',
+ frequency=1,
+ reduce_on_plateau=False,
+ monitor=None,
+ strict=True,
+ name=None,
)
# opt tuple of 1 list
@@ -225,7 +231,13 @@ def test_optimizer_return_options(enable_pl_optimizer):
assert len(freq) == 0
assert optim[0] == opt_a
assert lr_sched[0] == dict(
- scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True
+ scheduler=scheduler_a,
+ interval='epoch',
+ frequency=1,
+ reduce_on_plateau=False,
+ monitor=None,
+ strict=True,
+ name=None,
)
# opt single dictionary
@@ -235,7 +247,13 @@ def test_optimizer_return_options(enable_pl_optimizer):
assert len(freq) == 0
assert optim[0] == opt_a
assert lr_sched[0] == dict(
- scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True
+ scheduler=scheduler_a,
+ interval='epoch',
+ frequency=1,
+ reduce_on_plateau=False,
+ monitor=None,
+ strict=True,
+ name=None,
)
# opt multiple dictionaries with frequencies
@@ -247,7 +265,13 @@ def test_optimizer_return_options(enable_pl_optimizer):
assert len(optim) == len(lr_sched) == len(freq) == 2
assert optim[0] == opt_a
assert lr_sched[0] == dict(
- scheduler=scheduler_a, interval='epoch', frequency=1, reduce_on_plateau=False, monitor=None, strict=True
+ scheduler=scheduler_a,
+ interval='epoch',
+ frequency=1,
+ reduce_on_plateau=False,
+ monitor=None,
+ strict=True,
+ name=None,
)
assert freq == [1, 5]
diff --git a/tests/trainer/test_supporters.py b/tests/trainer/test_supporters.py
new file mode 100644
index 00000000000000..b8a0e066cdef89
--- /dev/null
+++ b/tests/trainer/test_supporters.py
@@ -0,0 +1,38 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+import torch
+
+from pytorch_lightning.trainer.supporters import TensorRunningAccum
+
+
+def test_tensor_running_accum_reset():
+ """ Test that reset sets all attributes back to their initialization state """
+
+ window_length = 10
+
+ accum = TensorRunningAccum(window_length=window_length)
+ assert accum.last() is None
+ assert accum.mean() is None
+
+ accum.append(torch.tensor(1.5))
+ assert accum.last() == torch.tensor(1.5)
+ assert accum.mean() == torch.tensor(1.5)
+
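+ # reset should restore the freshly-initialized state while keeping the configured window length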
+ accum.reset()
+ assert accum.window_length == window_length
+ assert accum.memory is None
+ assert accum.current_idx == 0
+ assert accum.last_idx is None
+ assert not accum.rotated
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 9b29d6ec2b1dd6..9e5ceccf9b646b 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -11,12 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import glob
import math
import os
import pickle
import sys
-import types
from argparse import Namespace
from copy import deepcopy
from pathlib import Path
@@ -34,6 +32,7 @@
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler
from pytorch_lightning.trainer.logging import TrainerLoggingMixin
+from pytorch_lightning.trainer.states import TrainerState
from pytorch_lightning.utilities import NATIVE_AMP_AVAILABLE
from pytorch_lightning.utilities.cloud_io import load as pl_load
from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -61,6 +60,7 @@ def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
result = trainer.fit(model)
# training complete
assert result == 1, "amp + ddp model failed to complete"
+ assert trainer.state == TrainerState.FINISHED
# save model
new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
@@ -107,6 +107,7 @@ def test_no_val_end_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
# training complete
assert result == 1, "amp + ddp model failed to complete"
+ assert trainer.state == TrainerState.FINISHED
# save model
new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
@@ -151,6 +152,7 @@ def test_strict_model_load(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
# training complete
assert result == 1
+ assert trainer.state == TrainerState.FINISHED
# save model
new_weights_path = os.path.join(tmpdir, "save_test.ckpt")
@@ -468,6 +470,7 @@ def test_model_checkpoint_only_weights(tmpdir):
result = trainer.fit(model)
# training complete
assert result == 1, "training failed to complete"
+ assert trainer.state == TrainerState.FINISHED
checkpoint_path = list(trainer.checkpoint_callback.best_k_models.keys())[0]
@@ -507,35 +510,23 @@ def test_resume_from_checkpoint_epoch_restored(monkeypatch, tmpdir, tmpdir_serve
# set $TORCH_HOME, which determines torch hub's cache path, to tmpdir
monkeypatch.setenv("TORCH_HOME", tmpdir)
- hparams = EvalModelTemplate.get_default_hparams()
-
- def _new_model():
- # Create a model that tracks epochs and batches seen
- model = EvalModelTemplate(**hparams)
- model.num_epochs_seen = 0
- model.num_batches_seen = 0
- model.num_on_load_checkpoint_called = 0
+ class TestModel(BoringModel):
+ # Model that tracks epochs and batches seen
+ num_epochs_seen = 0
+ num_batches_seen = 0
+ num_on_load_checkpoint_called = 0
- def increment_epoch(self):
+ def on_epoch_end(self):
self.num_epochs_seen += 1
- def increment_batch(self, batch, batch_idx, dataloader_idx):
+ def on_train_batch_start(self, *_):
self.num_batches_seen += 1
- def increment_on_load_checkpoint(self, _):
+ def on_load_checkpoint(self, _):
self.num_on_load_checkpoint_called += 1
- # Bind methods to keep track of epoch numbers, batch numbers it has seen
- # as well as number of times it has called on_load_checkpoint()
- model.on_epoch_end = types.MethodType(increment_epoch, model)
- model.on_train_batch_start = types.MethodType(increment_batch, model)
- model.on_load_checkpoint = types.MethodType(increment_on_load_checkpoint, model)
- return model
-
- model = _new_model()
-
- trainer_options = dict(
- progress_bar_refresh_rate=0,
+ model = TestModel()
+ trainer = Trainer(
max_epochs=2,
limit_train_batches=0.65,
limit_val_batches=1,
@@ -543,144 +534,125 @@ def increment_on_load_checkpoint(self, _):
default_root_dir=tmpdir,
val_check_interval=1.0,
enable_pl_optimizer=enable_pl_optimizer,
+ progress_bar_refresh_rate=0,
+ logger=False,
+ weights_summary=None,
)
-
- trainer = Trainer(**trainer_options)
- # fit model
trainer.fit(model)
- training_batches = trainer.num_training_batches
-
assert model.num_epochs_seen == 2
- assert model.num_batches_seen == training_batches * 2
+ assert model.num_batches_seen == trainer.num_training_batches * 2
assert model.num_on_load_checkpoint_called == 0
# Other checkpoints can be uncommented if/when resuming mid-epoch is supported
- checkpoints = sorted(glob.glob(os.path.join(trainer.checkpoint_callback.dirpath, "*.ckpt")))
+ checkpoints = Path(trainer.checkpoint_callback.dirpath).glob("*.ckpt")
if url_ckpt:
# transform local paths into url checkpoints
ip, port = tmpdir_server
- checkpoints = [f"http://{ip}:{port}/" + os.path.basename(check) for check in checkpoints]
+ checkpoints = [f"http://{ip}:{port}/" + ckpt.name for ckpt in checkpoints]
- for check in checkpoints:
- next_model = _new_model()
- state = pl_load(check)
+ for ckpt in checkpoints:
+ next_model = TestModel()
+ state = pl_load(ckpt)
# Resume training
- trainer_options["max_epochs"] = 2
- new_trainer = Trainer(**trainer_options, resume_from_checkpoint=check)
+ new_trainer = Trainer(resume_from_checkpoint=ckpt, max_epochs=2)
new_trainer.fit(next_model)
- assert state["global_step"] + next_model.num_batches_seen == training_batches * trainer_options["max_epochs"]
+ assert state["global_step"] + next_model.num_batches_seen == trainer.num_training_batches * trainer.max_epochs
assert next_model.num_on_load_checkpoint_called == 1
-def _init_steps_model():
- """private method for initializing a model with 5% train epochs"""
- model = EvalModelTemplate()
-
- # define train epoch to 5% of data
- train_percent = 0.5
- # get number of samples in 1 epoch
- num_train_samples = math.floor(len(model.train_dataloader()) * train_percent)
-
- trainer_options = dict(
- limit_train_batches=train_percent,
- )
- return model, trainer_options, num_train_samples
-
-
def test_trainer_max_steps_and_epochs(tmpdir):
"""Verify model trains according to specified max steps"""
- model, trainer_options, num_train_samples = _init_steps_model()
+ model = BoringModel()
+ num_train_samples = math.floor(len(model.train_dataloader()) * 0.5)
# define less train steps than epochs
- trainer_options.update(
- default_root_dir=tmpdir,
- max_epochs=3,
- max_steps=num_train_samples + 10,
- )
-
- # fit model
- trainer = Trainer(**trainer_options)
+ trainer_kwargs = {
+ 'limit_train_batches': 0.5,
+ 'default_root_dir': tmpdir,
+ 'max_epochs': 3,
+ 'max_steps': num_train_samples + 10,
+ 'logger': False,
+ 'weights_summary': None,
+ 'progress_bar_refresh_rate': 0,
+ }
+ trainer = Trainer(**trainer_kwargs)
result = trainer.fit(model)
- assert result == 1, "Training did not complete"
- # check training stopped at max_steps
+ assert result == 1, "Training did not complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps"
# define less train epochs than steps
- trainer_options.update(
- max_epochs=2,
- max_steps=trainer_options["max_epochs"] * 2 * num_train_samples,
- )
-
- # fit model
- trainer = Trainer(**trainer_options)
+ trainer_kwargs['max_epochs'] = 2
+ trainer_kwargs['max_steps'] = 3 * 2 * num_train_samples
+ trainer = Trainer(**trainer_kwargs)
result = trainer.fit(model)
- assert result == 1, "Training did not complete"
- # check training stopped at max_epochs
+ assert result == 1, "Training did not complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.global_step == num_train_samples * trainer.max_epochs
assert trainer.current_epoch == trainer.max_epochs - 1, "Model did not stop at max_epochs"
def test_trainer_min_steps_and_epochs(tmpdir):
"""Verify model trains according to specified min steps"""
- model, trainer_options, num_train_samples = _init_steps_model()
-
- # define callback for stopping the model and default epochs
- trainer_options.update(
- default_root_dir=tmpdir,
- callbacks=[EarlyStopping(monitor="early_stop_on", min_delta=1.0)],
- val_check_interval=2,
- min_epochs=1,
- max_epochs=7,
- )
-
- # define less min steps than 1 epoch
- trainer_options["min_steps"] = math.floor(num_train_samples / 2)
-
- # fit model
- trainer = Trainer(**trainer_options)
+ model = EvalModelTemplate()
+ num_train_samples = math.floor(len(model.train_dataloader()) * 0.5)
+
+ trainer_kwargs = {
+ 'limit_train_batches': 0.5,
+ 'default_root_dir': tmpdir,
+ # define callback for stopping the model
+ 'callbacks': [EarlyStopping(monitor="early_stop_on", min_delta=1.0)],
+ 'val_check_interval': 2,
+ 'min_epochs': 1,
+ 'max_epochs': 7,
+ # define min_steps as fewer steps than one epoch
+ 'min_steps': num_train_samples // 2,
+ 'logger': False,
+ 'weights_summary': None,
+ 'progress_bar_refresh_rate': 0,
+ }
+ trainer = Trainer(**trainer_kwargs)
result = trainer.fit(model)
- assert result == 1, "Training did not complete"
- # check model ran for at least min_epochs
- assert (
- trainer.global_step >= num_train_samples and trainer.current_epoch > 0
- ), "Model did not train for at least min_epochs"
+ assert result == 1, "Training did not complete"
+ assert trainer.state == TrainerState.FINISHED
+ assert trainer.current_epoch > 0
+ assert trainer.global_step >= num_train_samples, "Model did not train for at least min_epochs"
# define less epochs than min_steps
- trainer_options["min_steps"] = math.floor(num_train_samples * 1.5)
-
- # fit model
- trainer = Trainer(**trainer_options)
+ trainer_kwargs["min_steps"] = math.floor(num_train_samples * 1.5)
+ trainer = Trainer(**trainer_kwargs)
result = trainer.fit(model)
- assert result == 1, "Training did not complete"
- # check model ran for at least num_train_samples*1.5
- assert (
- trainer.global_step >= math.floor(num_train_samples * 1.5) and trainer.current_epoch > 0
- ), "Model did not train for at least min_steps"
+ assert result == 1, "Training did not complete"
+ assert trainer.state == TrainerState.FINISHED
+ assert trainer.current_epoch > 0
+ assert trainer.global_step >= math.floor(num_train_samples * 1.5), "Model did not train for at least min_steps"
def test_trainer_max_steps_accumulate_batches(tmpdir):
"""Verify model trains according to specified max steps with grad accumulated batches"""
- model, trainer_options, num_train_samples = _init_steps_model()
+ model = BoringModel()
+ num_train_samples = math.floor(len(model.train_dataloader()) * 0.5)
# define less train steps than epochs
- trainer_options.update(
+ trainer = Trainer(
+ limit_train_batches=0.5,
default_root_dir=tmpdir,
- max_steps=(num_train_samples + 10),
+ max_steps=num_train_samples + 10,
accumulate_grad_batches=10,
+ logger=False,
+ weights_summary=None,
+ progress_bar_refresh_rate=0,
)
-
- # fit model
- trainer = Trainer(**trainer_options)
result = trainer.fit(model)
- assert result == 1, "Training did not complete"
- # check training stopped at max_steps
+ assert result == 1, "Training did not complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.global_step == trainer.max_steps, "Model did not stop at max_steps"
@@ -703,6 +675,7 @@ def test_benchmark_option(tmpdir):
# verify training completed
assert result == 1
+ assert trainer.state == TrainerState.FINISHED
# verify torch.backends.cudnn.benchmark is not turned off
assert torch.backends.cudnn.benchmark
@@ -788,6 +761,7 @@ def training_epoch_end(self, *args, **kwargs):
# check that limit_train_batches=0 turns off training
assert result == 1, "training failed to complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.current_epoch == 0
assert not model.training_step_invoked, "`training_step` should not run when `limit_train_batches=0`"
assert not model.training_epoch_end_invoked, "`training_epoch_end` should not run when `limit_train_batches=0`"
@@ -806,6 +780,7 @@ def training_epoch_end(self, *args, **kwargs):
assert not torch.all(torch.eq(before_state_dict[key], after_state_dict[key]))
assert result == 1, "training failed to complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.current_epoch == 0
assert model.training_step_invoked, "did not run `training_step` with `fast_dev_run=True`"
assert model.training_epoch_end_invoked, "did not run `training_epoch_end` with `fast_dev_run=True`"
@@ -844,6 +819,7 @@ def validation_epoch_end(self, *args, **kwargs):
# check that limit_val_batches=0 turns off validation
assert result == 1, "training failed to complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.current_epoch == 1
assert not model.validation_step_invoked, "`validation_step` should not run when `limit_val_batches=0`"
assert not model.validation_epoch_end_invoked, "`validation_epoch_end` should not run when `limit_val_batches=0`"
@@ -855,6 +831,7 @@ def validation_epoch_end(self, *args, **kwargs):
result = trainer.fit(model)
assert result == 1, "training failed to complete"
+ assert trainer.state == TrainerState.FINISHED
assert trainer.current_epoch == 0
assert model.validation_step_invoked, "did not run `validation_step` with `fast_dev_run=True`"
assert model.validation_epoch_end_invoked, "did not run `validation_epoch_end` with `fast_dev_run=True`"
@@ -958,6 +935,7 @@ def test_gradient_clipping(tmpdir):
"""
Test gradient clipping
"""
+ tutils.reset_seed()
model = EvalModelTemplate()
@@ -995,6 +973,7 @@ def test_gradient_clipping_fp16(tmpdir):
"""
Test gradient clipping with fp16
"""
+ tutils.reset_seed()
model = EvalModelTemplate()
@@ -1117,7 +1096,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
@pytest.mark.parametrize(
"trainer_kwargs,expected",
[
- pytest.param(
+ (
dict(accelerator=None, gpus=None),
dict(
use_dp=False,
@@ -1129,7 +1108,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=1,
),
),
- pytest.param(
+ (
dict(accelerator="dp", gpus=None),
dict(
use_dp=False,
@@ -1141,7 +1120,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=1,
),
),
- pytest.param(
+ (
dict(accelerator="dp", gpus=None),
dict(
use_dp=False,
@@ -1153,7 +1132,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=1,
),
),
- pytest.param(
+ (
dict(accelerator="ddp", gpus=None),
dict(
use_dp=False,
@@ -1165,7 +1144,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=1,
),
),
- pytest.param(
+ (
dict(accelerator="ddp", num_processes=2, gpus=None),
dict(
use_dp=False,
@@ -1177,7 +1156,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=2,
),
),
- pytest.param(
+ (
dict(accelerator="ddp", num_nodes=2, gpus=None),
dict(
use_dp=False,
@@ -1189,7 +1168,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=1,
),
),
- pytest.param(
+ (
dict(accelerator="ddp_cpu", num_processes=2, gpus=None),
dict(
use_dp=False,
@@ -1201,7 +1180,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=2,
),
),
- pytest.param(
+ (
dict(accelerator="ddp2", gpus=None),
dict(
use_dp=False,
@@ -1213,7 +1192,7 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
num_processes=1,
),
),
- pytest.param(
+ (
dict(accelerator=None, gpus=1),
dict(
use_dp=False,
@@ -1224,9 +1203,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=True,
num_processes=1,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")],
),
- pytest.param(
+ (
dict(accelerator="dp", gpus=1),
dict(
use_dp=True,
@@ -1237,9 +1215,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=True,
num_processes=1,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")],
),
- pytest.param(
+ (
dict(accelerator="ddp", gpus=1),
dict(
use_dp=False,
@@ -1250,9 +1227,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=True,
num_processes=1,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")],
),
- pytest.param(
+ (
dict(accelerator="ddp_cpu", num_processes=2, gpus=1),
dict(
use_dp=False,
@@ -1263,9 +1239,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=False,
num_processes=2,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")],
),
- pytest.param(
+ (
dict(accelerator="ddp2", gpus=1),
dict(
use_dp=False,
@@ -1276,9 +1251,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=False,
num_processes=1,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")],
),
- pytest.param(
+ (
dict(accelerator=None, gpus=2),
dict(
use_dp=False,
@@ -1289,9 +1263,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=False,
num_processes=2,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")],
),
- pytest.param(
+ (
dict(accelerator="dp", gpus=2),
dict(
use_dp=True,
@@ -1302,9 +1275,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=False,
num_processes=1,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")],
),
- pytest.param(
+ (
dict(accelerator="ddp", gpus=2),
dict(
use_dp=False,
@@ -1315,9 +1287,8 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=False,
num_processes=2,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")],
),
- pytest.param(
+ (
dict(accelerator="ddp2", gpus=2),
dict(
use_dp=False,
@@ -1328,21 +1299,17 @@ def test_num_sanity_val_steps_neg_one(tmpdir, limit_val_batches):
use_single_gpu=False,
num_processes=1,
),
- marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")],
),
],
)
-# Todo: mock nb Gpus so all these tests can run on any device
-# todo: think about simplification, that the the expected will be just a list use_xxx which shall be true...
-def test_trainer_config(trainer_kwargs, expected):
+def test_trainer_config(trainer_kwargs, expected, monkeypatch):
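+ # mock the GPU availability reported by torch so every parametrization can run on CPU-only machines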
+ if trainer_kwargs["gpus"] is not None:
+ monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+ monkeypatch.setattr(torch.cuda, "device_count", lambda: trainer_kwargs["gpus"])
trainer = Trainer(**trainer_kwargs)
- assert trainer.use_dp is expected["use_dp"], 'for input: %s' % trainer_kwargs
- assert trainer.use_ddp is expected["use_ddp"], 'for input: %s' % trainer_kwargs
- assert trainer.use_ddp2 is expected["use_ddp2"], 'for input: %s' % trainer_kwargs
- assert trainer.num_gpus == expected["num_gpus"], 'for input: %s' % trainer_kwargs
- assert trainer.on_gpu is expected["on_gpu"], 'for input: %s' % trainer_kwargs
- assert trainer.use_single_gpu is expected["use_single_gpu"], 'for input: %s' % trainer_kwargs
- assert trainer.num_processes == expected["num_processes"], 'for input: %s' % trainer_kwargs
+ assert len(expected) == 7
+ for k, v in expected.items():
+ assert getattr(trainer, k) == v, f"Failed {k}: {v}"
def test_trainer_subclassing():
@@ -1358,6 +1325,7 @@ def __init__(self, custom_arg, *args, custom_kwarg="test", **kwargs):
trainer = TrainerSubclass(123, custom_kwarg="custom", fast_dev_run=True)
result = trainer.fit(model)
assert result == 1
+ assert trainer.state == TrainerState.FINISHED
assert trainer.custom_arg == 123
assert trainer.custom_kwarg == "custom"
assert trainer.fast_dev_run
@@ -1373,6 +1341,7 @@ def __init__(self, **kwargs):
trainer = TrainerSubclass(custom_kwarg="custom", fast_dev_run=True)
result = trainer.fit(model)
assert result == 1
+ assert trainer.state == TrainerState.FINISHED
assert trainer.custom_kwarg == "custom"
assert trainer.fast_dev_run