From 6abc254ae6830bcf2688f780cd12ef68e57531ce Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Mon, 26 Oct 2020 07:54:38 -0700 Subject: [PATCH 01/88] [Doc] Fix on_train_batch_end description (#4330) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * more doc fixes Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli Co-authored-by: chaton --- pytorch_lightning/core/hooks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index c185180991648..7500d1a11d440 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -132,7 +132,7 @@ def on_train_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloade Called in the training loop after the batch. Args: - outputs: The outputs of validation_step_end(validation_step(x)) + outputs: The outputs of training_step_end(training_step(x)) batch: The batched data as it is returned by the training DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader @@ -156,7 +156,7 @@ def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: Called in the validation loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -168,7 +168,7 @@ def on_validation_batch_end(self, outputs: Any, batch: Any, batch_idx: int, data Args: outputs: The outputs of validation_step_end(validation_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -179,7 +179,7 @@ def on_test_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) - Called in the test loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -191,7 +191,7 @@ def on_test_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloader Args: outputs: The outputs of test_step_end(test_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. 
batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ From 8e3faa2da1e7c64a54072fa5115a242e71c49c4b Mon Sep 17 00:00:00 2001 From: Chenglu Date: Tue, 27 Oct 2020 02:08:58 +0800 Subject: [PATCH 02/88] get help from docstring (#4344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add geting help message from docstring * Fix pep8 issue * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Apply suggestions from code review Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- CHANGELOG.md | 3 ++ pytorch_lightning/utilities/argparse_utils.py | 29 +++++++++-- tests/utilities/test_argparse_utils.py | 50 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 tests/utilities/test_argparse_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a534c6bfaf40..08e2e93b93d9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args`. ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) + + ### Changed diff --git a/pytorch_lightning/utilities/argparse_utils.py b/pytorch_lightning/utilities/argparse_utils.py index 57c9e23d80dc9..f3cf2e5f1b90d 100644 --- a/pytorch_lightning/utilities/argparse_utils.py +++ b/pytorch_lightning/utilities/argparse_utils.py @@ -14,7 +14,7 @@ import inspect import os from argparse import ArgumentParser, Namespace -from typing import Union, List, Tuple, Any +from typing import Dict, Union, List, Tuple, Any from pytorch_lightning.utilities import parsing @@ -160,7 +160,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: allowed_types = (str, int, float, bool) - # TODO: get "help" from docstring :) + args_help = parse_args_from_docstring(cls.__init__.__doc__ or cls.__doc__) for arg, arg_types, arg_default in ( at for at in get_init_arguments_and_types(cls) if at[0] not in depr_arg_names ): @@ -200,13 +200,36 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: dest=arg, default=arg_default, type=use_type, - help='autogenerated by pl.Trainer', + help=args_help.get(arg), **arg_kwargs, ) return parser +def parse_args_from_docstring(docstring: str) -> Dict[str, str]: + arg_block_indent = None + current_arg = None + parsed = {} + for line in docstring.split("\n"): + stripped = line.lstrip() + if not stripped: + continue + line_indent = len(line) - len(stripped) + if stripped.startswith(('Args:', 'Arguments:', 'Parameters:')): + arg_block_indent = line_indent + 4 + elif arg_block_indent is None: + continue + elif line_indent < arg_block_indent: + break + elif line_indent == arg_block_indent: + current_arg, arg_description = stripped.split(':', maxsplit=1) + parsed[current_arg] = arg_description.lstrip() + elif line_indent > arg_block_indent: + parsed[current_arg] += f' {stripped}' + return parsed + + def _gpus_allowed_type(x) -> Union[int, str]: if ',' in x: return str(x) diff --git a/tests/utilities/test_argparse_utils.py b/tests/utilities/test_argparse_utils.py new file mode 100644 index 0000000000000..978ad820482b2 --- /dev/null +++ b/tests/utilities/test_argparse_utils.py @@ -0,0 +1,50 @@ +from pytorch_lightning.utilities.argparse_utils import 
parse_args_from_docstring + + +def test_parse_args_from_docstring_normal(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + root: Root directory of dataset where ``MNIST/processed/training.pt`` + and ``MNIST/processed/test.pt`` exist. + train: If ``True``, creates dataset from ``training.pt``, + otherwise from ``test.pt``. + normalize: mean and std deviation of the MNIST dataset. + download: If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + num_samples: number of examples per selected class/digit + digits: list selected MNIST digits/classes + + Examples: + >>> dataset = TrialMNIST(download=True) + >>> len(dataset) + 300 + >>> sorted(set([d.item() for d in dataset.targets])) + [0, 1, 2] + >>> torch.bincount(dataset.targets) + tensor([100, 100, 100]) + """ + ) + + expected_args = ['root', 'train', 'normalize', 'download', 'num_samples', 'digits'] + assert len(args_help.keys()) == len(expected_args) + assert all([x == y for x, y in zip(args_help.keys(), expected_args)]) + assert args_help['root'] == 'Root directory of dataset where ``MNIST/processed/training.pt``' \ + ' and ``MNIST/processed/test.pt`` exist.' + assert args_help['normalize'] == 'mean and std deviation of the MNIST dataset.' + + +def test_parse_args_from_docstring_empty(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + + Returns: + + Examples: + """ + ) + assert len(args_help.keys()) == 0 From 48b6de0c40428424e8c98baeaab73d85f8fb79b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 27 Oct 2020 11:07:29 +0100 Subject: [PATCH 03/88] update (#4343) Co-authored-by: chaton --- pytorch_lightning/trainer/__init__.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 3f073e726f7eb..6109c21b825d2 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -979,13 +979,9 @@ def on_train_end(self, trainer, pl_module): # check all validation data trainer = Trainer(num_sanity_val_steps=-1) -Example:: - python -m torch_xla.distributed.xla_dist - --tpu=$TPU_POD_NAME - --conda-env=torch-xla-nightly - --env=XLA_USE_BF16=1 - -- python your_trainer_file.py +This option will reset the validation dataloader unless ``num_sanity_val_steps=0``. + plugins ^^^^^^^ From c50c225f05f9a27687d5d5244c8d46a43900c635 Mon Sep 17 00:00:00 2001 From: Dusan Drevicky <55678224+ddrevicky@users.noreply.github.com> Date: Tue, 27 Oct 2020 11:57:16 +0100 Subject: [PATCH 04/88] feature: Allow str arguments in Trainer.profiler (#3656) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * allow trainer's profiler param to have a str value * add tests * update docs * update exception message * Update CHANGELOG * fix pep8 issues * cleanup test code Co-authored-by: Carlos Mocholí * Add deprecation warning if using bool for profiler * Add deprecation tests and move deprecated tests * Remove bool option to profiler from docs * Deprecate bool args to profiler in CHANGELOG * fixup! Add deprecation warning if using bool for profiler * fixup! Add deprecation tests and move deprecated tests * Apply suggestions from code review Co-authored-by: Rohit Gupta * Implement suggestions, remove whitespace * fixup! 
Implement suggestions, remove whitespace * Allow bool, str (case insensitive), BaseProfiler * Add info about bool deprecation to trainer * fixup! Add info about bool deprecation to trainer * Move deprecate todo to test_deprecated * Test wrong profiler type, improve error message * fixup! Test wrong profiler type, improve error message * Update pytorch_lightning/trainer/connectors/profiler_connector.py Co-authored-by: Carlos Mocholí * Apply suggestions from code review * Readd bool to profiler types, test cli profiler arg * Remove extra whitespace in doc Co-authored-by: Adrian Wälchli * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Update deprecation versions Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli --- CHANGELOG.md | 6 ++++ pytorch_lightning/profiler/__init__.py | 10 ++++-- pytorch_lightning/trainer/__init__.py | 11 +++--- .../trainer/connectors/profiler_connector.py | 34 +++++++++++++++--- pytorch_lightning/trainer/trainer.py | 5 +-- pytorch_lightning/utilities/argparse_utils.py | 3 +- tests/test_deprecated.py | 35 +++++++++++++++++++ tests/trainer/test_trainer.py | 30 ++++++++++++++++ 8 files changed, 115 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 08e2e93b93d9a..f8ed74235c859 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added autogenerated helptext to `Trainer.add_argparse_args`. ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) +- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) + + ### Changed @@ -48,6 +51,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) +- Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) + + ### Removed diff --git a/pytorch_lightning/profiler/__init__.py b/pytorch_lightning/profiler/__init__.py index fc684d143e4b8..fe14bb5751161 100644 --- a/pytorch_lightning/profiler/__init__.py +++ b/pytorch_lightning/profiler/__init__.py @@ -22,12 +22,12 @@ Enable simple profiling ----------------------- -If you only wish to profile the standard actions, you can set `profiler=True` when constructing -your `Trainer` object. +If you only wish to profile the standard actions, you can set `profiler="simple"` +when constructing your `Trainer` object. .. code-block:: python - trainer = Trainer(..., profiler=True) + trainer = Trainer(..., profiler="simple") The profiler's results will be printed at the completion of a training `fit()`. @@ -59,6 +59,10 @@ .. 
code-block:: python + trainer = Trainer(..., profiler="advanced") + + or + profiler = AdvancedProfiler() trainer = Trainer(..., profiler=profiler) diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 6109c21b825d2..a4bf2969f4a77 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -1199,14 +1199,11 @@ def world_size(self): # default used by the Trainer trainer = Trainer(profiler=None) - # to profile standard training events - trainer = Trainer(profiler=True) + # to profile standard training events, equivalent to `profiler=SimpleProfiler()` + trainer = Trainer(profiler="simple") - # equivalent to profiler=True - trainer = Trainer(profiler=SimpleProfiler()) - - # advanced profiler for function-level stats - trainer = Trainer(profiler=AdvancedProfiler()) + # advanced profiler for function-level stats, equivalent to `profiler=AdvancedProfiler()` + trainer = Trainer(profiler="advanced") progress_bar_refresh_rate ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/trainer/connectors/profiler_connector.py b/pytorch_lightning/trainer/connectors/profiler_connector.py index 17aed23ab5b32..0f6686f1f83c7 100644 --- a/pytorch_lightning/trainer/connectors/profiler_connector.py +++ b/pytorch_lightning/trainer/connectors/profiler_connector.py @@ -11,7 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -from pytorch_lightning.profiler import PassThroughProfiler, SimpleProfiler + +from typing import Union + +from pytorch_lightning.profiler import BaseProfiler, PassThroughProfiler, SimpleProfiler, AdvancedProfiler +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.utilities.exceptions import MisconfigurationException class ProfilerConnector: @@ -19,8 +24,27 @@ class ProfilerConnector: def __init__(self, trainer): self.trainer = trainer - def on_trainer_init(self, profiler): - # configure profiler - if profiler is True: - profiler = SimpleProfiler() + def on_trainer_init(self, profiler: Union[BaseProfiler, bool, str]): + + if profiler and not isinstance(profiler, (bool, str, BaseProfiler)): + # TODO: Update exception on removal of bool + raise MisconfigurationException("Only None, bool, str and subclasses of `BaseProfiler` " + "are valid values for `Trainer`'s `profiler` parameter. " + f"Received {profiler} which is of type {type(profiler)}.") + + if isinstance(profiler, bool): + rank_zero_warn("Passing a bool value as a `profiler` argument to `Trainer` is deprecated" + " and will be removed in v1.3. 
Use str ('simple' or 'advanced') instead.", + DeprecationWarning) + if profiler: + profiler = SimpleProfiler() + elif isinstance(profiler, str): + profiler = profiler.lower() + if profiler == "simple": + profiler = SimpleProfiler() + elif profiler == "advanced": + profiler = AdvancedProfiler() + else: + raise ValueError("When passing string value for the `profiler` parameter of" + " `Trainer`, it can only be 'simple' or 'advanced'") self.trainer.profiler = profiler or PassThroughProfiler() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 44250ae905aba..337eb4c4ed567 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -120,7 +120,7 @@ def __init__( num_sanity_val_steps: int = 2, truncated_bptt_steps: Optional[int] = None, resume_from_checkpoint: Optional[str] = None, - profiler: Optional[Union[BaseProfiler, bool]] = None, + profiler: Optional[Union[BaseProfiler, bool, str]] = None, benchmark: bool = False, deterministic: bool = False, reload_dataloaders_every_epoch: bool = False, @@ -212,7 +212,8 @@ def __init__( progress_bar_refresh_rate: How often to refresh progress bar (in steps). Value ``0`` disables progress bar. Ignored when a custom callback is passed to :paramref:`~Trainer.callbacks`. - profiler: To profile individual steps during training and assist in identifying bottlenecks. + profiler: To profile individual steps during training and assist in identifying bottlenecks. Passing bool + value is deprecated in v1.1 and will be removed in v1.3. overfit_batches: Overfit a percent of training data (float) or a set number of batches (int). Default: 0.0 diff --git a/pytorch_lightning/utilities/argparse_utils.py b/pytorch_lightning/utilities/argparse_utils.py index f3cf2e5f1b90d..bbb89ad09aa48 100644 --- a/pytorch_lightning/utilities/argparse_utils.py +++ b/pytorch_lightning/utilities/argparse_utils.py @@ -174,8 +174,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: # if the only arg type is bool if len(arg_types) == 1: use_type = parsing.str_to_bool - # if only two args (str, bool) - elif len(arg_types) == 2 and set(arg_types) == {str, bool}: + elif str in arg_types: use_type = parsing.str_to_bool_or_str else: # filter out the bool as we need to use more general diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index c8a7b1d270e35..60f13383d3777 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -1,13 +1,17 @@ """Test deprecated functionality which will be removed in vX.Y.Z""" +from argparse import ArgumentParser import pytest import sys +from unittest import mock import torch from tests.base import EvalModelTemplate from pytorch_lightning.metrics.functional.classification import auc +from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.profiler.profilers import PassThroughProfiler, SimpleProfiler from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -22,6 +26,37 @@ def test_tbd_remove_in_v1_2_0(): checkpoint_cb = ModelCheckpoint(filepath='.', dirpath='.') +# TODO: remove bool from Trainer.profiler param in v1.3.0, update profiler_connector.py +@pytest.mark.parametrize(['profiler', 'expected'], [ + (True, SimpleProfiler), + (False, PassThroughProfiler), +]) +def test_trainer_profiler_remove_in_v1_3_0(profiler, expected): + with pytest.deprecated_call(match='will be removed in v1.3'): + trainer = Trainer(profiler=profiler) + assert 
isinstance(trainer.profiler, expected) + + +@pytest.mark.parametrize( + ['cli_args', 'expected_parsed_arg', 'expected_profiler'], + [ + ('--profiler', True, SimpleProfiler), + ('--profiler True', True, SimpleProfiler), + ('--profiler False', False, PassThroughProfiler), + ], +) +def test_trainer_cli_profiler_remove_in_v1_3_0(cli_args, expected_parsed_arg, expected_profiler): + cli_args = cli_args.split(' ') + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args): + parser = ArgumentParser(add_help=False) + parser = Trainer.add_argparse_args(parent_parser=parser) + args = Trainer.parse_argparser(parser) + + assert getattr(args, "profiler") == expected_parsed_arg + trainer = Trainer.from_argparse_args(args) + assert isinstance(trainer.profiler, expected_profiler) + + def _soft_unimport_module(str_module): # once the module is imported e.g with parsing with pytest it lives in memory if str_module in sys.modules: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 4841d1461fec6..35257e28704ba 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -32,6 +32,7 @@ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.profiler.profilers import AdvancedProfiler, PassThroughProfiler, SimpleProfiler from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -1408,3 +1409,32 @@ def test_log_every_n_steps(log_metrics_mock, tmpdir, train_batches, max_steps, l trainer.fit(model) expected_calls = [call(metrics=ANY, step=s) for s in range(log_interval - 1, max_steps, log_interval)] log_metrics_mock.assert_has_calls(expected_calls) + + +@pytest.mark.parametrize(['profiler', 'expected'], [ + (None, PassThroughProfiler), + (SimpleProfiler(), SimpleProfiler), + (AdvancedProfiler(), AdvancedProfiler), + ('simple', SimpleProfiler), + ('Simple', SimpleProfiler), + ('advanced', AdvancedProfiler), +]) +def test_trainer_profiler_correct_args(profiler, expected): + kwargs = {'profiler': profiler} if profiler is not None else {} + trainer = Trainer(**kwargs) + assert isinstance(trainer.profiler, expected) + + +def test_trainer_profiler_incorrect_str_arg(): + with pytest.raises(ValueError, match=r".*can only be 'simple' or 'advanced'"): + Trainer(profiler="unknown_profiler") + + +@pytest.mark.parametrize('profiler', ( + 42, [42], {"a": 42}, torch.tensor(42), Trainer(), +)) +def test_trainer_profiler_incorrect_arg_type(profiler): + with pytest.raises(MisconfigurationException, + match=r"Only None, bool, str and subclasses of `BaseProfiler` " + r"are valid values for `Trainer`'s `profiler` parameter. 
*"): + Trainer(profiler=profiler) From 6878f3bf4e2df94eed430cb9db4b47e419948af9 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Tue, 27 Oct 2020 05:27:59 -0700 Subject: [PATCH 05/88] Enable DDP Plugin to pass through args to LightningDistributedDataParallel (#4382) * Update ddp_plugin.py * Update ddp_plugin.py * Update ddp_plugin.py * Update test_ddp_plugin.py * Update pytorch_lightning/plugins/ddp_plugin.py * Update pytorch_lightning/plugins/ddp_plugin.py * Fixed imports, make ddp_kwargs protected Co-authored-by: SeanNaren --- pytorch_lightning/plugins/ddp_plugin.py | 27 ++++-- tests/plugins/test_ddp_plugin.py | 111 +++++++++++++++++------- 2 files changed, 103 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/ddp_plugin.py index 27deeeddfdb45..4c4fdc8f0d368 100644 --- a/pytorch_lightning/plugins/ddp_plugin.py +++ b/pytorch_lightning/plugins/ddp_plugin.py @@ -1,12 +1,16 @@ -from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel +from typing import List, Dict, Any + from pytorch_lightning.core.lightning import LightningModule -from typing import List +from pytorch_lightning.overrides.data_parallel import LightningDistributedDataParallel class DDPPlugin(object): """ Plugin to link a custom ddp implementation to any arbitrary accelerator. + This plugin forwards all constructor arguments to `LightningDistributedDataParallel`, + which in turn forwards all args to `DistributedDataParallel`. + Example:: class MyDDP(DDPPlugin): @@ -17,11 +21,16 @@ def configure_ddp(self, model, device_ids): my_ddp = MyDDP() trainer = Trainer(accelerator='ddp_x', plugins=[my_ddp]) - """ - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: + def __init__(self, **kwargs): + self._ddp_kwargs: Dict[str, Any] = kwargs + + def configure_ddp( + self, model: LightningModule, device_ids: List[int] + ) -> LightningDistributedDataParallel: """ + Pass through all customizations from constructor to `LightningDistributedDataParallel`. Override to define a custom DDP implementation. .. 
note:: Only requirement is that your DDP implementation subclasses LightningDistributedDataParallel @@ -43,5 +52,13 @@ def configure_ddp(self, model, device_ids): the model wrapped in LightningDistributedDataParallel """ - model = LightningDistributedDataParallel(model, device_ids=device_ids, find_unused_parameters=True) + # if unset, default `find_unused_parameters` `True` + self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get( + "find_unused_parameters", True + ) + model = LightningDistributedDataParallel( + model, + device_ids=device_ids, + **self._ddp_kwargs, + ) return model diff --git a/tests/plugins/test_ddp_plugin.py b/tests/plugins/test_ddp_plugin.py index b190f34395522..69cd0e3beb7b4 100644 --- a/tests/plugins/test_ddp_plugin.py +++ b/tests/plugins/test_ddp_plugin.py @@ -1,25 +1,30 @@ -from pytorch_lightning.callbacks import Callback -from tests.base.boring_model import BoringModel -from pytorch_lightning import accelerators, Trainer -from pytorch_lightning.plugins.ddp_plugin import DDPPlugin -import pytest import os from unittest import mock +import pytest +from pytorch_lightning import Trainer, accelerators +from pytorch_lightning.callbacks import Callback +from pytorch_lightning.plugins.ddp_plugin import DDPPlugin +from tests.base.boring_model import BoringModel -@mock.patch.dict(os.environ, { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0" -}) -@mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) -def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("torch.cuda.device_count", return_value=2) +@pytest.mark.parametrize( + ["ddp_backend", "gpus", "num_processes"], + [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], +) +def test_ddp_choice_default_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert isinstance(trainer.accelerator_backend.ddp_plugin, DDPPlugin) @@ -31,24 +36,29 @@ def on_fit_start(self, trainer, pl_module): gpus=gpus, num_processes=num_processes, distributed_backend=ddp_backend, - callbacks=[CB()] + callbacks=[CB()], ) with pytest.raises(SystemExit): trainer.fit(model) -@mock.patch.dict(os.environ, { - "CUDA_VISIBLE_DEVICES": "0,1", - "SLURM_NTASKS": "2", - "SLURM_JOB_NAME": "SOME_NAME", - "SLURM_NODEID": "0", - "LOCAL_RANK": "0", - "SLURM_LOCALID": "0" -}) -@mock.patch('torch.cuda.device_count', return_value=2) -@pytest.mark.parametrize(['ddp_backend', 'gpus', 'num_processes'], - [('ddp_cpu', None, None), ('ddp', 2, 0), ('ddp2', 2, 0), ('ddp_spawn', 2, 0)]) +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("torch.cuda.device_count", return_value=2) +@pytest.mark.parametrize( + ["ddp_backend", "gpus", "num_processes"], + [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], +) def test_ddp_choice_custom_ddp_cpu(tmpdir, ddp_backend, gpus, num_processes): class MyDDP(DDPPlugin): 
pass @@ -65,7 +75,48 @@ def on_fit_start(self, trainer, pl_module): num_processes=num_processes, distributed_backend=ddp_backend, plugins=[MyDDP()], - callbacks=[CB()] + callbacks=[CB()], + ) + + with pytest.raises(SystemExit): + trainer.fit(model) + + +@mock.patch.dict( + os.environ, + { + "CUDA_VISIBLE_DEVICES": "0,1", + "SLURM_NTASKS": "2", + "SLURM_JOB_NAME": "SOME_NAME", + "SLURM_NODEID": "0", + "LOCAL_RANK": "0", + "SLURM_LOCALID": "0", + }, +) +@mock.patch("torch.cuda.device_count", return_value=2) +@pytest.mark.parametrize( + ["ddp_backend", "gpus", "num_processes"], + [("ddp_cpu", None, None), ("ddp", 2, 0), ("ddp2", 2, 0), ("ddp_spawn", 2, 0)], +) +def test_ddp_choice_custom_ddp_cpu_custom_args( + tmpdir, ddp_backend, gpus, num_processes +): + class MyDDP(DDPPlugin): + pass + + class CB(Callback): + def on_fit_start(self, trainer, pl_module): + assert isinstance(trainer.accelerator_backend.ddp_plugin, MyDDP) + raise SystemExit() + + model = BoringModel() + trainer = Trainer( + fast_dev_run=True, + gpus=gpus, + num_processes=num_processes, + distributed_backend=ddp_backend, + plugins=[MyDDP(broadcast_buffers=False, find_unused_parameters=True)], + callbacks=[CB()], ) with pytest.raises(SystemExit): From 4106e2f11292979022c437b9c49b0b3348f4682f Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 27 Oct 2020 17:30:56 +0300 Subject: [PATCH 06/88] Fix COMET_EXPERIMENT_KEY environment variable usage in comet logger (#4230) * Fix COMET_EXPERIMENT_KEY environment variable usage * Remove unused arg * Update comet.py * Add test by Lothiraldan * remove blank Co-authored-by: chaton Co-authored-by: Nicki Skafte Co-authored-by: Sean Naren --- pytorch_lightning/loggers/comet.py | 8 ++++++-- tests/loggers/test_comet.py | 33 +++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/loggers/comet.py b/pytorch_lightning/loggers/comet.py index 553d5186a979d..40cff1bc7e819 100644 --- a/pytorch_lightning/loggers/comet.py +++ b/pytorch_lightning/loggers/comet.py @@ -187,7 +187,6 @@ def experiment(self): if self._future_experiment_key is not None: os.environ["COMET_EXPERIMENT_KEY"] = self._future_experiment_key - self._future_experiment_key = None try: if self.mode == "online": @@ -212,7 +211,9 @@ def experiment(self): **self._kwargs, ) finally: - os.environ.pop("COMET_EXPERIMENT_KEY", None) + if self._future_experiment_key is not None: + os.environ.pop("COMET_EXPERIMENT_KEY") + self._future_experiment_key = None if self._experiment_name: self._experiment.set_name(self._experiment_name) @@ -278,6 +279,9 @@ def version(self) -> str: if self._experiment_key is not None: return self._experiment_key + if "COMET_EXPERIMENT_KEY" in os.environ: + return os.environ["COMET_EXPERIMENT_KEY"] + if self._future_experiment_key is not None: return self._future_experiment_key diff --git a/tests/loggers/test_comet.py b/tests/loggers/test_comet.py index dc66f1842c62c..87af510e49219 100644 --- a/tests/loggers/test_comet.py +++ b/tests/loggers/test_comet.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -from unittest.mock import patch +from unittest.mock import patch, DEFAULT import pytest @@ -99,6 +99,37 @@ def test_comet_logger_experiment_name(comet): comet_experiment().set_name.assert_called_once_with(experiment_name) +@patch('pytorch_lightning.loggers.comet.comet_ml') +def test_comet_logger_manual_experiment_key(comet): + """Test that Comet Logger respects manually set COMET_EXPERIMENT_KEY.""" + + api_key = "key" + experiment_key = "96346da91469407a85641afe5766b554" + + instantation_environ = {} + + def save_os_environ(*args, **kwargs): + nonlocal instantation_environ + instantation_environ = os.environ.copy() + + return DEFAULT + + # Test api_key given + with patch.dict(os.environ, {"COMET_EXPERIMENT_KEY": experiment_key}): + with patch('pytorch_lightning.loggers.comet.CometExperiment', side_effect=save_os_environ) as comet_experiment: + logger = CometLogger(api_key=api_key) + + assert logger.version == experiment_key + + assert logger._experiment is None + + _ = logger.experiment + + comet_experiment.assert_called_once_with(api_key=api_key, project_name=None) + + assert instantation_environ["COMET_EXPERIMENT_KEY"] == experiment_key + + @patch('pytorch_lightning.loggers.comet.CometOfflineExperiment') @patch('pytorch_lightning.loggers.comet.comet_ml') def test_comet_logger_dirs_creation(comet, comet_experiment, tmpdir, monkeypatch): From 5d10a36762776c4b6f6a9c55b4e6bf7bd258137f Mon Sep 17 00:00:00 2001 From: William Falcon Date: Tue, 27 Oct 2020 18:15:26 -0400 Subject: [PATCH 07/88] Update __init__.py --- pytorch_lightning/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 9c476680ceba9..d28c67030ad30 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,6 +1,6 @@ """Root package info.""" -__version__ = '1.0.4rc1' +__version__ = '1.0.4' __author__ = 'William Falcon et al.' __author_email__ = 'waf2107@columbia.edu' __license__ = 'Apache-2.0' From 00cc69aed7b63fc8a23c6149f22106a0b041d04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 28 Oct 2020 10:51:08 +0100 Subject: [PATCH 08/88] Add "monitor" to saved ModelCheckpoints (#4383) * Add key * Remove unused variables * Update CHANGELOG [skip ci] * best_model_monitor -> monitor Co-authored-by: Sean Naren Co-authored-by: Rohit Gupta --- CHANGELOG.md | 7 +++++-- pytorch_lightning/callbacks/model_checkpoint.py | 3 +-- tests/checkpointing/test_model_checkpoint.py | 5 +---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8ed74235c859..b20d4ae3eab0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) -- Added plugins docs and DDPPlugin to customize ddp across all accelerators([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) - Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) @@ -21,7 +21,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) -- Added autogenerated helptext to `Trainer.add_argparse_args`. ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) +- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) + + +- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) - Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 6c6a1741c31c5..4c9d3f4e30072 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -131,8 +131,6 @@ class ModelCheckpoint(Callback): CHECKPOINT_JOIN_CHAR = "-" CHECKPOINT_NAME_LAST = "last" - CHECKPOINT_STATE_BEST_SCORE = "checkpoint_callback_best_model_score" - CHECKPOINT_STATE_BEST_PATH = "checkpoint_callback_best_model_path" def __init__( self, @@ -187,6 +185,7 @@ def on_validation_end(self, trainer, pl_module): def on_save_checkpoint(self, trainer, pl_module) -> Dict[str, Any]: return { + "monitor": self.monitor, "best_model_score": self.best_model_score, "best_model_path": self.best_model_path, } diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 976a91f551e0a..1634b73424dd1 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -507,10 +507,7 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): assert all(ckpt_last_epoch[k] == ckpt_last[k] for k in ("epoch", "global_step")) ch_type = type(model_checkpoint) - assert all(list( - ckpt_last["callbacks"][ch_type][k] == ckpt_last_epoch["callbacks"][ch_type][k] - for k in ("best_model_score", "best_model_path") - )) + assert ckpt_last["callbacks"][ch_type] == ckpt_last_epoch["callbacks"][ch_type] # it is easier to load the model objects than to iterate over the raw dict of tensors model_last_epoch = EvalModelTemplate.load_from_checkpoint(path_last_epoch) From b26c71eadf11d3a6aa9428504b8e646e0a8a03f8 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 28 Oct 2020 17:45:22 +0530 Subject: [PATCH 09/88] Add optimizer hooks in callbacks (#4379) * Add optimizer hooks in callbacks * optimizer param * update test Co-authored-by: Nicki Skafte --- pytorch_lightning/callbacks/base.py | 12 ++++++++++++ pytorch_lightning/trainer/callback_hook.py | 14 ++++++++++++++ pytorch_lightning/trainer/training_loop.py | 3 +-- tests/callbacks/test_callbacks.py | 18 ++++++++++++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/callbacks/base.py b/pytorch_lightning/callbacks/base.py index 591703c245543..004aa6d737b4a 100644 --- a/pytorch_lightning/callbacks/base.py +++ b/pytorch_lightning/callbacks/base.py @@ -166,3 +166,15 @@ def on_save_checkpoint(self, trainer, pl_module): def on_load_checkpoint(self, checkpointed_state): """Called when loading a model checkpoint, use to reload state.""" pass + + def on_after_backward(self, trainer, pl_module): + """ + Called after loss.backward() and before optimizers do anything. + """ + pass + + def on_before_zero_grad(self, trainer, pl_module, optimizer): + """ + Called after optimizer.step() and before optimizer.zero_grad(). 
+ """ + pass diff --git a/pytorch_lightning/trainer/callback_hook.py b/pytorch_lightning/trainer/callback_hook.py index 46f2a32c0a8f1..8f3885c20fcdc 100644 --- a/pytorch_lightning/trainer/callback_hook.py +++ b/pytorch_lightning/trainer/callback_hook.py @@ -209,3 +209,17 @@ def on_load_checkpoint(self, checkpoint): if state: state = deepcopy(state) callback.on_load_checkpoint(state) + + def on_after_backward(self): + """ + Called after loss.backward() and before optimizers do anything. + """ + for callback in self.callbacks: + callback.on_after_backward(self, self.get_model()) + + def on_before_zero_grad(self, optimizer): + """ + Called after optimizer.step() and before optimizer.zero_grad(). + """ + for callback in self.callbacks: + callback.on_before_zero_grad(self, self.get_model(), optimizer) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d32f47dbbd485..934938c63f0f6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -454,8 +454,7 @@ def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_ ) def on_before_zero_grad(self, optimizer): - model = self.trainer.get_model() - model.on_before_zero_grad(optimizer) + self.trainer.call_hook('on_before_zero_grad', optimizer) def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): self.trainer.accelerator_backend.optimizer_zero_grad(batch_idx, optimizer, opt_idx) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index bb7ec8430a7df..cf88f52436576 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -55,6 +55,8 @@ def __init__(self): self.on_validation_end_called = False self.on_test_start_called = False self.on_test_end_called = False + self.on_after_backward_called = False + self.on_before_zero_grad_called = False def setup(self, trainer, pl_module, stage: str): assert isinstance(trainer, Trainer) @@ -160,6 +162,14 @@ def on_test_end(self, trainer, pl_module): _check_args(trainer, pl_module) self.on_test_end_called = True + def on_after_backward(self, trainer, pl_module): + _check_args(trainer, pl_module) + self.on_after_backward_called = True + + def on_before_zero_grad(self, trainer, pl_module, optimizer): + _check_args(trainer, pl_module) + self.on_before_zero_grad_called = True + test_callback = TestCallback() trainer_options = dict( @@ -197,6 +207,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_validation_end_called assert not test_callback.on_test_start_called assert not test_callback.on_test_end_called + assert not test_callback.on_after_backward_called + assert not test_callback.on_before_zero_grad_called # fit model trainer = Trainer(**trainer_options) @@ -228,6 +240,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_validation_end_called assert not test_callback.on_test_start_called assert not test_callback.on_test_end_called + assert not test_callback.on_after_backward_called + assert not test_callback.on_before_zero_grad_called trainer.fit(model) @@ -257,6 +271,8 @@ def on_test_end(self, trainer, pl_module): assert not test_callback.on_test_batch_end_called assert not test_callback.on_test_start_called assert not test_callback.on_test_end_called + assert test_callback.on_after_backward_called + assert test_callback.on_before_zero_grad_called # reset setup teardown callback test_callback.teardown_called = False @@ -277,3 +293,5 @@ def on_test_end(self, trainer, 
pl_module): assert not test_callback.on_validation_end_called assert not test_callback.on_validation_batch_end_called assert not test_callback.on_validation_batch_start_called + assert not test_callback.on_after_backward_called + assert not test_callback.on_before_zero_grad_called From 4a095e1c5cb2474767fbb5b90aa7961dbc81ba4e Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Wed, 28 Oct 2020 16:42:52 +0100 Subject: [PATCH 10/88] [Docs] Warning on metric states (#4388) * warning on states * suggestion * Update metrics.rst Co-authored-by: Sean Naren --- docs/source/metrics.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index dadf654b1ef67..cf7dcfbecc183 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -78,7 +78,6 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v self.valid_acc(logits, y) self.log('valid_acc', self.valid_acc, on_step=True, on_epoch=True) - This metrics API is independent of PyTorch Lightning. Metrics can directly be used in PyTorch as shown in the example: .. code-block:: python @@ -105,6 +104,13 @@ This metrics API is independent of PyTorch Lightning. Metrics can directly be us # total accuracy over all validation batches total_valid_accuracy = valid_accuracy.compute() +.. note:: + + Metrics contain internal states that keep track of the data seen so far. + Do not mix metric states across training, validation and testing. + It is highly recommended to re-initialize the metric per mode as + shown in the examples above. + ********************* Implementing a Metric ********************* From 1e1a42260a8e8eb4e5d9aedcb2bf41b0b2eb63bc Mon Sep 17 00:00:00 2001 From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com> Date: Wed, 28 Oct 2020 12:26:58 -0400 Subject: [PATCH 11/88] add option to log momentum (#4384) * add option to log momentum * add docstring * refactor for cleanliness Co-authored-by: Nicki Skafte --- pytorch_lightning/callbacks/lr_monitor.py | 40 ++++++++++++++++++----- tests/base/model_optimizers.py | 7 ++++ tests/callbacks/test_lr_monitor.py | 28 ++++++++++++++++ 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/callbacks/lr_monitor.py b/pytorch_lightning/callbacks/lr_monitor.py index b6de6107b0bc9..7502829044200 100755 --- a/pytorch_lightning/callbacks/lr_monitor.py +++ b/pytorch_lightning/callbacks/lr_monitor.py @@ -36,6 +36,8 @@ class LearningRateMonitor(Callback): logging_interval: set to `epoch` or `step` to log `lr` of all optimizers at the same interval, set to `None` to log at individual interval according to the `interval` key of each scheduler. Defaults to ``None``. + log_momentum: option to also log the momentum values of the optimizer, if the optimizer + has the `momentum` attribute. Defaults to ``False``. Example:: @@ -59,13 +61,14 @@ def configure_optimizer(self): return [optimizer], [lr_scheduler] """ - def __init__(self, logging_interval: Optional[str] = None): + def __init__(self, logging_interval: Optional[str] = None, log_momentum: bool = False): if logging_interval not in (None, 'step', 'epoch'): raise MisconfigurationException( 'logging_interval should be `step` or `epoch` or `None`.' 
) self.logging_interval = logging_interval + self.log_momentum = log_momentum self.lrs = None self.lr_sch_names = [] @@ -92,6 +95,7 @@ def on_train_start(self, trainer, *args, **kwargs): # Initialize for storing values self.lrs = {name: [] for name in names} + self.last_momentum_values = {name + "-momentum": None for name in names} def on_train_batch_start(self, trainer, *args, **kwargs): if not self._should_log(trainer): @@ -99,7 +103,7 @@ def on_train_batch_start(self, trainer, *args, **kwargs): if self.logging_interval != 'epoch': interval = 'step' if self.logging_interval is None else 'any' - latest_stat = self._extract_lr(trainer, interval) + latest_stat = self._extract_stats(trainer, interval) if trainer.logger is not None and latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.global_step) @@ -107,12 +111,12 @@ def on_train_batch_start(self, trainer, *args, **kwargs): def on_train_epoch_start(self, trainer, *args, **kwargs): if self.logging_interval != 'step': interval = 'epoch' if self.logging_interval is None else 'any' - latest_stat = self._extract_lr(trainer, interval) + latest_stat = self._extract_stats(trainer, interval) if trainer.logger is not None and latest_stat: trainer.logger.log_metrics(latest_stat, step=trainer.current_epoch) - def _extract_lr(self, trainer, interval: str) -> Dict[str, float]: + def _extract_stats(self, trainer, interval: str) -> Dict[str, float]: latest_stat = {} for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): @@ -120,15 +124,33 @@ def _extract_lr(self, trainer, interval: str) -> Dict[str, float]: param_groups = scheduler['scheduler'].optimizer.param_groups if len(param_groups) != 1: for i, pg in enumerate(param_groups): - lr, key = pg['lr'], f'{name}/pg{i + 1}' - self.lrs[key].append(lr) - latest_stat[key] = lr + lr = self._extract_lr(param_group=pg, name=f'{name}/pg{i + 1}') + latest_stat.update(lr) + momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum/pg{i + 1}') + latest_stat.update(momentum) + else: - self.lrs[name].append(param_groups[0]['lr']) - latest_stat[name] = param_groups[0]['lr'] + pg = param_groups[0] + lr = self._extract_lr(param_group=pg, name=name) + latest_stat.update(lr) + momentum = self._extract_momentum(param_group=pg, name=f'{name}-momentum') + latest_stat.update(momentum) return latest_stat + def _extract_lr(self, param_group, name: str) -> Dict[str, float]: + lr = param_group.get('lr') + self.lrs[name].append(lr) + return {name: lr} + + def _extract_momentum(self, param_group, name: str) -> Dict[str, float]: + if not self.log_momentum: + return {} + + momentum = param_group.get('momentum') + self.last_momentum_values[name] = momentum + return {name: momentum} + def _find_names(self, lr_schedulers) -> List[str]: # Create uniqe names in the case we have multiple of the same learning # rate schduler + multiple parameter groups diff --git a/tests/base/model_optimizers.py b/tests/base/model_optimizers.py index e4b8d489f872d..46574adfd0b4d 100644 --- a/tests/base/model_optimizers.py +++ b/tests/base/model_optimizers.py @@ -64,6 +64,13 @@ def configure_optimizers__single_scheduler(self): lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1) return [optimizer], [lr_scheduler] + def configure_optimizers__onecycle_scheduler(self): + optimizer = optim.SGD(self.parameters(), lr=self.learning_rate, momentum=0.9) + lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer, + max_lr=self.learning_rate, + total_steps=10_000) + return [optimizer], [lr_scheduler] + def 
configure_optimizers__multiple_schedulers(self): optimizer1 = optim.Adam(self.parameters(), lr=self.learning_rate) optimizer2 = optim.Adam(self.parameters(), lr=self.learning_rate) diff --git a/tests/callbacks/test_lr_monitor.py b/tests/callbacks/test_lr_monitor.py index 973dd64c08b14..a6783435ed3e2 100644 --- a/tests/callbacks/test_lr_monitor.py +++ b/tests/callbacks/test_lr_monitor.py @@ -39,12 +39,40 @@ def test_lr_monitor_single_lr(tmpdir): assert result assert lr_monitor.lrs, 'No learning rates logged' + assert all(v is None for v in lr_monitor.last_momentum_values.values()), \ + 'Momentum should not be logged by default' assert len(lr_monitor.lrs) == len(trainer.lr_schedulers), \ 'Number of learning rates logged does not match number of lr schedulers' assert all([k in ['lr-Adam'] for k in lr_monitor.lrs.keys()]), \ 'Names of learning rates not set correctly' +def test_lr_monitor_single_lr_with_momentum(tmpdir): + """ Test that learning rates and momentum are extracted and logged for single lr scheduler. """ + tutils.reset_seed() + + model = EvalModelTemplate() + model.configure_optimizers = model.configure_optimizers__onecycle_scheduler + + lr_monitor = LearningRateMonitor(log_momentum=True) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_val_batches=0.1, + limit_train_batches=0.5, + callbacks=[lr_monitor], + ) + result = trainer.fit(model) + assert result + + assert all(v is not None for v in lr_monitor.last_momentum_values.values()), \ + 'Expected momentum to be logged' + assert len(lr_monitor.last_momentum_values) == len(trainer.lr_schedulers), \ + 'Number of momentum values logged does not match number of lr schedulers' + assert all([k in ['lr-SGD-momentum'] for k in lr_monitor.last_momentum_values.keys()]), \ + 'Names of momentum values not set correctly' + + def test_lr_monitor_no_lr_scheduler(tmpdir): tutils.reset_seed() From 41de4538aa0c187793709a93875e67666c2ddde8 Mon Sep 17 00:00:00 2001 From: Teddy Koker Date: Wed, 28 Oct 2020 13:25:36 -0400 Subject: [PATCH 12/88] Add individuals to Metrics in CODEOWNERS (#4413) * ananyahjha93 and teddykoker to codeowners for metrics * add Justus Co-authored-by: Sean Naren --- .github/CODEOWNERS | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b048baf1302f6..0316d766752ca 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,4 +5,11 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. 
-* @williamfalcon @borda @teddykoker @awaelchli @nateraw @justusschock @tchaton @SeanNaren @ananyahjha93 +* @williamfalcon @borda @teddykoker @awaelchli @nateraw @justusschock @tchaton @SeanNaren @ananyahjha93 + +# Metrics +/pytorch_lightning/metrics/* @teddykoker @ananyahjha93 @justusschock +/tests/metrics/* @teddykoker @ananyahjha93 @justusschock +/docs/source/metrics.rst @teddykoker @ananyahjha93 @justusschock + + From ff41d80706c49e34f33215b1f1cf86619b19dfc9 Mon Sep 17 00:00:00 2001 From: Boris Dayma Date: Wed, 28 Oct 2020 14:37:06 -0500 Subject: [PATCH 13/88] feat(wandb): log in sync with Trainer step (#4405) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(wandb): log in sync with Trainer step * docs: update CHANGELOG * style(test_wandb): fix formatting * parentheses Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta --- CHANGELOG.md | 3 +++ pytorch_lightning/loggers/wandb.py | 9 +++++++-- tests/loggers/test_wandb.py | 10 ++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b20d4ae3eab0c..1de62b442f25b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed santized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) +- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) + + ### Deprecated diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index e6ce264d597bf..5786a52a8e371 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -94,6 +94,8 @@ def __init__( self._offline = offline self._log_model = log_model self._kwargs = kwargs + # logging multiple Trainer on a single W&B run (k-fold, etc) + self._step_offset = 0 def __getstate__(self): state = self.__dict__.copy() @@ -141,8 +143,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: @rank_zero_only def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, 'experiment tried to log from global_rank != 0' - - self.experiment.log({'global_step': step, **metrics} if step is not None else metrics) + self.experiment.log(metrics, step=(step + self._step_offset) if step is not None else None) @property def save_dir(self) -> Optional[str]: @@ -159,6 +160,10 @@ def version(self) -> Optional[str]: return self._experiment.id if self._experiment else self._id def finalize(self, status: str) -> None: + # offset future training logged on same W&B run + if self._experiment is not None: + self._step_offset = self._experiment.step + # upload all checkpoints from saving dir if self._log_model: wandb.save(os.path.join(self.save_dir, "*.ckpt")) diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index 6682cfdc8830a..cfb6533bd913b 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -29,11 +29,17 @@ def test_wandb_logger(wandb): logger = WandbLogger(anonymous=True, offline=True) logger.log_metrics({'acc': 1.0}) - wandb.init().log.assert_called_once_with({'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 1.0}, step=None) wandb.init().log.reset_mock() logger.log_metrics({'acc': 1.0}, step=3) - wandb.init().log.assert_called_once_with({'global_step': 3, 'acc': 1.0}) + wandb.init().log.assert_called_once_with({'acc': 
1.0}, step=3) + + # continue training on same W&B run + wandb.init().step = 3 + logger.finalize('success') + logger.log_metrics({'acc': 1.0}, step=3) + wandb.init().log.assert_called_with({'acc': 1.0}, step=6) logger.log_hyperparams({'test': None, 'nested': {'a': 1}, 'b': [2, 3, 4]}) wandb.init().config.update.assert_called_once_with( From 9cfd29946a308526cee088c176a2285e6752453c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stef=20=7C=20=E3=82=B9=E3=83=86=E3=83=95?= Date: Thu, 29 Oct 2020 14:46:57 +0900 Subject: [PATCH 14/88] move example inputs to correct device when tracing module (#4360) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * use move_data_to_device instead of to; docstring also allow tuple of Tensor; not supported log error when example_inputs is a dict; commented docstring trace example * Use isinstance to check if example_inputs is a Mapping, instead of type Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * import Mapping for isinstance check * multi-line docstring code to test TorchScript trace() * Fix PEP8 f-string is missing placeholders * minor code style improvements * Use (possibly user overwritten) transfer_batch_to_device instead of move_data_to_device Co-authored-by: Rohit Gupta * fixed weird comment about trace() log error * Remove unused import Co-authored-by: Jeff Yang * Remove logger warning about dict not example_inputs not supported by trace Co-authored-by: stef-ubuntu Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta Co-authored-by: Jeff Yang --- pytorch_lightning/core/lightning.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 065b29c75da37..22d63d0a03a74 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -20,7 +20,7 @@ import tempfile from abc import ABC from argparse import Namespace -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, Mapping import torch from pytorch_lightning import _logger as log @@ -1539,7 +1539,7 @@ def to_onnx(self, file_path: str, input_sample: Optional[Tensor] = None, **kwarg def to_torchscript( self, file_path: Optional[str] = None, method: Optional[str] = 'script', - example_inputs: Optional[torch.Tensor] = None, **kwargs + example_inputs: Optional[Union[torch.Tensor, Tuple[torch.Tensor]]] = None, **kwargs ) -> Union[ScriptModule, Dict[str, ScriptModule]]: """ By default compiles the whole model to a :class:`~torch.jit.ScriptModule`. @@ -1576,6 +1576,9 @@ def to_torchscript( >>> model = SimpleModel() >>> torch.jit.save(model.to_torchscript(), "model.pt") # doctest: +SKIP >>> os.path.isfile("model.pt") # doctest: +SKIP + >>> torch.jit.save(model.to_torchscript(file_path="model_trace.pt", method='trace', # doctest: +SKIP + ... 
example_inputs=torch.randn(1, 64))) # doctest: +SKIP
+        >>> os.path.isfile("model_trace.pt")  # doctest: +SKIP
         True

     Return:
@@ -1592,8 +1595,8 @@ def to_torchscript(
             if example_inputs is None:
                 example_inputs = self.example_input_array
             # automatically send example inputs to the right device and use trace
-            torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs.to(self.device),
-                                                 **kwargs)
+            example_inputs = self.transfer_batch_to_device(example_inputs, device=self.device)
+            torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs)
         else:
             raise ValueError(f"The 'method' parameter only supports 'script' or 'trace', but value given was:"
                              f"{method}")

From ce261e4afe230c103159ff8f846324ce956ecc60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 29 Oct 2020 10:00:07 +0100
Subject: [PATCH 15/88] Fix CSV logger warning (#4419)

Co-authored-by: Jeff Yang
---
 pytorch_lightning/loggers/csv_logs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/loggers/csv_logs.py b/pytorch_lightning/loggers/csv_logs.py
index c22f46eb03f5a..2d478855eb1b5 100644
--- a/pytorch_lightning/loggers/csv_logs.py
+++ b/pytorch_lightning/loggers/csv_logs.py
@@ -52,7 +52,7 @@ def __init__(self, log_dir: str) -> None:
         self.metrics = []
         self.log_dir = log_dir

-        if os.path.exists(self.log_dir):
+        if os.path.exists(self.log_dir) and os.listdir(self.log_dir):
             rank_zero_warn(
                 f"Experiment logs directory {self.log_dir} exists and is not empty."
                 " Previous log files in this directory will be deleted when the new ones are saved!"

From b459fd26ac773484e4c97c12e4bab221bb1609b0 Mon Sep 17 00:00:00 2001
From: Martin Hwang
Date: Thu, 29 Oct 2020 18:50:37 +0900
Subject: [PATCH 16/88] fix: `nb` is set to the total number of devices when nb is -1. (#4209)

* fix: `nb` is set to the total number of devices when nb is -1.

Refs: #4207

* feat: add test code

  1. test combination `auto_select_gpus`, `gpus` options using Trainer
  2. test `pick_multiple_gpus` function directly

Refs: #4207

* docs: modify contents in `Select GPU devices`

Refs: #4207

* refactor: reflect the result of review

Refs: #4207

* refactor: reflect the result of review

Refs: #4207

* Update CHANGELOG.md

Co-authored-by: chaton
Co-authored-by: Roger Shieh <55400948+s-rog@users.noreply.github.com>
Co-authored-by: Nicki Skafte
---
 CHANGELOG.md                               |  7 ++
 docs/source/multi_gpu.rst                  |  2 +
 pytorch_lightning/trainer/__init__.py      |  6 ++
 pytorch_lightning/tuner/auto_gpu_select.py | 10 +++
 tests/tuner/__init__.py                    |  0
 tests/tuner/test_auto_gpu_select.py        | 74 ++++++++++++++++++++++
 6 files changed, 99 insertions(+)
 create mode 100644 tests/tuner/__init__.py
 create mode 100644 tests/tuner/test_auto_gpu_select.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1de62b442f25b..803ece1a51ed2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -66,12 +66,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
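A minimal sketch of the traced export path changed above, assuming a toy module whose `example_input_array` is set so that `method='trace'` can run without an explicit `example_inputs` (the `TraceExample` name and layer sizes are illustrative, not part of the patch):

    import torch
    from pytorch_lightning import LightningModule

    class TraceExample(LightningModule):  # hypothetical toy module
        def __init__(self):
            super().__init__()
            self.l1 = torch.nn.Linear(64, 4)
            # picked up automatically when example_inputs is not passed
            self.example_input_array = torch.randn(1, 64)

        def forward(self, x):
            return torch.relu(self.l1(x))

    model = TraceExample()
    # inputs are now routed through transfer_batch_to_device, so a tuple of
    # tensors works as well as a single tensor
    traced = model.to_torchscript(method='trace')
    torch.jit.save(traced, "model_trace.pt")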
### Fixed + +- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) + + - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) + - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) + - Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) + ## [1.0.3] - 2020-10-20 ### Added diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index ea49601a397cd..8ea8646e136f8 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -206,6 +206,8 @@ Note in particular the difference between `gpus=0`, `gpus=[0]` and `gpus="0"`. `auto_select_gpus=True` will automatically help you find `k` gpus that are not occupied by other processes. This is especially useful when GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. + For more details see the :ref:`Trainer guide `. + Remove CUDA flags ^^^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index a4bf2969f4a77..954befb00a468 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -381,6 +381,12 @@ def training_step(self, batch, batch_idx, optimizer_idx): # enable auto selection (will find two available gpus on system) trainer = Trainer(gpus=2, auto_select_gpus=True) + # specifies all GPUs regardless of its availability + Trainer(gpus=-1, auto_select_gpus=False) + + # specifies all available GPUs (if only one GPU is not occupied, uses one gpu) + Trainer(gpus=-1, auto_select_gpus=True) + auto_lr_find ^^^^^^^^^^^^ diff --git a/pytorch_lightning/tuner/auto_gpu_select.py b/pytorch_lightning/tuner/auto_gpu_select.py index f1b13a69745bc..fd2ba4a1f3627 100644 --- a/pytorch_lightning/tuner/auto_gpu_select.py +++ b/pytorch_lightning/tuner/auto_gpu_select.py @@ -13,8 +13,18 @@ # limitations under the License. import torch +from pytorch_lightning.utilities.exceptions import MisconfigurationException + def pick_multiple_gpus(nb): + if nb == 0: + raise MisconfigurationException( + r"auto_select_gpus=True, gpus=0 is not a valid configuration.\ + Please select a valid number of GPU resources when using auto_select_gpus." + ) + + nb = torch.cuda.device_count() if nb == -1 else nb + picked = [] for _ in range(nb): picked.append(pick_single_gpu(exclude_gpus=picked)) diff --git a/tests/tuner/__init__.py b/tests/tuner/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tuner/test_auto_gpu_select.py b/tests/tuner/test_auto_gpu_select.py new file mode 100644 index 0000000000000..36b33a707b99f --- /dev/null +++ b/tests/tuner/test_auto_gpu_select.py @@ -0,0 +1,74 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import re + +import pytest +import torch + +from pytorch_lightning import Trainer +from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires a number of GPU machine greater than 1" +) +@pytest.mark.parametrize( + ["auto_select_gpus", "gpus", "expected_error"], + [ + (True, 0, MisconfigurationException), + (True, -1, None), + (False, 0, None), + (False, -1, None), + ], +) +def test_trainer_with_gpus_options_combination_at_available_gpus_env( + auto_select_gpus, gpus, expected_error +): + if expected_error: + with pytest.raises( + expected_error, + match=re.escape( + r"auto_select_gpus=True, gpus=0 is not a valid configuration.\ + Please select a valid number of GPU resources when using auto_select_gpus." + ), + ): + trainer = Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus) + else: + trainer = Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus) + + +@pytest.mark.skipif( + torch.cuda.device_count() < 2, reason="test requires a number of GPU machine greater than 1" +) +@pytest.mark.parametrize( + ["nb", "expected_gpu_idxs", "expected_error"], + [ + (0, [], MisconfigurationException), + (-1, [i for i in range(torch.cuda.device_count())], None), + (1, [0], None), + ], +) +def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error): + if expected_error: + with pytest.raises( + expected_error, + match=re.escape( + r"auto_select_gpus=True, gpus=0 is not a valid configuration.\ + Please select a valid number of GPU resources when using auto_select_gpus." + ), + ): + pick_multiple_gpus(nb) + else: + assert expected_gpu_idxs == pick_multiple_gpus(nb) From bbd81dfd55eb07c7ee07ee6c0b95f207d3ade353 Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Thu, 29 Oct 2020 18:31:37 +0100 Subject: [PATCH 17/88] Skips DDP parameter sync (#4301) * ddp no-sync * Update pytorch_lightning/trainer/training_loop.py Co-authored-by: ananthsub * Update training_loop.py * factor __enter__ and __exit__ out to separate context manager * delete _updated_model_last_step Co-authored-by: justusschock Co-authored-by: Teddy Koker Co-authored-by: ananthsub Co-authored-by: chaton Co-authored-by: Rohit Gupta --- pytorch_lightning/trainer/training_loop.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 934938c63f0f6..d1dfb3eec3733 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
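The selection behaviour pinned down by the tests above can also be exercised directly. A small sketch (requires a CUDA machine; the printed indices depend on which GPUs happen to be free):

    import torch
    from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus

    # gpus=-1 now expands to every visible device before picking,
    # instead of looping zero times and returning an empty list
    if torch.cuda.is_available():
        print(pick_multiple_gpus(-1))  # e.g. [0, 1] on an idle 2-GPU machine
        print(pick_multiple_gpus(1))   # e.g. [0]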
-import subprocess
+from contextlib import contextmanager
 from copy import copy, deepcopy

 import numpy as np
@@ -655,6 +655,7 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
         # checks if backward or backward + optimizer step (via closure)
         accumulation_done = self._accumulated_batches_reached()
         is_final_batch = self._num_training_batches_reached()
+        should_accumulate = not (accumulation_done or is_final_batch)

         # lightning module hook
         splits = self.tbptt_split_batch(batch)
@@ -675,13 +676,17 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):
                     model = self.trainer.get_model()
                     model.toggle_optimizer(optimizer, opt_idx)

-                if not (accumulation_done or is_final_batch):
+                if should_accumulate:
                     # For gradient accumulation

                     # -------------------
                     # calculate loss (train step + train step end)
                     # -------------------
-                    self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens)
+
+                    # perform ddp sync only when performing optimizer_step
+                    with self.block_ddp_sync_behaviour():
+                        self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens)
+
                     batch_outputs = self._process_closure_result(
                         batch_callback_metrics=batch_callback_metrics,
                         batch_log_metrics=batch_log_metrics,
@@ -695,7 +700,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx):

                 # gradient update with accumulated gradients
                 else:
-
                     if self.automatic_optimization:

                         def train_step_and_backward_closure():
@@ -760,6 +764,13 @@ def train_step_and_backward_closure():
             )
         return result

+    @contextmanager
+    def block_ddp_sync_behaviour(self):
+        if isinstance(self.trainer.model, torch.nn.parallel.DistributedDataParallel):
+            yield from self.trainer.model.no_sync()
+        else:
+            yield
+
     def _process_closure_result(
         self, batch_callback_metrics: list, batch_log_metrics: list, batch_outputs: list, opt_idx: int
     ) -> list:

From ebe3a31ddd82c616df6612cb880b0b3b13b9ecde Mon Sep 17 00:00:00 2001
From: Jeff Yang
Date: Fri, 30 Oct 2020 00:45:24 +0630
Subject: [PATCH 18/88] [docs] distributed_backend -> accelerator (#4429)

* distributed_backend -> accelerator

* distributed_backend -> accelerator

* use_amp -> precision

* format

Co-authored-by: rohitgr7
---
 docs/source/introduction_guide.rst    |  2 +-
 docs/source/lightning_module.rst      |  4 +--
 docs/source/multi_gpu.rst             | 44 +++++++++++++--------------
 docs/source/performance.rst           |  4 +--
 docs/source/slurm.rst                 |  4 +--
 docs/source/tpu.rst                   |  2 +-
 pytorch_lightning/trainer/__init__.py | 14 ++++-----
 7 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst
index e331368c94884..d6d082e2ed779 100644
--- a/docs/source/introduction_guide.rst
+++ b/docs/source/introduction_guide.rst
@@ -543,7 +543,7 @@ Or multiple nodes

     # (32 GPUs)
     model = LitMNIST()
-    trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp')
+    trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp')
     trainer.fit(model, train_loader)

 Refer to the :ref:`distributed computing guide for more details `.
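The `block_ddp_sync_behaviour` context manager added above wraps the standard DDP gradient-accumulation idiom. A rough standalone sketch of that idiom, assuming `model` (possibly DDP-wrapped), `optimizer`, `batches`, `accumulate_grad_batches`, and a `compute_loss` helper are defined elsewhere:

    from contextlib import nullcontext
    from torch.nn.parallel import DistributedDataParallel

    for i, batch in enumerate(batches):
        stepping = (i + 1) % accumulate_grad_batches == 0
        # skip inter-process gradient sync on micro-batches that only
        # accumulate; sync once on the batch that runs the optimizer step
        sync_ctx = (
            model.no_sync()
            if isinstance(model, DistributedDataParallel) and not stepping
            else nullcontext()
        )
        with sync_ctx:
            loss = compute_loss(model, batch)  # stand-in for training_step + backward closure
            loss.backward()
        if stepping:
            optimizer.step()
            optimizer.zero_grad()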
diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst
index 4c297a09a55d8..d13b449614113 100644
--- a/docs/source/lightning_module.rst
+++ b/docs/source/lightning_module.rst
@@ -256,7 +256,7 @@ The matching pseudocode is:
 Training with DataParallel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

-When training using a `distributed_backend` that splits data from each batch across GPUs, sometimes you might
+When training using an `accelerator` that splits data from each batch across GPUs, sometimes you might
 need to aggregate them on the master GPU for processing (dp, or ddp2).

 In this case, implement the `training_step_end` method
@@ -360,7 +360,7 @@ If you need to do something with all the outputs of each `validation_step`, over
 Validating with DataParallel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-When training using a `distributed_backend` that splits data from each batch across GPUs, sometimes you might
+When training using an `accelerator` that splits data from each batch across GPUs, sometimes you might
 need to aggregate them on the master GPU for processing (dp, or ddp2).

 In this case, implement the `validation_step_end` method
diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst
index 8ea8646e136f8..1d2600df02180 100644
--- a/docs/source/multi_gpu.rst
+++ b/docs/source/multi_gpu.rst
@@ -231,11 +231,11 @@ Distributed modes
 -----------------
 Lightning allows multiple ways of training

-- Data Parallel (`distributed_backend='dp'`) (multiple-gpus, 1 machine)
-- DistributedDataParallel (`distributed_backend='ddp'`) (multiple-gpus across many machines (python script based)).
-- DistributedDataParallel (`distributed_backend='ddp_spawn'`) (multiple-gpus across many machines (spawn based)).
-- DistributedDataParallel 2 (`distributed_backend='ddp2'`) (DP in a machine, DDP across machines).
-- Horovod (`distributed_backend='horovod'`) (multi-machine, multi-gpu, configured at runtime)
+- Data Parallel (`accelerator='dp'`) (multiple-gpus, 1 machine)
+- DistributedDataParallel (`accelerator='ddp'`) (multiple-gpus across many machines (python script based)).
+- DistributedDataParallel (`accelerator='ddp_spawn'`) (multiple-gpus across many machines (spawn based)).
+- DistributedDataParallel 2 (`accelerator='ddp2'`) (DP in a machine, DDP across machines).
+- Horovod (`accelerator='horovod'`) (multi-machine, multi-gpu, configured at runtime)
 - TPUs (`tpu_cores=8|x`) (tpu or TPU pod)

 .. note::
@@ -258,7 +258,7 @@ after which the root node will aggregate the results.
     :skipif: torch.cuda.device_count() < 2

     # train on 2 GPUs (using DP mode)
-    trainer = Trainer(gpus=2, distributed_backend='dp')
+    trainer = Trainer(gpus=2, accelerator='dp')

 Distributed Data Parallel
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -281,10 +281,10 @@ Distributed Data Parallel
 .. code-block:: python

     # train on 8 GPUs (same machine (ie: node))
-    trainer = Trainer(gpus=8, distributed_backend='ddp')
+    trainer = Trainer(gpus=8, accelerator='ddp')

     # train on 32 GPUs (4 nodes)
-    trainer = Trainer(gpus=8, distributed_backend='ddp', num_nodes=4)
+    trainer = Trainer(gpus=8, accelerator='ddp', num_nodes=4)

 This Lightning implementation of DDP calls your script under the hood multiple times with the correct environment
 variables:
@@ -330,7 +330,7 @@ In this case, we can use DDP2 which behaves like DP in a machine and DDP across machines.
 .. 
code-block:: python # train on 32 GPUs (4 nodes) - trainer = Trainer(gpus=8, distributed_backend='ddp2', num_nodes=4) + trainer = Trainer(gpus=8, accelerator='ddp2', num_nodes=4) Distributed Data Parallel Spawn ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -348,7 +348,7 @@ project module) you can use the following method: .. code-block:: python # train on 8 GPUs (same machine (ie: node)) - trainer = Trainer(gpus=8, distributed_backend='ddp') + trainer = Trainer(gpus=8, accelerator='ddp') We STRONGLY discourage this use because it has limitations (due to Python and PyTorch): @@ -400,7 +400,7 @@ You can then call your scripts anywhere .. code-block:: bash cd /project/src - python some_file.py --distributed_backend 'ddp' --gpus 8 + python some_file.py --accelerator 'ddp' --gpus 8 Horovod @@ -421,10 +421,10 @@ Horovod can be configured in the training script to run with any number of GPUs .. code-block:: python # train Horovod on GPU (number of GPUs / machines provided on command-line) - trainer = Trainer(distributed_backend='horovod', gpus=1) + trainer = Trainer(accelerator='horovod', gpus=1) # train Horovod on CPU (number of processes / machines provided on command-line) - trainer = Trainer(distributed_backend='horovod') + trainer = Trainer(accelerator='horovod') When starting the training job, the driver application will then be used to specify the total number of worker processes: @@ -554,13 +554,13 @@ Below are the possible configurations we support. +=======+=========+====+=====+=========+============================================================+ | Y | | | | | `Trainer(gpus=1)` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| Y | | | | Y | `Trainer(gpus=1, use_amp=True)` | +| Y | | | | Y | `Trainer(gpus=1, precision=16)` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | Y | | | `Trainer(gpus=k, distributed_backend='dp')` | +| | Y | Y | | | `Trainer(gpus=k, accelerator='dp')` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | | `Trainer(gpus=k, distributed_backend='ddp')` | +| | Y | | Y | | `Trainer(gpus=k, accelerator='ddp')` | +-------+---------+----+-----+---------+------------------------------------------------------------+ -| | Y | | Y | Y | `Trainer(gpus=k, distributed_backend='ddp', use_amp=True)` | +| | Y | | Y | Y | `Trainer(gpus=k, accelerator='ddp', precision=16)` | +-------+---------+----+-----+---------+------------------------------------------------------------+ @@ -590,10 +590,10 @@ In (DDP, Horovod) your effective batch size will be 7 * gpus * num_nodes. .. code-block:: python # effective batch size = 7 * 8 - Trainer(gpus=8, distributed_backend='ddp|horovod') + Trainer(gpus=8, accelerator='ddp|horovod') # effective batch size = 7 * 8 * 10 - Trainer(gpus=8, num_nodes=10, distributed_backend='ddp|horovod') + Trainer(gpus=8, num_nodes=10, accelerator='ddp|horovod') In DDP2, your effective batch size will be 7 * num_nodes. @@ -602,10 +602,10 @@ The reason is that the full batch is visible to all GPUs on the node when using .. code-block:: python # effective batch size = 7 - Trainer(gpus=8, distributed_backend='ddp2') + Trainer(gpus=8, accelerator='ddp2') # effective batch size = 7 * 10 - Trainer(gpus=8, num_nodes=10, distributed_backend='ddp2') + Trainer(gpus=8, num_nodes=10, accelerator='ddp2') .. note:: Huge batch sizes are actually really bad for convergence. 
Check out: @@ -619,7 +619,7 @@ Lightning supports the use of PytorchElastic to enable fault-tolerent and elasti .. code-block:: python - Trainer(gpus=8, distributed_backend='ddp') + Trainer(gpus=8, accelerator='ddp') Following the `PytorchElastic Quickstart documentation `_, you then need to start a single-node etcd server on one of the hosts: diff --git a/docs/source/performance.rst b/docs/source/performance.rst index 7ee9fe78e601f..6e81963e31952 100644 --- a/docs/source/performance.rst +++ b/docs/source/performance.rst @@ -33,9 +33,9 @@ The best thing to do is to increase the ``num_workers`` slowly and stop once you Spawn ^^^^^ -When using ``distributed_backend=ddp_spawn`` (the ddp default) or TPU training, the way multiple GPUs/TPU cores are used is by calling ``.spawn()`` under the hood. +When using ``accelerator=ddp_spawn`` (the ddp default) or TPU training, the way multiple GPUs/TPU cores are used is by calling ``.spawn()`` under the hood. The problem is that PyTorch has issues with ``num_workers > 0`` when using ``.spawn()``. For this reason we recommend you -use ``distributed_backend=ddp`` so you can increase the ``num_workers``, however your script has to be callable like so: +use ``accelerator=ddp`` so you can increase the ``num_workers``, however your script has to be callable like so: .. code-block:: bash diff --git a/docs/source/slurm.rst b/docs/source/slurm.rst index 287fccf71fe81..be40810c3f944 100644 --- a/docs/source/slurm.rst +++ b/docs/source/slurm.rst @@ -24,7 +24,7 @@ To train a model using multiple nodes, do the following: .. code-block:: python # train on 32 GPUs across 4 nodes - trainer = Trainer(gpus=8, num_nodes=4, distributed_backend='ddp') + trainer = Trainer(gpus=8, num_nodes=4, accelerator='ddp') 3. It's a good idea to structure your training script like this: @@ -37,7 +37,7 @@ To train a model using multiple nodes, do the following: trainer = pl.Trainer( gpus=8, num_nodes=4, - distributed_backend='ddp' + accelerator='ddp' ) trainer.fit(model) diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst index f6189244fa020..ffebe41a9c856 100644 --- a/docs/source/tpu.rst +++ b/docs/source/tpu.rst @@ -140,7 +140,7 @@ Lightning supports training on a single TPU core. Just pass the TPU core ID [1-8 Distributed Backend with TPU ---------------------------- -The ```distributed_backend``` option used for GPUs does not apply to TPUs. +The ``accelerator`` option used for GPUs does not apply to TPUs. TPUs work in DDP mode by default (distributing over each core) ---------------- diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 954befb00a468..2d0c3e46f2fe0 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -203,18 +203,18 @@ def forward(self, x): .. testcode:: # default used by the Trainer - trainer = Trainer(distributed_backend=None) + trainer = Trainer(accelerator=None) Example:: # dp = DataParallel - trainer = Trainer(gpus=2, distributed_backend='dp') + trainer = Trainer(gpus=2, accelerator='dp') # ddp = DistributedDataParallel - trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp') + trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp') # ddp2 = DistributedDataParallel + dp - trainer = Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2') + trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp2') .. note:: this option does not apply to TPU. 
TPUs use ```ddp``` by default (over each core) @@ -948,8 +948,8 @@ def on_train_end(self, trainer, pl_module): | Number of processes to train with. Automatically set to the number of GPUs -when using ``distrbuted_backend="ddp"``. Set to a number greater than 1 when -using ``distributed_backend="ddp_cpu"`` to mimic distributed training on a +when using ``accelerator="ddp"``. Set to a number greater than 1 when +using ``accelerator="ddp_cpu"`` to mimic distributed training on a machine without GPUs. This is useful for debugging, but **will not** provide any speedup, since single-process Torch already makes effient use of multiple CPUs. @@ -957,7 +957,7 @@ def on_train_end(self, trainer, pl_module): .. testcode:: # Simulate DDP for debugging on your GPU-less laptop - trainer = Trainer(distributed_backend="ddp_cpu", num_processes=2) + trainer = Trainer(accelerator="ddp_cpu", num_processes=2) num_sanity_val_steps ^^^^^^^^^^^^^^^^^^^^ From d1234c592d645ca4200d3ce2194612ef2bcac41f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 30 Oct 2020 04:47:37 +0100 Subject: [PATCH 19/88] deprecate passing ModelCheckpoint instance to Trainer(checkpoint_callback=...) (#4336) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * first attempt * update tests * support multiple * test bugfix * changelog * pep * pep * import order * import * improve test for resuming * test * update test * add references test Co-authored-by: Carlos Mocholí * docstring suggestion deprecation Co-authored-by: Jeff Yang * paramref Co-authored-by: Carlos Mocholí Co-authored-by: Jeff Yang --- CHANGELOG.md | 2 + .../trainer/connectors/callback_connector.py | 36 ++++++--- pytorch_lightning/trainer/properties.py | 17 +++- pytorch_lightning/trainer/trainer.py | 10 ++- pytorch_lightning/tuner/batch_size_scaling.py | 2 - pytorch_lightning/tuner/lr_finder.py | 4 - tests/checkpointing/test_model_checkpoint.py | 40 ++++++++++ tests/models/test_restore.py | 77 +++++++++++++++++-- tests/test_deprecated.py | 6 ++ 9 files changed, 167 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 803ece1a51ed2..d42fcaaa7a46f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) + ### Removed diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index 187ff237056a2..b8a4276a2d747 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 import os
+
+from typing import Union, Optional
+
 from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, ProgressBarBase, ProgressBar
 from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
@@ -44,25 +47,31 @@ def on_trainer_init(
         # configure checkpoint callback
         # it is important that this is the last callback to run
         # pass through the required args to figure out defaults
-        checkpoint_callback = self.init_default_checkpoint_callback(checkpoint_callback)
-        if checkpoint_callback:
-            self.trainer.callbacks.append(checkpoint_callback)
-
-        # TODO refactor codebase (tests) to not directly reach into these callbacks
-        self.trainer.checkpoint_callback = checkpoint_callback
+        self.configure_checkpoint_callbacks(checkpoint_callback)

         # init progress bar
         self.trainer._progress_bar_callback = self.configure_progress_bar(
             progress_bar_refresh_rate, process_position
         )

-    def init_default_checkpoint_callback(self, checkpoint_callback):
-        if checkpoint_callback is True:
-            checkpoint_callback = ModelCheckpoint(dirpath=None, filename=None)
-        elif checkpoint_callback is False:
-            checkpoint_callback = None
+    def configure_checkpoint_callbacks(self, checkpoint_callback: Union[ModelCheckpoint, bool]):
+        if isinstance(checkpoint_callback, ModelCheckpoint):
+            # TODO: deprecated, remove this block in v1.4.0
+            rank_zero_warn(
+                "Passing a ModelCheckpoint instance to Trainer(checkpoint_callback=...)"
+                " is deprecated since v1.1 and will no longer be supported in v1.4.",
+                DeprecationWarning
+            )
+            self.trainer.callbacks.append(checkpoint_callback)
+
+        if self._trainer_has_checkpoint_callbacks() and checkpoint_callback is False:
+            raise MisconfigurationException(
+                "Trainer was configured with checkpoint_callback=False but found ModelCheckpoint"
+                " in callbacks list."
+ ) - return checkpoint_callback + if not self._trainer_has_checkpoint_callbacks() and checkpoint_callback is True: + self.trainer.callbacks.append(ModelCheckpoint(dirpath=None, filename=None)) def configure_progress_bar(self, refresh_rate=1, process_position=0): progress_bars = [c for c in self.trainer.callbacks if isinstance(c, ProgressBarBase)] @@ -83,3 +92,6 @@ def configure_progress_bar(self, refresh_rate=1, process_position=0): progress_bar_callback = None return progress_bar_callback + + def _trainer_has_checkpoint_callbacks(self): + return len(self.trainer.checkpoint_callbacks) > 0 diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index afb2f4cb5eb91..8d509d41d52bf 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -17,7 +17,7 @@ from argparse import ArgumentParser, Namespace from typing import List, Optional, Union, Type, TypeVar -from pytorch_lightning.callbacks import ProgressBarBase +from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.trainer.connectors.checkpoint_connector import CheckpointConnector from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector @@ -46,6 +46,7 @@ class TrainerProperties(ABC): _weights_save_path: str model_connector: ModelConnector checkpoint_connector: CheckpointConnector + callbacks: List[Callback] @property def use_amp(self) -> bool: @@ -187,6 +188,20 @@ def weights_save_path(self) -> str: return os.path.normpath(self._weights_save_path) return self._weights_save_path + @property + def checkpoint_callback(self) -> Optional[ModelCheckpoint]: + """ + The first checkpoint callback in the Trainer.callbacks list, or ``None`` if + no checkpoint callbacks exist. + """ + callbacks = self.checkpoint_callbacks + return callbacks[0] if len(callbacks) > 0 else None + + @property + def checkpoint_callbacks(self) -> List[ModelCheckpoint]: + """ A list of all instances of ModelCheckpoint found in the Trainer.callbacks list. """ + return [c for c in self.callbacks if isinstance(c, ModelCheckpoint)] + def save_checkpoint(self, filepath, weights_only: bool = False): self.checkpoint_connector.save_checkpoint(filepath, weights_only) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 337eb4c4ed567..008633273a0d1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -85,7 +85,7 @@ class Trainer( def __init__( self, logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase], bool] = True, - checkpoint_callback: Union[ModelCheckpoint, bool] = True, + checkpoint_callback: bool = True, callbacks: Optional[List[Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, @@ -169,7 +169,12 @@ def __init__( callbacks: Add a list of callbacks. - checkpoint_callback: Callback for checkpointing. + checkpoint_callback: If ``True``, enable checkpointing. + It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in + :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks`. Default: ``True``. + + .. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since + v1.1.0 and will be unsupported from v1.4.0. check_val_every_n_epoch: Check val every n train epochs. 
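Taken together, the connector and property changes in this commit support the following configurations. A sketch, with placeholder `dirpath` and `monitor` values:

    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import ModelCheckpoint

    ckpt = ModelCheckpoint(dirpath="checkpoints", monitor="val_loss")

    # preferred: pass the instance through `callbacks`
    trainer = Trainer(checkpoint_callback=True, callbacks=[ckpt])
    assert trainer.checkpoint_callback is ckpt    # first ModelCheckpoint in the list
    assert trainer.checkpoint_callbacks == [ckpt]

    # deprecated since v1.1, unsupported from v1.4: the instance as the flag value
    trainer = Trainer(checkpoint_callback=ckpt)   # emits a DeprecationWarning

    # contradictory configuration now fails fast
    trainer = Trainer(checkpoint_callback=False, callbacks=[ckpt])  # raises MisconfigurationException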
@@ -297,7 +302,6 @@ def __init__( # init callbacks # Declare attributes to be set in callback_connector on_trainer_init - self.checkpoint_callback: Union[ModelCheckpoint, bool] = checkpoint_callback self.callback_connector.on_trainer_init( callbacks, checkpoint_callback, diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py index 87783fbde5d1f..10de8a2d289e5 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -144,7 +144,6 @@ def __scale_batch_reset_params(trainer, model, steps_per_trial): trainer.weights_summary = None # not needed before full run trainer.logger = DummyLogger() trainer.callbacks = [] # not needed before full run - trainer.checkpoint_callback = False # required for saving trainer.limit_train_batches = 1.0 trainer.optimizers, trainer.schedulers = [], [] # required for saving trainer.model = model # required for saving @@ -157,7 +156,6 @@ def __scale_batch_restore_params(trainer): trainer.weights_summary = trainer.__dumped_params['weights_summary'] trainer.logger = trainer.__dumped_params['logger'] trainer.callbacks = trainer.__dumped_params['callbacks'] - trainer.checkpoint_callback = trainer.__dumped_params['checkpoint_callback'] trainer.auto_scale_batch_size = trainer.__dumped_params['auto_scale_batch_size'] trainer.limit_train_batches = trainer.__dumped_params['limit_train_batches'] trainer.model = trainer.__dumped_params['model'] diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py index d0ab33df8e1b8..3107f9f44824a 100644 --- a/pytorch_lightning/tuner/lr_finder.py +++ b/pytorch_lightning/tuner/lr_finder.py @@ -155,9 +155,6 @@ def lr_find( if trainer.progress_bar_callback: trainer.progress_bar_callback.disable() - # Disable standard checkpoint & early stopping - trainer.checkpoint_callback = False - # Required for saving the model trainer.optimizers, trainer.schedulers = [], [], trainer.model = model @@ -212,7 +209,6 @@ def __lr_finder_restore_params(trainer, model): trainer.logger = trainer.__dumped_params['logger'] trainer.callbacks = trainer.__dumped_params['callbacks'] trainer.max_steps = trainer.__dumped_params['max_steps'] - trainer.checkpoint_callback = trainer.__dumped_params['checkpoint_callback'] model.configure_optimizers = trainer.__dumped_params['configure_optimizers'] del trainer.__dumped_params diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 1634b73424dd1..3bc2ca436ec15 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -743,3 +743,43 @@ def test_filepath_decomposition_dirpath_filename(tmpdir, filepath, dirpath, file assert mc_cb.dirpath == dirpath assert mc_cb.filename == filename + + +def test_configure_model_checkpoint(tmpdir): + """ Test all valid and invalid ways a checkpoint callback can be passed to the Trainer. 
""" + kwargs = dict(default_root_dir=tmpdir) + callback1 = ModelCheckpoint() + callback2 = ModelCheckpoint() + + # no callbacks + trainer = Trainer(checkpoint_callback=False, callbacks=[], **kwargs) + assert not any(isinstance(c, ModelCheckpoint) for c in trainer.callbacks) + assert trainer.checkpoint_callback is None + + # default configuration + trainer = Trainer(checkpoint_callback=True, callbacks=[], **kwargs) + assert len([c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)]) == 1 + assert isinstance(trainer.checkpoint_callback, ModelCheckpoint) + + # custom callback passed to callbacks list, checkpoint_callback=True is ignored + trainer = Trainer(checkpoint_callback=True, callbacks=[callback1], **kwargs) + assert [c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)] == [callback1] + assert trainer.checkpoint_callback == callback1 + + # multiple checkpoint callbacks + trainer = Trainer(callbacks=[callback1, callback2], **kwargs) + assert trainer.checkpoint_callback == callback1 + assert trainer.checkpoint_callbacks == [callback1, callback2] + + with pytest.warns(DeprecationWarning, match='will no longer be supported in v1.4'): + trainer = Trainer(checkpoint_callback=callback1, callbacks=[], **kwargs) + assert [c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)] == [callback1] + assert trainer.checkpoint_callback == callback1 + + with pytest.warns(DeprecationWarning, match="will no longer be supported in v1.4"): + trainer = Trainer(checkpoint_callback=callback1, callbacks=[callback2], **kwargs) + assert trainer.checkpoint_callback == callback2 + assert trainer.checkpoint_callbacks == [callback2, callback1] + + with pytest.raises(MisconfigurationException, match="checkpoint_callback=False but found ModelCheckpoint"): + Trainer(checkpoint_callback=False, callbacks=[callback1], **kwargs) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 862294e64765f..848d6127c4cdb 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -15,6 +15,7 @@ import logging as log import os import pickle +from copy import deepcopy import cloudpickle import pytest @@ -24,7 +25,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils -from pytorch_lightning import Trainer, LightningModule, Callback +from pytorch_lightning import Trainer, LightningModule, Callback, seed_everything from pytorch_lightning.callbacks import ModelCheckpoint from tests.base import EvalModelTemplate, GenericEvalModelTemplate, TrialMNIST @@ -51,24 +52,90 @@ def on_train_end(self, trainer, pl_module): self._check_properties(trainer, pl_module) -def test_resume_from_checkpoint(tmpdir): +def test_model_properties_resume_from_checkpoint(tmpdir): """ Test that properties like `current_epoch` and `global_step` in model and trainer are always the same. 
""" model = EvalModelTemplate() checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) trainer_args = dict( default_root_dir=tmpdir, - max_epochs=2, + max_epochs=1, logger=False, - checkpoint_callback=checkpoint_callback, - callbacks=[ModelTrainerPropertyParity()] # this performs the assertions + callbacks=[checkpoint_callback, ModelTrainerPropertyParity()] # this performs the assertions ) trainer = Trainer(**trainer_args) trainer.fit(model) + + trainer_args.update(max_epochs=2) trainer = Trainer(**trainer_args, resume_from_checkpoint=str(tmpdir / "last.ckpt")) trainer.fit(model) +class CaptureCallbacksBeforeTraining(Callback): + callbacks = [] + + def on_train_start(self, trainer, pl_module): + self.callbacks = deepcopy(trainer.callbacks) + + +def test_callbacks_state_resume_from_checkpoint(tmpdir): + """ Test that resuming from a checkpoint restores callbacks that persist state. """ + model = EvalModelTemplate() + callback_capture = CaptureCallbacksBeforeTraining() + + def get_trainer_args(): + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + trainer_args = dict( + default_root_dir=tmpdir, + max_steps=1, + logger=False, + callbacks=[ + checkpoint, + callback_capture, + ] + ) + assert checkpoint.best_model_path == "" + assert checkpoint.best_model_score == 0 + return trainer_args + + # initial training + trainer = Trainer(**get_trainer_args()) + trainer.fit(model) + callbacks_before_resume = deepcopy(trainer.callbacks) + + # resumed training + trainer = Trainer(**get_trainer_args(), resume_from_checkpoint=str(tmpdir / "last.ckpt")) + trainer.fit(model) + + assert len(callbacks_before_resume) == len(callback_capture.callbacks) + + for before, after in zip(callbacks_before_resume, callback_capture.callbacks): + if isinstance(before, ModelCheckpoint): + assert before.best_model_path == after.best_model_path + assert before.best_model_score == after.best_model_score + + +def test_callbacks_references_resume_from_checkpoint(tmpdir): + """ Test that resuming from a checkpoint sets references as expected. 
""" + model = EvalModelTemplate() + args = {'default_root_dir': tmpdir, 'max_steps': 1, 'logger': False} + + # initial training + checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + trainer = Trainer(**args, callbacks=[checkpoint]) + assert checkpoint is trainer.callbacks[0] is trainer.checkpoint_callback + trainer.fit(model) + + # resumed training + new_checkpoint = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True) + # pass in a new checkpoint object, which should take + # precedence over the one in the last.ckpt file + trainer = Trainer(**args, callbacks=[new_checkpoint], resume_from_checkpoint=str(tmpdir / "last.ckpt")) + assert checkpoint is not new_checkpoint + assert new_checkpoint is trainer.callbacks[0] is trainer.checkpoint_callback + trainer.fit(model) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_running_test_pretrained_model_distrib_dp(tmpdir): """Verify `test()` on pretrained model.""" diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index 60f13383d3777..67f38568e2103 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -15,6 +15,12 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException +def test_tbd_remove_in_v1_4_0(tmpdir): + with pytest.deprecated_call(match='will no longer be supported in v1.4'): + callback = ModelCheckpoint() + Trainer(checkpoint_callback=callback, callbacks=[], default_root_dir=tmpdir) + + def test_tbd_remove_in_v1_2_0(): with pytest.deprecated_call(match='will be removed in v1.2'): checkpoint_cb = ModelCheckpoint(filepath='.') From 20a8eaa335c04b9e33f4d5bb335388d80997c70c Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 16:42:02 +0630 Subject: [PATCH 20/88] fix ModelCheckpoint docs (#4426) --- pytorch_lightning/callbacks/model_checkpoint.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 4c9d3f4e30072..1a81ca0980339 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -78,7 +78,7 @@ class ModelCheckpoint(Callback): dirpath: directory to save the model file. - Example: + Example:: # custom path # saves a file like: my/path/epoch=0.ckpt @@ -92,7 +92,7 @@ class ModelCheckpoint(Callback): filename: checkpoint filename. Can contain named formatting options to be auto-filled. - Example: + Example:: # save any arbitrary metrics like `val_loss`, etc. in name # saves a file like: my/path/epoch=2-val_loss=0.02-other_metric=0.03.ckpt @@ -104,7 +104,7 @@ class ModelCheckpoint(Callback): By default, filename is ``None`` and will be set to ``'{epoch}'``. 
- Example: + Example:: >>> from pytorch_lightning import Trainer >>> from pytorch_lightning.callbacks import ModelCheckpoint From e0b856c105d3c35df8d53f737762f88ae68ee1e3 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Fri, 30 Oct 2020 11:44:25 +0100 Subject: [PATCH 21/88] [Metrics] Confusion matrix class interface (#4348) * docs + precision + recall + f_beta + refactor Co-authored-by: Teddy Koker * rebase Co-authored-by: Teddy Koker * fixes Co-authored-by: Teddy Koker * added missing file * docs * docs * extra import * add confusion matrix * add to docs * add test * pep8 + isort * update tests * move util function * unify functional and class * add to init * remove old implementation * update tests * pep8 * add duplicate * fix doctest * Update pytorch_lightning/metrics/classification/confusion_matrix.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * changelog * bullet point args * bullet docs * bullet docs Co-authored-by: ananyahjha93 Co-authored-by: Teddy Koker Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: chaton Co-authored-by: Roger Shieh <55400948+s-rog@users.noreply.github.com> Co-authored-by: Rohit Gupta --- CHANGELOG.md | 3 + docs/source/metrics.rst | 8 +- pytorch_lightning/metrics/__init__.py | 3 +- .../metrics/classification/__init__.py | 1 + .../metrics/classification/accuracy.py | 19 +-- .../classification/confusion_matrix.py | 111 +++++++++++++++ .../metrics/functional/__init__.py | 2 +- .../metrics/functional/classification.py | 42 ------ .../metrics/functional/confusion_matrix.py | 96 +++++++++++++ pytorch_lightning/metrics/utils.py | 28 ++++ .../classification/test_confusion_matrix.py | 133 ++++++++++++++++++ .../metrics/functional/test_classification.py | 30 ---- 12 files changed, 384 insertions(+), 92 deletions(-) create mode 100644 pytorch_lightning/metrics/classification/confusion_matrix.py create mode 100644 pytorch_lightning/metrics/functional/confusion_matrix.py create mode 100644 tests/metrics/classification/test_confusion_matrix.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d42fcaaa7a46f..bf61cbfb6a094 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) + + ### Changed diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index cf7dcfbecc183..c232ea216e885 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -188,6 +188,12 @@ Fbeta .. autoclass:: pytorch_lightning.metrics.classification.Fbeta :noindex: +ConfusionMatrix +~~~~~~~~~~~~~~~ + +.. autoclass:: pytorch_lightning.metrics.classification.ConfusionMatrix + :noindex: + Regression Metrics ------------------ @@ -275,7 +281,7 @@ average_precision [func] confusion_matrix [func] ~~~~~~~~~~~~~~~~~~~~~~~ -.. autofunction:: pytorch_lightning.metrics.functional.classification.confusion_matrix +.. 
autofunction:: pytorch_lightning.metrics.functional.confusion_matrix :noindex: diff --git a/pytorch_lightning/metrics/__init__.py b/pytorch_lightning/metrics/__init__.py index 3e42c73e70f84..6be7fbdea7c30 100644 --- a/pytorch_lightning/metrics/__init__.py +++ b/pytorch_lightning/metrics/__init__.py @@ -17,7 +17,8 @@ Accuracy, Precision, Recall, - Fbeta + Fbeta, + ConfusionMatrix ) from pytorch_lightning.metrics.regression import ( diff --git a/pytorch_lightning/metrics/classification/__init__.py b/pytorch_lightning/metrics/classification/__init__.py index 7b342235085b7..e440edc2ebfb9 100644 --- a/pytorch_lightning/metrics/classification/__init__.py +++ b/pytorch_lightning/metrics/classification/__init__.py @@ -14,3 +14,4 @@ from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.metrics.classification.precision_recall import Precision, Recall from pytorch_lightning.metrics.classification.f_beta import Fbeta +from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix diff --git a/pytorch_lightning/metrics/classification/accuracy.py b/pytorch_lightning/metrics/classification/accuracy.py index 70332331f681f..c7255c6e4497e 100644 --- a/pytorch_lightning/metrics/classification/accuracy.py +++ b/pytorch_lightning/metrics/classification/accuracy.py @@ -21,6 +21,7 @@ import torch from torch import nn from pytorch_lightning.metrics.metric import Metric +from pytorch_lightning.metrics.utils import _input_format_classification class Accuracy(Metric): @@ -60,7 +61,6 @@ class Accuracy(Metric): tensor(0.5000) """ - def __init__( self, threshold: float = 0.5, @@ -79,21 +79,6 @@ def __init__( self.threshold = threshold - def _input_format(self, preds: torch.Tensor, target: torch.Tensor): - if not (len(preds.shape) == len(target.shape) or len(preds.shape) == len(target.shape) + 1): - raise ValueError( - "preds and target must have same number of dimensions, or one additional dimension for preds" - ) - - if len(preds.shape) == len(target.shape) + 1: - # multi class probabilites - preds = torch.argmax(preds, dim=1) - - if len(preds.shape) == len(target.shape) and preds.dtype == torch.float: - # binary or multilabel probablities - preds = (preds >= self.threshold).long() - return preds, target - def update(self, preds: torch.Tensor, target: torch.Tensor): """ Update state with predictions and targets. @@ -102,7 +87,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor): preds: Predictions from model target: Ground truth values """ - preds, target = self._input_format(preds, target) + preds, target = _input_format_classification(preds, target, self.threshold) assert preds.shape == target.shape self.correct += torch.sum(preds == target) diff --git a/pytorch_lightning/metrics/classification/confusion_matrix.py b/pytorch_lightning/metrics/classification/confusion_matrix.py new file mode 100644 index 0000000000000..5a825d8f191cc --- /dev/null +++ b/pytorch_lightning/metrics/classification/confusion_matrix.py @@ -0,0 +1,111 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Optional
+
+import torch
+
+from pytorch_lightning.metrics.metric import Metric
+from pytorch_lightning.metrics.functional.confusion_matrix import (
+    _confusion_matrix_update,
+    _confusion_matrix_compute
+)
+
+
+class ConfusionMatrix(Metric):
+    """
+    Computes the confusion matrix. Works with binary, multiclass, and multilabel data.
+    Accepts logits from a model output or integer class values in prediction.
+    Works with multi-dimensional preds and target.
+
+    Forward accepts
+
+    - ``preds`` (float or long tensor): ``(N, ...)`` or ``(N, C, ...)`` where C is the number of classes
+    - ``target`` (long tensor): ``(N, ...)``
+
+    If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument.
+    This is the case for binary and multi-label logits.
+
+    If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``.
+
+    Args:
+        num_classes: Number of classes in the dataset.
+        normalize: Normalization mode for confusion matrix. Choose from
+
+            - ``None``: no normalization (default)
+            - ``'true'``: normalization over the targets (most commonly used)
+            - ``'pred'``: normalization over the predictions
+            - ``'all'``: normalization over the whole matrix
+
+        threshold:
+            Threshold value for binary or multi-label logits. default: 0.5
+        compute_on_step:
+            Forward only calls ``update()`` and returns None if this is set to False. default: True
+        dist_sync_on_step:
+            Synchronize metric state across processes at each ``forward()``
+            before returning the value at the step. default: False
+        process_group:
+            Specify the process group on which synchronization is called. default: None (which selects the entire world)
+
+    Example:
+
+        >>> from pytorch_lightning.metrics import ConfusionMatrix
+        >>> target = torch.tensor([1, 1, 0, 0])
+        >>> preds = torch.tensor([0, 1, 0, 0])
+        >>> confmat = ConfusionMatrix(num_classes=2)
+        >>> confmat(preds, target)
+        tensor([[2., 0.],
+                [1., 1.]])
+
+    """
+    def __init__(
+        self,
+        num_classes: int,
+        normalize: Optional[str] = None,
+        threshold: float = 0.5,
+        compute_on_step: bool = True,
+        dist_sync_on_step: bool = False,
+        process_group: Optional[Any] = None,
+    ):
+
+        super().__init__(
+            compute_on_step=compute_on_step,
+            dist_sync_on_step=dist_sync_on_step,
+            process_group=process_group,
+        )
+        self.num_classes = num_classes
+        self.normalize = normalize
+        self.threshold = threshold
+
+        allowed_normalize = ('true', 'pred', 'all', None)
+        assert self.normalize in allowed_normalize, \
+            f"Argument normalize needs to be one of the following: {allowed_normalize}"
+
+        self.add_state("confmat", default=torch.zeros(num_classes, num_classes), dist_reduce_fx="sum")
+
+    def update(self, preds: torch.Tensor, target: torch.Tensor):
+        """
+        Update state with predictions and targets.
+ + Args: + preds: Predictions from model + target: Ground truth values + """ + confmat = _confusion_matrix_update(preds, target, self.num_classes, self.threshold) + self.confmat += confmat + + def compute(self) -> torch.Tensor: + """ + Computes confusion matrix + """ + return _confusion_matrix_compute(self.confmat, self.normalize) diff --git a/pytorch_lightning/metrics/functional/__init__.py b/pytorch_lightning/metrics/functional/__init__.py index 1af3db85b0de3..189e3945e1a56 100644 --- a/pytorch_lightning/metrics/functional/__init__.py +++ b/pytorch_lightning/metrics/functional/__init__.py @@ -16,7 +16,6 @@ auc, auroc, average_precision, - confusion_matrix, dice_score, f1_score, fbeta_score, @@ -44,3 +43,4 @@ from pytorch_lightning.metrics.functional.mean_squared_log_error import mean_squared_log_error from pytorch_lightning.metrics.functional.psnr import psnr from pytorch_lightning.metrics.functional.ssim import ssim +from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py index a831611fb9593..51b80308a579c 100644 --- a/pytorch_lightning/metrics/functional/classification.py +++ b/pytorch_lightning/metrics/functional/classification.py @@ -301,48 +301,6 @@ def _confmat_normalize(cm): return cm -def confusion_matrix( - pred: torch.Tensor, - target: torch.Tensor, - normalize: bool = False, - num_classes: Optional[int] = None -) -> torch.Tensor: - """ - Computes the confusion matrix C where each entry C_{i,j} is the number of observations - in group i that were predicted in group j. - - Args: - pred: estimated targets - target: ground truth labels - normalize: normalizes confusion matrix - num_classes: number of classes - - Return: - Tensor, confusion matrix C [num_classes, num_classes ] - - Example: - - >>> x = torch.tensor([1, 2, 3]) - >>> y = torch.tensor([0, 2, 3]) - >>> confusion_matrix(x, y) - tensor([[0., 1., 0., 0.], - [0., 0., 0., 0.], - [0., 0., 1., 0.], - [0., 0., 0., 1.]]) - """ - num_classes = get_num_classes(pred, target, num_classes) - - unique_labels = (target.view(-1) * num_classes + pred.view(-1)).to(torch.int) - - bins = torch.bincount(unique_labels, minlength=num_classes ** 2) - cm = bins.reshape(num_classes, num_classes).squeeze().float() - - if normalize: - cm = _confmat_normalize(cm) - - return cm - - def precision_recall( pred: torch.Tensor, target: torch.Tensor, diff --git a/pytorch_lightning/metrics/functional/confusion_matrix.py b/pytorch_lightning/metrics/functional/confusion_matrix.py new file mode 100644 index 0000000000000..143d237b3b2c6 --- /dev/null +++ b/pytorch_lightning/metrics/functional/confusion_matrix.py @@ -0,0 +1,96 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional + +import torch + +from pytorch_lightning.utilities import rank_zero_warn +from pytorch_lightning.metrics.utils import _input_format_classification + + +def _confusion_matrix_update(preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + threshold: float = 0.5) -> torch.Tensor: + preds, target = _input_format_classification(preds, target, threshold) + unique_mapping = (target.view(-1) * num_classes + preds.view(-1)).to(torch.long) + bins = torch.bincount(unique_mapping, minlength=num_classes ** 2) + confmat = bins.reshape(num_classes, num_classes) + return confmat + + +def _confusion_matrix_compute(confmat: torch.Tensor, + normalize: Optional[str] = None) -> torch.Tensor: + allowed_normalize = ('true', 'pred', 'all', None) + assert normalize in allowed_normalize, \ + f"Argument average needs to one of the following: {allowed_normalize}" + confmat = confmat.float() + if normalize is not None: + if normalize == 'true': + cm = confmat / confmat.sum(axis=1, keepdim=True) + elif normalize == 'pred': + cm = confmat / confmat.sum(axis=0, keepdim=True) + elif normalize == 'all': + cm = confmat / confmat.sum() + nan_elements = cm[torch.isnan(cm)].nelement() + if nan_elements != 0: + cm[torch.isnan(cm)] = 0 + rank_zero_warn(f'{nan_elements} nan values found in confusion matrix have been replaced with zeros.') + return cm + return confmat + + +def confusion_matrix( + preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + normalize: Optional[str] = None, + threshold: float = 0.5 +) -> torch.Tensor: + """ + Computes the confusion matrix. Works with binary, multiclass, and multilabel data. + Accepts logits from a model output or integer class values in prediction. + Works with multi-dimensional preds and target. + + If preds and target are the same shape and preds is a float tensor, we use the ``self.threshold`` argument. + This is the case for binary and multi-label logits. + + If preds has an extra dimension as in the case of multi-class scores we perform an argmax on ``dim=1``. + + Args: + preds: (float or long tensor), Either a ``(N, ...)`` tensor with labels or + ``(N, C, ...)`` where C is the number of classes, tensor with logits/probabilities + target: ``target`` (long tensor), tensor with shape ``(N, ...)`` with ground true labels + num_classes: Number of classes in the dataset. + normalize: Normalization mode for confusion matrix. Choose from + + - ``None``: no normalization (default) + - ``'true'``: normalization over the targets (most commonly used) + - ``'pred'``: normalization over the predictions + - ``'all'``: normalization over the whole matrix + + threshold: + Threshold value for binary or multi-label logits. 
default: 0.5
+
+    Example:
+
+        >>> from pytorch_lightning.metrics.functional import confusion_matrix
+        >>> target = torch.tensor([1, 1, 0, 0])
+        >>> preds = torch.tensor([0, 1, 0, 0])
+        >>> confusion_matrix(preds, target, num_classes=2)
+        tensor([[2., 0.],
+                [1., 1.]])
+    """
+    confmat = _confusion_matrix_update(preds, target, num_classes, threshold)
+    return _confusion_matrix_compute(confmat, normalize)
diff --git a/pytorch_lightning/metrics/utils.py b/pytorch_lightning/metrics/utils.py
index 886c070c2fd18..e9419cf70172b 100644
--- a/pytorch_lightning/metrics/utils.py
+++ b/pytorch_lightning/metrics/utils.py
@@ -67,3 +67,31 @@ def _check_same_shape(pred: torch.Tensor, target: torch.Tensor):
    """ Check that predictions and target have the same shape, else raise error """
    if pred.shape != target.shape:
        raise RuntimeError('Predictions and targets are expected to have the same shape')
+
+
+def _input_format_classification(preds: torch.Tensor, target: torch.Tensor, threshold: float):
+    """ Convert preds and target tensors into label tensors
+
+    Args:
+        preds: either tensor with labels, tensor with probabilities/logits or
+            multilabel tensor
+        target: tensor with ground truth labels
+        threshold: float used for thresholding multilabel input
+
+    Returns:
+        preds: tensor with labels
+        target: tensor with labels
+    """
+    if not (len(preds.shape) == len(target.shape) or len(preds.shape) == len(target.shape) + 1):
+        raise ValueError(
+            "preds and target must have same number of dimensions, or one additional dimension for preds"
+        )
+
+    if len(preds.shape) == len(target.shape) + 1:
+        # multi-class probabilities
+        preds = torch.argmax(preds, dim=1)
+
+    if len(preds.shape) == len(target.shape) and preds.dtype == torch.float:
+        # binary or multilabel probabilities
+        preds = (preds >= threshold).long()
+    return preds, target
diff --git a/tests/metrics/classification/test_confusion_matrix.py b/tests/metrics/classification/test_confusion_matrix.py
new file mode 100644
index 0000000000000..c1ddff77e0818
--- /dev/null
+++ b/tests/metrics/classification/test_confusion_matrix.py
@@ -0,0 +1,133 @@
+from functools import partial
+
+import numpy as np
+import pytest
+import torch
+from sklearn.metrics import confusion_matrix as sk_confusion_matrix
+
+from pytorch_lightning.metrics.classification.confusion_matrix import ConfusionMatrix
+from pytorch_lightning.metrics.functional.confusion_matrix import confusion_matrix
+from tests.metrics.classification.inputs import (
+    _binary_inputs,
+    _binary_prob_inputs,
+    _multiclass_inputs,
+    _multiclass_prob_inputs,
+    _multidim_multiclass_inputs,
+    _multidim_multiclass_prob_inputs,
+    _multilabel_inputs,
+    _multilabel_prob_inputs
+)
+from tests.metrics.utils import NUM_CLASSES, THRESHOLD, MetricTester
+
+torch.manual_seed(42)
+
+
+def _binary_prob_sk_metric(preds, target, normalize=None):
+    sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8)
+    sk_target = target.view(-1).numpy()
+
+    return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize)
+
+
+def _binary_sk_metric(preds, target, normalize=None):
+    sk_preds = preds.view(-1).numpy()
+    sk_target = target.view(-1).numpy()
+
+    return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize)
+
+
+def _multilabel_prob_sk_metric(preds, target, normalize=None):
+    sk_preds = (preds.view(-1).numpy() >= THRESHOLD).astype(np.uint8)
+    sk_target = target.view(-1).numpy()
+
+    return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize)
+
+
+def
_multilabel_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multiclass_prob_sk_metric(preds, target, normalize=None): + sk_preds = torch.argmax(preds, dim=len(preds.shape) - 1).view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multiclass_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multidim_multiclass_prob_sk_metric(preds, target, normalize=None): + sk_preds = torch.argmax(preds, dim=len(preds.shape) - 2).view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +def _multidim_multiclass_sk_metric(preds, target, normalize=None): + sk_preds = preds.view(-1).numpy() + sk_target = target.view(-1).numpy() + + return sk_confusion_matrix(y_true=sk_target, y_pred=sk_preds, normalize=normalize) + + +@pytest.mark.parametrize("normalize", ['true', 'pred', 'all', None]) +@pytest.mark.parametrize("preds, target, sk_metric, num_classes", [ + (_binary_prob_inputs.preds, _binary_prob_inputs.target, _binary_prob_sk_metric, 2), + (_binary_inputs.preds, _binary_inputs.target, _binary_sk_metric, 2), + (_multilabel_prob_inputs.preds, _multilabel_prob_inputs.target, _multilabel_prob_sk_metric, 2), + (_multilabel_inputs.preds, _multilabel_inputs.target, _multilabel_sk_metric, 2), + (_multiclass_prob_inputs.preds, _multiclass_prob_inputs.target, _multiclass_prob_sk_metric, NUM_CLASSES), + (_multiclass_inputs.preds, _multiclass_inputs.target, _multiclass_sk_metric, NUM_CLASSES), + ( + _multidim_multiclass_prob_inputs.preds, + _multidim_multiclass_prob_inputs.target, + _multidim_multiclass_prob_sk_metric, + NUM_CLASSES + ), + ( + _multidim_multiclass_inputs.preds, + _multidim_multiclass_inputs.target, + _multidim_multiclass_sk_metric, + NUM_CLASSES + ) +]) +class TestConfusionMatrix(MetricTester): + @pytest.mark.parametrize("ddp", [True, False]) + @pytest.mark.parametrize("dist_sync_on_step", [True, False]) + def test_confusion_matrix(self, normalize, preds, target, sk_metric, num_classes, ddp, dist_sync_on_step): + self.run_class_metric_test(ddp=ddp, + preds=preds, + target=target, + metric_class=ConfusionMatrix, + sk_metric=partial(sk_metric, normalize=normalize), + dist_sync_on_step=dist_sync_on_step, + metric_args={"num_classes": num_classes, + "threshold": THRESHOLD, + "normalize": normalize} + ) + + def test_confusion_matrix_functional(self, normalize, preds, target, sk_metric, num_classes): + self.run_functional_metric_test(preds, + target, + metric_functional=confusion_matrix, + sk_metric=partial(sk_metric, normalize=normalize), + metric_args={"num_classes": num_classes, + "threshold": THRESHOLD, + "normalize": normalize} + ) + + +def test_warning_on_nan(tmpdir): + preds = torch.randint(3, size=(20,)) + target = torch.randint(3, size=(20,)) + + with pytest.warns(UserWarning, match='.* nan values found in confusion matrix have been replaced with zeros.'): + confmat = confusion_matrix(preds, target, num_classes=5, normalize='true') diff --git a/tests/metrics/functional/test_classification.py b/tests/metrics/functional/test_classification.py index 139aeea8cc9ad..cab543f8dd6e6 100644 --- 
a/tests/metrics/functional/test_classification.py +++ b/tests/metrics/functional/test_classification.py @@ -9,7 +9,6 @@ recall_score as sk_recall, f1_score as sk_f1_score, fbeta_score as sk_fbeta_score, - confusion_matrix as sk_confusion_matrix, roc_curve as sk_roc_curve, roc_auc_score as sk_roc_auc_score, precision_recall_curve as sk_precision_recall_curve @@ -23,7 +22,6 @@ stat_scores, stat_scores_multiple_classes, accuracy, - confusion_matrix, precision, recall, fbeta_score, @@ -47,7 +45,6 @@ pytest.param(partial(sk_f1_score, average='micro'), f1_score, False, id='f1_score'), pytest.param(partial(sk_fbeta_score, average='micro', beta=2), partial(fbeta_score, beta=2), False, id='fbeta_score'), - pytest.param(sk_confusion_matrix, confusion_matrix, False, id='confusion_matrix'), pytest.param(sk_roc_curve, roc, True, id='roc'), pytest.param(sk_precision_recall_curve, precision_recall_curve, True, id='precision_recall_curve'), pytest.param(sk_roc_auc_score, auroc, True, id='auroc') @@ -216,33 +213,6 @@ def test_accuracy(): assert acc.item() == 0.50 -def test_confusion_matrix(): - target = (torch.arange(120) % 3).view(-1, 1) - pred = target.clone() - cm = confusion_matrix(pred, target, normalize=True) - - assert torch.allclose(cm, torch.tensor([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]])) - - pred = torch.zeros_like(pred) - cm = confusion_matrix(pred, target, normalize=True) - assert torch.allclose(cm, torch.tensor([[1., 0., 0.], [1., 0., 0.], [1., 0., 0.]])) - - target = torch.LongTensor([0, 0, 0, 0, 0]) - pred = target.clone() - cm = confusion_matrix(pred, target, normalize=False, num_classes=3) - assert torch.allclose(cm, torch.tensor([[5., 0., 0.], [0., 0., 0.], [0., 0., 0.]])) - - # Example taken from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html - target = torch.LongTensor([0] * 13 + [1] * 16 + [2] * 9) - pred = torch.LongTensor([0] * 13 + [1] * 10 + [2] * 15) - cm = confusion_matrix(pred, target, normalize=False, num_classes=3) - assert torch.allclose(cm, torch.tensor([[13., 0., 0.], [0., 10., 6.], [0., 0., 9.]])) - to_compare = cm / torch.tensor([[13.], [16.], [9.]]) - - cm = confusion_matrix(pred, target, normalize=True, num_classes=3) - assert torch.allclose(cm, to_compare) - - @pytest.mark.parametrize(['pred', 'target', 'expected_prec', 'expected_rec'], [ pytest.param(torch.tensor([1., 0., 1., 0.]), torch.tensor([0., 1., 1., 0.]), [0.5, 0.5], [0.5, 0.5]), pytest.param(to_onehot(torch.tensor([1., 0., 1., 0.])), torch.tensor([0., 1., 1., 0.]), [0.5, 0.5], [0.5, 0.5]) From 48e0b33d56f78d2fdcdb80f35cfad08e3da93e15 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 20:04:29 +0630 Subject: [PATCH 22/88] [Changelog] 1.0.4 (#4440) * changelog 1.0.4 * changelog 1.0.4 --- CHANGELOG.md | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf61cbfb6a094..6b170d3898172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,78 +9,65 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added
-- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213))
+- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383))
+- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348))
-- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285))
+### Changed
+- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405))
-- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586))
+### Deprecated
+- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336))
-- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162))
+### Removed
+### Fixed
-- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344))
+- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209))
-- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383))
+## [1.0.4] - 2020-10-27
+
+### Added
+- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213))
-- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656))
+- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285))
+- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586))
-- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348))
+- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162))
+- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344))
-### Changed
+- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656))
+### Changed
- Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587))
-
- Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130))
-
- Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273))
-
- Fixed sanitized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320))
-
-- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405))
-
-
### Deprecated
-
- Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213))
-
- Deprecated `reorder` parameter of the `auc`
metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) - - Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) - -- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) - -### Removed - - - ### Fixed - -- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) - - - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) - - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) - - Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) From 0f584faa6b3520efa378f78de4e12a34f039d8f9 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 22:12:14 +0630 Subject: [PATCH 23/88] PyTorch 1.7 Stable support (#3821) * prepare for 1.7 support [ci skip] * tpu [ci skip] * test run 1.7 * all 1.7, needs to fix tests * couple with torchvision * windows try * remove windows * 1.7 is here * on purpose fail [ci skip] * return [ci skip] * 1.7 docker * back to normal [ci skip] * change to some_val [ci skip] * add seed [ci skip] * 4 places [ci skip] * fail on purpose [ci skip] * verbose=True [ci skip] * use filename to track * use filename to track * monitor epoch + changelog * Update tests/checkpointing/test_model_checkpoint.py Co-authored-by: Rohit Gupta Co-authored-by: Sean Naren Co-authored-by: Rohit Gupta --- .github/workflows/ci_dockers.yml | 6 +++--- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 2 +- .github/workflows/nightly.yml | 2 +- CHANGELOG.md | 2 ++ README.md | 10 +++++----- dockers/base-xla/Dockerfile | 2 +- pytorch_lightning/core/grads.py | 4 ++-- tests/checkpointing/test_model_checkpoint.py | 20 ++++++++++++++------ tests/metrics/utils.py | 2 +- tests/models/test_grad_norm.py | 4 ++-- 11 files changed, 33 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index c8816486f2688..8e8d56b04d501 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: python_version: [3.7] - xla_version: [1.6] # todo: , "nightly" + xla_version: [1.6, "nightly"] steps: - name: Checkout uses: actions/checkout@v2 @@ -66,8 +66,8 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.8 - # pytorch_version: 1.7 # todo + - python_version: 3.8 + pytorch_version: 1.7 - python_version: 3.7 pytorch_version: 1.6 - python_version: 3.6 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index f652cbb1a4b58..b165b0a604344 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -16,7 +16,7 @@ jobs: matrix: # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo + pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index d74a923693e0b..05db07c2a6c34 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -89,7 +89,7 @@ jobs: run: | # 
python -m pip install --upgrade --user pip pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade - pip install --requirement ./requirements/devel.txt --quiet --upgrade + pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade python --version pip --version pip list diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index eb10c43936044..f60ecd09dfb84 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -76,7 +76,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b170d3898172..c102be008d0f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) + - Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) - Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) diff --git a/README.md b/README.md index 21f4aaab19ad1..fea674bc396b0 100644 --- a/README.md +++ b/README.md @@ -89,14 +89,14 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## Continuous Integration
-| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | +| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | | Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | -| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| OSX py3.6 / py3.7 | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | +| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_ - _\** tests run on two NVIDIA K80_ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index f44465383a0e0..3eaabade428e6 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -110,4 +110,4 @@ RUN \ conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.7').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" + python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.8').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" diff --git a/pytorch_lightning/core/grads.py b/pytorch_lightning/core/grads.py index 2cdeaf4e59010..4ba1acf5689a7 100644 --- a/pytorch_lightning/core/grads.py +++ b/pytorch_lightning/core/grads.py @@ -46,11 +46,11 @@ def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]: continue param_norm = 
float(p.grad.data.norm(norm_type)) - norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 3) + norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 4) all_norms.append(param_norm) total_norm = float(torch.tensor(all_norms).norm(norm_type)) - norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 3) + norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 4) return norms diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3bc2ca436ec15..19705a6ebc9a2 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -365,9 +365,14 @@ def test_model_checkpoint_topk_zero(tmpdir): def test_model_checkpoint_topk_all(tmpdir): """ Test that save_top_k=-1 tracks the best models when monitor key is provided. """ seed_everything(1000) - epochs = 2 - model = EvalModelTemplate() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=-1) + epochs = 3 + + class CustomModel(EvalModelTemplate): + def validation_epoch_end(self, outputs): + return {'epoch': self.current_epoch} + + model = CustomModel() + checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="epoch", mode='max', save_top_k=-1) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, @@ -375,10 +380,13 @@ def test_model_checkpoint_topk_all(tmpdir): logger=False, ) trainer.fit(model) - assert checkpoint_callback.best_model_path == tmpdir / "epoch=1.ckpt" - assert checkpoint_callback.best_model_score > 0 + + assert checkpoint_callback.monitor == 'epoch' + assert checkpoint_callback.best_model_path == tmpdir / "epoch=2.ckpt" + assert checkpoint_callback.best_model_score == epochs - 1 + assert len(os.listdir(tmpdir)) == len(checkpoint_callback.best_k_models) == epochs assert set(checkpoint_callback.best_k_models.keys()) == set(str(tmpdir / f"epoch={i}.ckpt") for i in range(epochs)) - assert checkpoint_callback.kth_best_model_path == tmpdir / "epoch=0.ckpt" + assert checkpoint_callback.kth_best_model_path == tmpdir / 'epoch=0.ckpt' def test_ckpt_metric_names(tmpdir): diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index b946d23c79813..e7cc7c6123b48 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -24,7 +24,7 @@ def setup_ddp(rank, world_size): os.environ["MASTER_ADDR"] = 'localhost' os.environ['MASTER_PORT'] = '8088' - if torch.distributed.is_available(): + if torch.distributed.is_available() and sys.platform not in ['win32', 'cygwin']: torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 89d8ff89999c1..51ba7f34048e0 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -49,11 +49,11 @@ def on_after_backward(self): norm = np.linalg.norm(flat, self.norm_type) norms.append(norm) - out[prefix + name] = round(norm, 3) + out[prefix + name] = round(norm, 4) # handle total norm norm = np.linalg.norm(norms, self.norm_type) - out[prefix + 'total'] = round(norm, 3) + out[prefix + 'total'] = round(norm, 4) self.stored_grad_norms.append(out) From 38bb4e2da069e2971669bd900bbc3f7727121c61 Mon Sep 17 00:00:00 2001 From: Dusan Drevicky <55678224+ddrevicky@users.noreply.github.com> Date: Fri, 30 Oct 2020 19:56:13 +0100 Subject: [PATCH 24/88] [Metrics] Add multiclass auroc (#4236) * Add functional multiclass AUROC metric * Add multiclass_auroc tests * fixup! Add functional multiclass AUROC metric * fixup! 
fixup! Add functional multiclass AUROC metric * Add multiclass_auroc doc reference * Update CHANGELOG * formatting * Shorter error message regex match in tests * Set num classes as pytest parameter * formatting * Update CHANGELOG Co-authored-by: Jirka Borovec Co-authored-by: Nicki Skafte --- CHANGELOG.md | 2 + docs/source/metrics.rst | 7 +++ .../metrics/functional/__init__.py | 1 + .../metrics/functional/classification.py | 61 ++++++++++++++++++- .../metrics/functional/test_classification.py | 42 +++++++++++++ 5 files changed, 111 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c102be008d0f3..ccd296d3b0e6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) +- Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236)) + ### Changed - W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index c232ea216e885..de3cd01c33e9b 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -271,6 +271,13 @@ auroc [func] :noindex: +multiclass_auroc [func] +~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.metrics.functional.classification.multiclass_auroc + :noindex: + + average_precision [func] ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pytorch_lightning/metrics/functional/__init__.py b/pytorch_lightning/metrics/functional/__init__.py index 189e3945e1a56..620072b44a2da 100644 --- a/pytorch_lightning/metrics/functional/__init__.py +++ b/pytorch_lightning/metrics/functional/__init__.py @@ -21,6 +21,7 @@ fbeta_score, multiclass_precision_recall_curve, multiclass_roc, + multiclass_auroc, precision, precision_recall, precision_recall_curve, diff --git a/pytorch_lightning/metrics/functional/classification.py b/pytorch_lightning/metrics/functional/classification.py index 51b80308a579c..aec1b47096e26 100644 --- a/pytorch_lightning/metrics/functional/classification.py +++ b/pytorch_lightning/metrics/functional/classification.py @@ -817,13 +817,14 @@ def new_func(*args, **kwargs) -> torch.Tensor: def multiclass_auc_decorator(reorder: bool = True) -> Callable: def wrapper(func_to_decorate: Callable) -> Callable: + @wraps(func_to_decorate) def new_func(*args, **kwargs) -> torch.Tensor: results = [] for class_result in func_to_decorate(*args, **kwargs): x, y = class_result[:2] results.append(auc(x, y, reorder=reorder)) - return torch.cat(results) + return torch.stack(results) return new_func @@ -858,7 +859,7 @@ def auroc( if any(target > 1): raise ValueError('AUROC metric is meant for binary classification, but' ' target tensor contains value different from 0 and 1.' 
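As context for the ``multiclass_auc_decorator`` change above: ``auc()`` returns zero-dimensional tensors, which ``torch.cat`` cannot concatenate, while ``torch.stack`` assembles them into a 1-d tensor that ``torch.mean`` can then reduce. A standalone illustration (plain PyTorch; the values are made up):

.. code-block:: python

    import torch

    per_class = [torch.tensor(0.5), torch.tensor(0.7)]  # 0-d scores, one per class
    print(torch.stack(per_class))  # tensor([0.5000, 0.7000])
    # torch.cat(per_class) would raise: zero-dimensional tensor cannot be concatenated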
-                         ' Multiclass is currently not supported.')
+                         ' Use `multiclass_auroc` for multi-class classification.')

    @auc_decorator(reorder=True)
    def _auroc(pred, target, sample_weight, pos_label):
@@ -867,6 +868,62 @@ def _auroc(pred, target, sample_weight, pos_label):
    return _auroc(pred=pred, target=target, sample_weight=sample_weight, pos_label=pos_label)


+def multiclass_auroc(
+        pred: torch.Tensor,
+        target: torch.Tensor,
+        sample_weight: Optional[Sequence] = None,
+        num_classes: Optional[int] = None,
+) -> torch.Tensor:
+    """
+    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from multiclass
+    prediction scores
+
+    Args:
+        pred: estimated probabilities, with shape [N, C]
+        target: ground-truth labels, with shape [N,]
+        sample_weight: sample weights
+        num_classes: number of classes (default: None, computes automatically from data)
+
+    Return:
+        Tensor containing ROC AUC score
+
+    Example:
+
+        >>> pred = torch.tensor([[0.85, 0.05, 0.05, 0.05],
+        ...                      [0.05, 0.85, 0.05, 0.05],
+        ...                      [0.05, 0.05, 0.85, 0.05],
+        ...                      [0.05, 0.05, 0.05, 0.85]])
+        >>> target = torch.tensor([0, 1, 3, 2])
+        >>> multiclass_auroc(pred, target)  # doctest: +NORMALIZE_WHITESPACE
+        tensor(0.6667)
+    """
+    if not torch.allclose(pred.sum(dim=1), torch.tensor(1.0)):
+        raise ValueError(
+            "Multiclass AUROC metric expects the target scores to be"
+            " probabilities, i.e. they should sum up to 1.0 over classes")
+
+    if torch.unique(target).size(0) != pred.size(1):
+        raise ValueError(
+            f"Number of classes found in 'target' ({torch.unique(target).size(0)})"
+            f" does not equal the number of columns in 'pred' ({pred.size(1)})."
+            " Multiclass AUROC is not defined when all of the classes do not"
+            " occur in the target labels.")
+
+    if num_classes is not None and num_classes != pred.size(1):
+        raise ValueError(
+            f"Number of classes deduced from 'pred' ({pred.size(1)}) does not equal"
+            f" the number of classes passed in 'num_classes' ({num_classes}).")
+
+    @multiclass_auc_decorator(reorder=False)
+    def _multiclass_auroc(pred, target, sample_weight, num_classes):
+        return multiclass_roc(pred, target, sample_weight, num_classes)
+
+    class_aurocs = _multiclass_auroc(pred=pred, target=target,
+                                     sample_weight=sample_weight,
+                                     num_classes=num_classes)
+    return torch.mean(class_aurocs)
+
+
def average_precision(
    pred: torch.Tensor,
    target: torch.Tensor,
diff --git a/tests/metrics/functional/test_classification.py b/tests/metrics/functional/test_classification.py
index cab543f8dd6e6..12eb8555b10aa 100644
--- a/tests/metrics/functional/test_classification.py
+++ b/tests/metrics/functional/test_classification.py
@@ -30,6 +30,7 @@
    dice_score,
    average_precision,
    auroc,
+    multiclass_auroc,
    precision_recall_curve,
    roc,
    auc,
@@ -316,6 +317,47 @@ def test_auroc(pred, target, expected):
    assert score == expected


+def test_multiclass_auroc():
+    with pytest.raises(ValueError,
+                       match=r".*probabilities, i.e.
they should sum up to 1.0 over classes"): + _ = multiclass_auroc(pred=torch.tensor([[0.9, 0.9], + [1.0, 0]]), + target=torch.tensor([0, 1])) + + with pytest.raises(ValueError, + match=r".*not defined when all of the classes do not occur in the target.*"): + _ = multiclass_auroc(pred=torch.rand((4, 3)).softmax(dim=1), + target=torch.tensor([1, 0, 1, 0])) + + with pytest.raises(ValueError, + match=r".*does not equal the number of classes passed in 'num_classes'.*"): + _ = multiclass_auroc(pred=torch.rand((5, 4)).softmax(dim=1), + target=torch.tensor([0, 1, 2, 2, 3]), + num_classes=6) + + +@pytest.mark.parametrize('n_cls', [2, 5, 10, 50]) +def test_multiclass_auroc_against_sklearn(n_cls): + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + n_samples = 300 + pred = torch.rand(n_samples, n_cls, device=device).softmax(dim=1) + target = torch.randint(n_cls, (n_samples,), device=device) + # Make sure target includes all class labels so that multiclass AUROC is defined + target[10:10 + n_cls] = torch.arange(n_cls) + + pl_score = multiclass_auroc(pred, target) + # For the binary case, sklearn expects an (n_samples,) array of probabilities of + # the positive class + pred = pred[:, 1] if n_cls == 2 else pred + sk_score = sk_roc_auc_score(target.cpu().detach().numpy(), + pred.cpu().detach().numpy(), + multi_class="ovr") + + sk_score = torch.tensor(sk_score, dtype=torch.float, device=device) + assert torch.allclose(sk_score, pl_score) + + @pytest.mark.parametrize(['x', 'y', 'expected'], [ pytest.param([0, 1], [0, 1], 0.5), pytest.param([1, 0], [0, 1], 0.5), From b50dd12332bf83209d9535c8516486edc1a6b252 Mon Sep 17 00:00:00 2001 From: Kevin Mathew T Date: Sat, 31 Oct 2020 17:45:11 +0530 Subject: [PATCH 25/88] ReduceLROnPlateau doc fixed (#4459) --- docs/source/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 2b8025959c9ca..00e8ef88aa686 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -108,7 +108,7 @@ Every optimizer you use can be paired with any `LearningRateScheduler Date: Sat, 31 Oct 2020 20:17:07 +0630 Subject: [PATCH 26/88] [docker] Lock cuda version (#4453) * lock cuda version * back to normal --- .github/workflows/ci_dockers.yml | 7 +++++-- .github/workflows/docker-builds.yml | 2 +- dockers/base-conda/Dockerfile | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 8e8d56b04d501..1cbc99d1b68b8 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -108,8 +108,11 @@ jobs: pytorch_version: 1.6 - python_version: 3.6 pytorch_version: 1.4 - #- python_version: 3.7 - # pytorch_version: 1.8 # todo + - python_version: 3.7 + pytorch_version: 1.7 + # TODO + # - python_version: 3.7 + # pytorch_version: 1.8 steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 0ba6f701f65d6..673e65d5ccfae 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6] + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 6a7f03970cf75..d11e61d92edbd 100644 --- a/dockers/base-conda/Dockerfile +++ 
b/dockers/base-conda/Dockerfile
@@ -74,7 +74,7 @@ ENV CONDA_ENV=lightning
COPY environment.yml environment.yml

# conda init
-RUN conda create -y --name $CONDA_ENV && \
+RUN conda create -y --name $CONDA_ENV cudatoolkit=${CUDA_VERSION} && \
    conda init bash && \
    # NOTE: this requires that the channel is presented in the yaml before packages
    # replace channel to nightly if needed, fix PT version and remove Horovod as it will be installed later

From 6211fd4b0c0b37532bb2f222a06e439096eb6cfe Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Sat, 31 Oct 2020 20:43:19 +0000
Subject: [PATCH 27/88] Fix type checker issue with explicit cast of ref_model
 object (#4457)

Co-authored-by: Jeff Yang
---
 pytorch_lightning/trainer/properties.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py
index 8d509d41d52bf..42b5f7a36641c 100644
--- a/pytorch_lightning/trainer/properties.py
+++ b/pytorch_lightning/trainer/properties.py
@@ -15,7 +15,7 @@
import os
from abc import ABC
from argparse import ArgumentParser, Namespace
-from typing import List, Optional, Union, Type, TypeVar
+from typing import List, Optional, Union, Type, TypeVar, cast

from pytorch_lightning.callbacks import Callback, ProgressBarBase, ModelCheckpoint
from pytorch_lightning.core.lightning import LightningModule
@@ -154,6 +154,7 @@ def progress_bar_callback(self):
    def progress_bar_dict(self) -> dict:
        """ Read-only for progress bar metrics. """
        ref_model = self.model if not self.data_parallel else self.model.module
+        ref_model = cast(LightningModule, ref_model)
        return dict(**ref_model.get_progress_bar_dict(), **self.logger_connector.progress_bar_metrics)

    @property

From 839813eb7be1fbc614859db5086d6ed0d7fd3340 Mon Sep 17 00:00:00 2001
From: Lezwon Castelino
Date: Sun, 1 Nov 2020 05:34:25 +0530
Subject: [PATCH 28/88] timeout for tpu check (#4340)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* timeout for tpu check

* added tests

* updated CHANGELOG.md

* fixed windows tests

* Update pytorch_lightning/utilities/xla_device_utils.py

Co-authored-by: Jirka Borovec

* requested changes

Co-authored-by: Jirka Borovec
Co-authored-by: Adrian Wälchli
---
 CHANGELOG.md                                  |  2 ++
 .../utilities/xla_device_utils.py             | 16 +++++++++------
 tests/utilities/test_xla_device_utils.py      | 20 ++++++++++++++++---
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ccd296d3b0e6e..b2e4107ef5ff8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -48,6 +48,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656))

+- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340))
+
### Changed

- Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587))

diff --git a/pytorch_lightning/utilities/xla_device_utils.py b/pytorch_lightning/utilities/xla_device_utils.py
index 5687992981ae6..14a59fd105c5a 100644
--- a/pytorch_lightning/utilities/xla_device_utils.py
+++ b/pytorch_lightning/utilities/xla_device_utils.py
@@ -13,6 +13,7 @@
# limitations under the License.
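Standalone, the timeout behaviour added to the decorator in the hunk below can be sketched as follows (a usage sketch only; fork-based multiprocessing on Linux is assumed, and the sleeping probe stands in for a TPU query that hangs):

.. code-block:: python

    import time
    from pytorch_lightning.utilities.xla_device_utils import pl_multi_process

    @pl_multi_process
    def probe():
        time.sleep(60)  # simulate a device query that never returns
        return True

    # the child process is joined for at most 10 seconds; the result queue is
    # still empty at that point, so the wrapper falls back to returning False
    assert probe() is False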
import functools import importlib +import queue as q from multiprocessing import Process, Queue import torch @@ -24,10 +25,10 @@ xm = None -def inner_f(queue, func, **kwargs): # pragma: no cover +def inner_f(queue, func, *args, **kwargs): # pragma: no cover try: - queue.put(func(**kwargs)) - except Exception as _e: + queue.put(func(*args, **kwargs)) + except Exception: import traceback traceback.print_exc() @@ -38,10 +39,13 @@ def pl_multi_process(func): @functools.wraps(func) def wrapper(*args, **kwargs): queue = Queue() - proc = Process(target=inner_f, args=(queue, func,), kwargs=kwargs) + proc = Process(target=inner_f, args=(queue, func, *args), kwargs=kwargs) proc.start() - proc.join() - return queue.get() + proc.join(10) + try: + return queue.get_nowait() + except q.Empty: + return False return wrapper diff --git a/tests/utilities/test_xla_device_utils.py b/tests/utilities/test_xla_device_utils.py index b0a3497a0f3be..10de63db049e7 100644 --- a/tests/utilities/test_xla_device_utils.py +++ b/tests/utilities/test_xla_device_utils.py @@ -11,13 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import time + import pytest -from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils as xdu +import pytorch_lightning.utilities.xla_device_utils as xla_utils from tests.base.develop_utils import pl_multi_process_test try: import torch_xla.core.xla_model as xm + XLA_AVAILABLE = True except ImportError as e: XLA_AVAILABLE = False @@ -26,13 +29,13 @@ @pytest.mark.skipif(XLA_AVAILABLE, reason="test requires torch_xla to be absent") def test_tpu_device_absence(): """Check tpu_device_exists returns None when torch_xla is not available""" - assert xdu.tpu_device_exists() is None + assert xla_utils.XLADeviceUtils.tpu_device_exists() is None @pytest.mark.skipif(not XLA_AVAILABLE, reason="test requires torch_xla to be installed") def test_tpu_device_presence(): """Check tpu_device_exists returns True when TPU is available""" - assert xdu.tpu_device_exists() is True + assert xla_utils.XLADeviceUtils.tpu_device_exists() is True @pytest.mark.skipif(not XLA_AVAILABLE, reason="test requires torch_xla to be installed") @@ -42,3 +45,14 @@ def test_xla_device_is_a_tpu(): device = xm.xla_device() device_type = xm.xla_device_hw(device) return device_type == "TPU" + + +def test_result_returns_within_10_seconds(): + """Check that pl_multi_process returns within 10 seconds""" + + start = time.time() + result = xla_utils.pl_multi_process(time.sleep)(25) + end = time.time() + elapsed_time = int(end - start) + assert elapsed_time <= 10 + assert result is False From d537485e410fcd515ade7aaac5125f040b2d1133 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Mon, 2 Nov 2020 00:47:51 +0630 Subject: [PATCH 29/88] lock pytorch nightly version (#4469) --- environment.yml | 4 ++-- requirements.txt | 2 +- requirements/examples.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 09eb997b78ee6..3d59c1eeed0dd 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: - python>=3.6 - pip>20.1 - numpy>=1.16.4 - - pytorch>=1.3 + - pytorch>=1.3,<1.8 - future>=0.17.1 - PyYAML>=5.1 - tqdm>=4.41.0 @@ -41,7 +41,7 @@ dependencies: - torchtext>=0.3.1 # Examples - - torchvision>=0.4.1 + - torchvision>=0.4.1,<0.9.0 - pip: - test-tube>=0.7.5 diff --git a/requirements.txt b/requirements.txt index 
0f8423e0860f0..d270e2bc5d854 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
# the default package dependencies

numpy>=1.16.4
-torch>=1.3
+torch>=1.3,<1.8
future>=0.17.1  # required for builtins in setup.py
# pyyaml>=3.13
PyYAML>=5.1 # OmegaConf requirement >=5.1
diff --git a/requirements/examples.txt b/requirements/examples.txt
index c87d10a39346f..e930579b8b369 100644
--- a/requirements/examples.txt
+++ b/requirements/examples.txt
@@ -1,2 +1,2 @@
-torchvision>=0.4.1
+torchvision>=0.4.1,<0.9.0
gym>=0.17.0
\ No newline at end of file

From 6ae4c6ec857891f0fdc7d18277b346a7ea6991b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Mon, 2 Nov 2020 06:18:20 +0100
Subject: [PATCH 30/88] update docs on checkpoint_callback Trainer argument
 (#4461)

* docs update

* update callbacks docs

* docs

* notebook examples

* warning

* line length

* update deprecation

Co-authored-by: Sean Naren
Co-authored-by: Roger Shieh <55400948+s-rog@users.noreply.github.com>
---
 docs/source/weights_loading.rst               |  8 ++---
 notebooks/05-trainer-flags-overview.ipynb     |  6 ++--
 .../callbacks/model_checkpoint.py             |  4 +--
 pytorch_lightning/trainer/__init__.py         | 36 +++++++------------
 .../trainer/connectors/callback_connector.py  |  4 +--
 pytorch_lightning/trainer/trainer.py          |  2 +-
 tests/checkpointing/test_model_checkpoint.py  |  4 +--
 tests/test_deprecated.py                      |  4 +--
 8 files changed, 29 insertions(+), 39 deletions(-)

diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst
index bd6900769f30f..e3e4349253a54 100644
--- a/docs/source/weights_loading.rst
+++ b/docs/source/weights_loading.rst
@@ -65,8 +65,8 @@ You can customize the checkpointing behavior to monitor any quantity of your training

    # 3. Init ModelCheckpoint callback, monitoring 'val_loss'
    checkpoint_callback = ModelCheckpoint(monitor='val_loss')

-    # 4. Pass your callback to checkpoint_callback trainer flag
-    trainer = Trainer(checkpoint_callback=checkpoint_callback)
+    # 4. Add your callback to the callbacks list
+    trainer = Trainer(callbacks=[checkpoint_callback])

You can also control more advanced options, like `save_top_k`, to save the best k models and the mode of the monitored quantity (min/max/auto, where the mode is automatically inferred from the name of the monitored quantity), `save_weights_only` or `period` to set the interval of epochs between checkpoints, to avoid slowdowns.
@@ -89,14 +89,14 @@ You can also control more advanced options, like `save_top_k`, to save the best
        save_top_k=3,
        mode='min')

-    trainer = Trainer(checkpoint_callback=checkpoint_callback)
+    trainer = Trainer(callbacks=[checkpoint_callback])

You can retrieve the checkpoint after training by calling

..
code-block:: python checkpoint_callback = ModelCheckpoint(dirpath='my/path/') - trainer = Trainer(checkpoint_callback=checkpoint_callback) + trainer = Trainer(callbacks=[checkpoint_callback]) trainer.fit(model) checkpoint_callback.best_model_path diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb index a070ce03629ba..4589887ecb986 100644 --- a/notebooks/05-trainer-flags-overview.ipynb +++ b/notebooks/05-trainer-flags-overview.ipynb @@ -2223,7 +2223,7 @@ "source": [ "from pytorch_lightning.callbacks import ModelCheckpoint\n", "\n", - "trainer = pl.Trainer(checkpoint_callback=ModelCheckpoint(monitor='val_loss'))\n", + "trainer = pl.Trainer(callbacks=[ModelCheckpoint(monitor='val_loss')])\n", "\n", "trainer.fit(model, train_loader, val_loader)" ], @@ -2265,7 +2265,7 @@ " prefix='',\n", ")\n", "\n", - "trainer = Trainer(checkpoint_callback=checkpoint_callback)\n", + "trainer = Trainer(callbacks=[checkpoint_callback])\n", "\n", "trainer.fit(model, train_loader, val_loader)" ], @@ -2471,7 +2471,7 @@ "# **NOTE: this saves weights to some/path NOT my/path\n", "checkpoint = ModelCheckpoint(filepath='some/path')\n", "trainer = pl.Trainer(\n", - " checkpoint_callback=checkpoint,\n", + " callbacks=[checkpoint],\n", " weights_save_path='my/path'\n", ")\n", "trainer.fit(model, train_loader, val_loader)" diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 1a81ca0980339..33ed1d71d9eb4 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -111,7 +111,7 @@ class ModelCheckpoint(Callback): # saves checkpoints to 'my/path/' at every epoch >>> checkpoint_callback = ModelCheckpoint(dirpath='my/path/') - >>> trainer = Trainer(checkpoint_callback=checkpoint_callback) + >>> trainer = Trainer(callbacks=[checkpoint_callback]) # save epoch and val_loss in name # saves a file like: my/path/sample-mnist-epoch=02-val_loss=0.32.ckpt @@ -123,7 +123,7 @@ class ModelCheckpoint(Callback): # retrieve the best checkpoint after training checkpoint_callback = ModelCheckpoint(dirpath='my/path/') - trainer = Trainer(checkpoint_callback=checkpoint_callback) + trainer = Trainer(callbacks=[checkpoint_callback]) model = ... trainer.fit(model) checkpoint_callback.best_model_path diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 2d0c3e46f2fe0..c0a3744f162fd 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -484,10 +484,7 @@ def training_step(self, batch, batch_idx, optimizer_idx): | -Add a list of :class:`~pytorch_lightning.callbacks.Callback`. These callbacks DO NOT replace the explicit callbacks -(loggers or :class:`~pytorch_lightning.callbacks.ModelCheckpoint`). - -.. note:: Only user defined callbacks (ie: Not :class:`~pytorch_lightning.callbacks.ModelCheckpoint`) +Add a list of :class:`~pytorch_lightning.callbacks.Callback`. .. code-block:: python @@ -537,34 +534,27 @@ def on_train_end(self, trainer, pl_module): | -Pass in a callback for checkpointing. Checkpoints capture the exact value of all parameters used by a model. By default Lightning saves a checkpoint for you in your current working directory, with the state of your last training epoch, -but you can override the default behavior by Initializing the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` callback, -and passing it to :class:`~pytorch_lightning.trainer.Trainer` `checkpoint_callback` flag. 
+Checkpoints capture the exact value of all parameters used by a model. +To disable automatic checkpointing, set this to `False`. .. code-block:: python - from pytorch_lightning.callbacks import ModelCheckpoint + # default used by Trainer + trainer = Trainer(checkpoint_callback=True) - # default used by the Trainer - checkpoint_callback = ModelCheckpoint( - dirpath=os.getcwd(), - save_top_k=True, - verbose=True, - monitor='checkpoint_on', - mode='min', - prefix='' - ) + # turn off automatic checkpointing + trainer = Trainer(checkpoint_callback=False) - trainer = Trainer(checkpoint_callback=checkpoint_callback) -To disable automatic checkpointing, set this to `False`. +You can override the default behavior by initializing the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` +callback, and adding it to the :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks` list. +See :ref:`Saving and Loading Weights ` for how to customize checkpointing. -.. code-block:: python - trainer = Trainer(checkpoint_callback=False) +.. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since + v1.1.0 and will be unsupported from v1.3.0. -See also :ref:`Saving and Loading Weights `. default_root_dir ^^^^^^^^^^^^^^^^ @@ -1529,7 +1519,7 @@ def tbptt_split_batch(self, batch, split_size): # **NOTE: this saves weights to some/path NOT my/path checkpoint = ModelCheckpoint(dirpath='some/path') trainer = Trainer( - checkpoint_callback=checkpoint, + callbacks=[checkpoint], weights_save_path='my/path' ) diff --git a/pytorch_lightning/trainer/connectors/callback_connector.py b/pytorch_lightning/trainer/connectors/callback_connector.py index b8a4276a2d747..c9ef4ae32be77 100644 --- a/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/pytorch_lightning/trainer/connectors/callback_connector.py @@ -56,10 +56,10 @@ def on_trainer_init( def configure_checkpoint_callbacks(self, checkpoint_callback: Union[ModelCheckpoint, bool]): if isinstance(checkpoint_callback, ModelCheckpoint): - # TODO: deprecated, remove this block in v1.4.0 + # TODO: deprecated, remove this block in v1.3.0 rank_zero_warn( "Passing a ModelCheckpoint instance to Trainer(checkpoint_callbacks=...)" - " is deprecated since v1.1 and will no longer be supported in v1.4.", + " is deprecated since v1.1 and will no longer be supported in v1.3.", DeprecationWarning ) self.trainer.callbacks.append(checkpoint_callback) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 008633273a0d1..1f6b97e7892f9 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -174,7 +174,7 @@ def __init__( :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks`. Default: ``True``. .. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since - v1.1.0 and will be unsupported from v1.4.0. + v1.1.0 and will be unsupported from v1.3.0. check_val_every_n_epoch: Check val every n train epochs. 
diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 19705a6ebc9a2..c2b5b7e9fc8a9 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -779,12 +779,12 @@ def test_configure_model_checkpoint(tmpdir): assert trainer.checkpoint_callback == callback1 assert trainer.checkpoint_callbacks == [callback1, callback2] - with pytest.warns(DeprecationWarning, match='will no longer be supported in v1.4'): + with pytest.warns(DeprecationWarning, match='will no longer be supported in v1.3'): trainer = Trainer(checkpoint_callback=callback1, callbacks=[], **kwargs) assert [c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)] == [callback1] assert trainer.checkpoint_callback == callback1 - with pytest.warns(DeprecationWarning, match="will no longer be supported in v1.4"): + with pytest.warns(DeprecationWarning, match="will no longer be supported in v1.3"): trainer = Trainer(checkpoint_callback=callback1, callbacks=[callback2], **kwargs) assert trainer.checkpoint_callback == callback2 assert trainer.checkpoint_callbacks == [callback2, callback1] diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index 67f38568e2103..de3fb63fe9664 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -15,8 +15,8 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException -def test_tbd_remove_in_v1_4_0(tmpdir): - with pytest.deprecated_call(match='will no longer be supported in v1.4'): +def test_tbd_remove_in_v1_3_0(tmpdir): + with pytest.deprecated_call(match='will no longer be supported in v1.3'): callback = ModelCheckpoint() Trainer(checkpoint_callback=callback, callbacks=[], default_root_dir=tmpdir) From 169e8c3d98dc2b17b79bc3746f5cea4754dd2339 Mon Sep 17 00:00:00 2001 From: Muyang Li Date: Mon, 2 Nov 2020 15:30:58 +0800 Subject: [PATCH 31/88] Fix a typo in README (#4474) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fea674bc396b0..33ee544802914 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ trainer = pl.Trainer() trainer.fit(autoencoder, DataLoader(train), DataLoader(val)) ``` -#### And without changing a single line of code, you could run on GPU/TPUss +#### And without changing a single line of code, you could run on GPUs/TPUs ```python # 8 GPUs trainer = Trainer(max_epochs=1, gpus=8) From c2e72c3c86e951dee45a5352de2eeedf5dc09059 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 2 Nov 2020 09:04:50 +0000 Subject: [PATCH 32/88] [BUG-FIX] WandbLogger _sanitize_callable (#4422) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * resolve CodeFormatter * Update pytorch_lightning/loggers/base.py Co-authored-by: Adrian Wälchli Co-authored-by: Adrian Wälchli --- pytorch_lightning/loggers/base.py | 2 +- tests/loggers/test_wandb.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index cf0b22d7d446f..2246d02bc9bcb 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -188,7 +188,7 @@ def _sanitize_callable(val): return val.__name__ return _val except Exception: - return val.__name__ + return getattr(val, "__name__", None) return val return {key: _sanitize_callable(val) for key, val in params.items()} diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index cfb6533bd913b..a7fd986bad642 100644 
--- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -19,7 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger -from tests.base import EvalModelTemplate +from tests.base import EvalModelTemplate, BoringModel @mock.patch('pytorch_lightning.loggers.wandb.wandb') @@ -135,6 +135,8 @@ def return_something(): def wrapper_something(): return return_something + + params.wrapper_something_wo_name = lambda: lambda: '1' params.wrapper_something = wrapper_something assert isinstance(params.gpus, types.FunctionType) @@ -144,3 +146,4 @@ def wrapper_something(): assert params["gpus"] == '_gpus_arg_default' assert params["something"] == "something" assert params["wrapper_something"] == "wrapper_something" + assert params["wrapper_something_wo_name"] == "" From f40d08679d31ef6e705f1e0e5a66473c817325e1 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Mon, 2 Nov 2020 10:46:02 +0000 Subject: [PATCH 33/88] Add manual logging to training_step manual optimization (#4476) --- docs/source/optimizers.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 00e8ef88aa686..1e7baadb64480 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -48,6 +48,10 @@ to manually manage the optimization process. To do so, do the following: opt_d.step() opt_d.zero_grad() + # log losses + self.log('loss_a', loss_a) + self.log('loss_b', loss_b) + .. note:: This is only recommended for experts who need ultimate flexibility Manual optimization does not yet support accumulated gradients but will be live in 1.1.0 From ef03c39ab75dc246edddbff3af84035113141ec7 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 2 Nov 2020 15:05:58 +0100 Subject: [PATCH 34/88] Add step index in checkpoint name (#3807) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * true final value of global step * ch check * tests * save each validation interval * wip * add test * add test * wip * fix tests, revert old edits, fix merge conflicts, update doctests * test + bugfix * sort files * format test * suggestion by ananth * added changelog * naming * docs * example * suggestion Co-authored-by: Carlos Mocholí * fix test * pep * pep Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí --- CHANGELOG.md | 2 + .../callbacks/model_checkpoint.py | 65 ++++++------ pytorch_lightning/trainer/evaluation_loop.py | 7 +- tests/checkpointing/test_model_checkpoint.py | 100 +++++++++++++----- tests/loggers/test_comet.py | 2 +- tests/loggers/test_mlflow.py | 4 +- tests/loggers/test_wandb.py | 2 +- tests/trainer/test_trainer.py | 2 +- 8 files changed, 117 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2e4107ef5ff8..cdb9ddc804d72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
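Backing up for a moment to the manual-optimization docs added in the patch above: the `optimizers.rst` excerpt shows only the loss logging lines in isolation. A hedged sketch of a complete `training_step` that such a snippet could belong to — the module skeleton and the two loss helpers are hypothetical placeholders; only the `self.manual_backward`/`step()`/`self.log` pattern comes from the documented example:

```python
import pytorch_lightning as pl

class TwoOptimizerModel(pl.LightningModule):
    """Sketch only; assumes manual optimization is enabled as in the docs."""

    def training_step(self, batch, batch_idx, optimizer_idx):
        (opt_a, opt_b) = self.optimizers()

        loss_a = self.compute_loss_a(batch)  # hypothetical helper
        self.manual_backward(loss_a, opt_a)
        opt_a.step()
        opt_a.zero_grad()

        loss_b = self.compute_loss_b(batch)  # hypothetical helper
        self.manual_backward(loss_b, opt_b)
        opt_b.step()
        opt_b.zero_grad()

        # log each loss so it still reaches the logger/progress bar,
        # since nothing is returned under manual optimization
        self.log('loss_a', loss_a)
        self.log('loss_b', loss_b)
```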
- Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236)) +- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) + ### Changed - W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 33ed1d71d9eb4..f3eabf5611cf0 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -101,7 +101,7 @@ class ModelCheckpoint(Callback): ... filename='{epoch}-{val_loss:.2f}-{other_metric:.2f}' ... ) - By default, filename is ``None`` and will be set to ``'{epoch}'``. + By default, filename is ``None`` and will be set to ``'{epoch}-{step}'``. Example:: @@ -222,16 +222,16 @@ def save_checkpoint(self, trainer, pl_module): monitor_candidates = self._monitor_candidates(trainer) # ie: path/val_loss=0.5.ckpt - filepath = self._get_metric_interpolated_filepath_name(epoch, monitor_candidates) + filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, epoch, global_step) # callback supports multiple simultaneous modes # here we call each mode sequentially # Mode 1: save all checkpoints OR only the top k if self.save_top_k: - self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, epoch, filepath) + self._save_top_k_checkpoints(monitor_candidates, trainer, pl_module, filepath) # Mode 2: save the last checkpoint - self._save_last_checkpoint(trainer, pl_module, epoch, monitor_candidates, filepath) + self._save_last_checkpoint(trainer, pl_module, monitor_candidates, filepath) def __validate_init_configuration(self): if self.save_top_k is not None and self.save_top_k < -1: @@ -360,16 +360,17 @@ def _format_checkpoint_name( cls, filename: Optional[str], epoch: int, + step: int, metrics: Dict[str, Any], prefix: str = "", ) -> str: if not filename: # filename is not set, use default name - filename = "{epoch}" + filename = "{epoch}-{step}" # check and parse user passed keys in the string groups = re.findall(r"(\{.*?)[:\}]", filename) if len(groups) >= 0: - metrics["epoch"] = epoch + metrics.update({"epoch": epoch, 'step': step}) for group in groups: name = group[1:] filename = filename.replace(group, name + "={" + name) @@ -379,7 +380,7 @@ def _format_checkpoint_name( return cls.CHECKPOINT_JOIN_CHAR.join([txt for txt in (prefix, filename) if txt]) def format_checkpoint_name( - self, epoch: int, metrics: Dict[str, Any], ver: Optional[int] = None + self, epoch: int, step: int, metrics: Dict[str, Any], ver: Optional[int] = None ) -> str: """Generate a filename according to the defined template. 
@@ -387,24 +388,24 @@ def format_checkpoint_name( >>> tmpdir = os.path.dirname(__file__) >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}') - >>> os.path.basename(ckpt.format_checkpoint_name(0, {})) + >>> os.path.basename(ckpt.format_checkpoint_name(0, 1, metrics={})) 'epoch=0.ckpt' >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch:03d}') - >>> os.path.basename(ckpt.format_checkpoint_name(5, {})) + >>> os.path.basename(ckpt.format_checkpoint_name(5, 2, metrics={})) 'epoch=005.ckpt' >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}-{val_loss:.2f}') - >>> os.path.basename(ckpt.format_checkpoint_name(2, dict(val_loss=0.123456))) + >>> os.path.basename(ckpt.format_checkpoint_name(2, 3, metrics=dict(val_loss=0.123456))) 'epoch=2-val_loss=0.12.ckpt' >>> ckpt = ModelCheckpoint(dirpath=tmpdir, filename='{missing:d}') - >>> os.path.basename(ckpt.format_checkpoint_name(0, {})) + >>> os.path.basename(ckpt.format_checkpoint_name(0, 4, metrics={})) 'missing=0.ckpt' - >>> ckpt = ModelCheckpoint(filename='{epoch}') - >>> os.path.basename(ckpt.format_checkpoint_name(0, {})) - 'epoch=0.ckpt' + >>> ckpt = ModelCheckpoint(filename='{step}') + >>> os.path.basename(ckpt.format_checkpoint_name(0, 0, {})) + 'step=0.ckpt' """ filename = self._format_checkpoint_name( - self.filename, epoch, metrics, prefix=self.prefix + self.filename, epoch, step, metrics, prefix=self.prefix ) if ver is not None: filename = self.CHECKPOINT_JOIN_CHAR.join((filename, f"v{ver}")) @@ -479,13 +480,11 @@ def _validate_monitor_key(self, trainer): ) raise MisconfigurationException(m) - def _get_metric_interpolated_filepath_name(self, epoch, ckpt_name_metrics): - filepath = self.format_checkpoint_name(epoch, ckpt_name_metrics) + def _get_metric_interpolated_filepath_name(self, ckpt_name_metrics: Dict[str, Any], epoch: int, step: int): + filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics) version_cnt = 0 while self._fs.exists(filepath): - filepath = self.format_checkpoint_name( - epoch, ckpt_name_metrics, ver=version_cnt - ) + filepath = self.format_checkpoint_name(epoch, step, ckpt_name_metrics, ver=version_cnt) # this epoch called before version_cnt += 1 return filepath @@ -494,9 +493,10 @@ def _monitor_candidates(self, trainer): ckpt_name_metrics = deepcopy(trainer.logger_connector.logged_metrics) ckpt_name_metrics.update(trainer.logger_connector.callback_metrics) ckpt_name_metrics.update(trainer.logger_connector.progress_bar_metrics) + ckpt_name_metrics.update({"step": trainer.global_step, "epoch": trainer.current_epoch}) return ckpt_name_metrics - def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, filepath): + def _save_last_checkpoint(self, trainer, pl_module, ckpt_name_metrics, filepath): should_save_last = self.monitor is None or self.save_last if not should_save_last: return @@ -506,7 +506,11 @@ def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, fi # when user ALSO asked for the 'last.ckpt' change the name if self.save_last: last_filepath = self._format_checkpoint_name( - self.CHECKPOINT_NAME_LAST, epoch, ckpt_name_metrics, prefix=self.prefix + self.CHECKPOINT_NAME_LAST, + trainer.current_epoch, + trainer.global_step, + ckpt_name_metrics, + prefix=self.prefix ) last_filepath = os.path.join(self.dirpath, f"{last_filepath}.ckpt") @@ -523,17 +527,19 @@ def _save_last_checkpoint(self, trainer, pl_module, epoch, ckpt_name_metrics, fi if self.monitor is None: self.best_model_path = self.last_model_path - def 
_save_top_k_checkpoints(self, metrics, trainer, pl_module, epoch, filepath): + def _save_top_k_checkpoints(self, metrics, trainer, pl_module, filepath): current = metrics.get(self.monitor) + epoch = metrics.get("epoch") + step = metrics.get("step") if not isinstance(current, torch.Tensor) and current is not None: current = torch.tensor(current, device=pl_module.device) if self.check_monitor_top_k(current): - self._update_best_and_save(filepath, current, epoch, trainer, pl_module) + self._update_best_and_save(filepath, current, epoch, step, trainer, pl_module) elif self.verbose: rank_zero_info( - f"Epoch {epoch:d}: {self.monitor} was not in top {self.save_top_k}" + f"Epoch {epoch:d}, step {step:d}: {self.monitor} was not in top {self.save_top_k}" ) def _is_valid_monitor_key(self, metrics): @@ -544,11 +550,11 @@ def _update_best_and_save( filepath: str, current: torch.Tensor, epoch: int, + step: int, trainer, pl_module, ): - - k = epoch + 1 if self.save_top_k == -1 else self.save_top_k + k = len(self.best_k_models) + 1 if self.save_top_k == -1 else self.save_top_k del_list = [] if len(self.best_k_models) == k and k > 0: @@ -575,9 +581,8 @@ def _update_best_and_save( if self.verbose: rank_zero_info( - f"Epoch {epoch:d}: {self.monitor} reached" - f" {current:0.5f} (best {self.best_model_score:0.5f})," - f" saving model to {filepath} as top {k}" + f"Epoch {epoch:d}, global step {step:d}: {self.monitor} reached {current:0.5f}" + f' (best {self.best_model_score:0.5f}), saving model to "{filepath}" as top {k}' ) self._save_model(filepath, trainer, pl_module) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 9dab036583dd8..ffc72f8f0022e 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -250,9 +250,10 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): # depre warning if eval_results is not None and user_reduced: step = 'testing_epoch_end' if self.testing else 'validation_epoch_end' - m = f'The {step} should not return anything as of 9.1.' \ - f'to log, use self.log(...) or self.write(...) directly in the LightningModule' - self.warning_cache.warn(m) + self.warning_cache.warn( + f'The {step} should not return anything as of 9.1.' + ' To log, use self.log(...) or self.write(...) 
directly in the LightningModule' + ) if using_eval_result and not user_reduced: eval_results = self.__auto_reduce_result_objs(outputs) diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index c2b5b7e9fc8a9..d3d5f67bcfeaa 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -100,7 +100,7 @@ def test_model_checkpoint_to_yaml(tmpdir, save_top_k): path_yaml = os.path.join(tmpdir, 'best_k_models.yaml') checkpoint.to_yaml(path_yaml) d = yaml.full_load(open(path_yaml, 'r')) - best_k = {k: v.item() for k, v in checkpoint.best_k_models.items()} + best_k = {k: v for k, v in checkpoint.best_k_models.items()} assert d == best_k @@ -185,67 +185,72 @@ def test_model_checkpoint_no_extraneous_invocations(tmpdir): def test_model_checkpoint_format_checkpoint_name(tmpdir): # empty filename: - ckpt_name = ModelCheckpoint._format_checkpoint_name('', 3, {}) - assert ckpt_name == 'epoch=3' + ckpt_name = ModelCheckpoint._format_checkpoint_name('', 3, 2, {}) + assert ckpt_name == 'epoch=3-step=2' - ckpt_name = ModelCheckpoint._format_checkpoint_name(None, 3, {}, prefix='test') - assert ckpt_name == 'test-epoch=3' + ckpt_name = ModelCheckpoint._format_checkpoint_name(None, 3, 2, {}, prefix='test') + assert ckpt_name == 'test-epoch=3-step=2' # no groups case: - ckpt_name = ModelCheckpoint._format_checkpoint_name('ckpt', 3, {}, prefix='test') + ckpt_name = ModelCheckpoint._format_checkpoint_name('ckpt', 3, 2, {}, prefix='test') assert ckpt_name == 'test-ckpt' # no prefix - ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch:03d}-{acc}', 3, {'acc': 0.03}) + ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch:03d}-{acc}', 3, 2, {'acc': 0.03}) assert ckpt_name == 'epoch=003-acc=0.03' # prefix char_org = ModelCheckpoint.CHECKPOINT_JOIN_CHAR ModelCheckpoint.CHECKPOINT_JOIN_CHAR = '@' - ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch},{acc:.5f}', 3, {'acc': 0.03}, prefix='test') + ckpt_name = ModelCheckpoint._format_checkpoint_name('{epoch},{acc:.5f}', 3, 2, {'acc': 0.03}, prefix='test') assert ckpt_name == 'test@epoch=3,acc=0.03000' ModelCheckpoint.CHECKPOINT_JOIN_CHAR = char_org # no dirpath set - ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath=None).format_checkpoint_name(3, {}) - assert ckpt_name == 'epoch=3.ckpt' - ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='').format_checkpoint_name(5, {}) - assert ckpt_name == 'epoch=5.ckpt' + ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath=None).format_checkpoint_name(3, 2, {}) + assert ckpt_name == 'epoch=3-step=2.ckpt' + ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='').format_checkpoint_name(5, 4, {}) + assert ckpt_name == 'epoch=5-step=4.ckpt' # CWD - ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='.').format_checkpoint_name(3, {}) - assert ckpt_name == str(Path('.').resolve() / 'epoch=3.ckpt') + ckpt_name = ModelCheckpoint(monitor='early_stop_on', dirpath='.').format_checkpoint_name(3, 4, {}) + assert ckpt_name == str(Path('.').resolve() / 'epoch=3-step=4.ckpt') # with ver ckpt_name = ModelCheckpoint( monitor='early_stop_on', dirpath=tmpdir, filename='name', prefix='test' - ).format_checkpoint_name(3, {}, ver=3) + ).format_checkpoint_name(3, 2, {}, ver=3) assert ckpt_name == tmpdir / 'test-name-v3.ckpt' # using slashes ckpt_name = ModelCheckpoint( monitor='early_stop_on', dirpath=None, filename='{epoch}_{val/loss:.5f}' - ).format_checkpoint_name(4, {'val/loss': 
0.03}) + ).format_checkpoint_name(4, 3, {'val/loss': 0.03}) assert ckpt_name == 'epoch=4_val/loss=0.03000.ckpt' # TODO: Checks with filepath. To be removed in v1.2 # CWD - ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath='.').format_checkpoint_name(3, {}) - assert ckpt_name == str(Path('.').resolve() / 'epoch=3.ckpt') + ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath='.').format_checkpoint_name(3, 2, {}) + assert ckpt_name == str(Path('.').resolve() / 'epoch=3-step=2.ckpt') # dir does not exist so it is used as filename filepath = tmpdir / 'dir' - ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath=filepath, prefix='test').format_checkpoint_name(3, {}) + ckpt_name = ModelCheckpoint( + monitor='early_stop_on', filepath=filepath, prefix='test' + ).format_checkpoint_name(3, 2, {}) assert ckpt_name == tmpdir / 'test-dir.ckpt' # now, dir exists os.mkdir(filepath) - ckpt_name = ModelCheckpoint(monitor='early_stop_on', filepath=filepath, prefix='test').format_checkpoint_name(3, {}) - assert ckpt_name == filepath / 'test-epoch=3.ckpt' + ckpt_name = ModelCheckpoint( + monitor='early_stop_on', filepath=filepath, prefix='test' + ).format_checkpoint_name(3, 2, {}) + assert ckpt_name == filepath / 'test-epoch=3-step=2.ckpt' def test_model_checkpoint_save_last(tmpdir): """Tests that save_last produces only one last checkpoint.""" + seed_everything() model = EvalModelTemplate() epochs = 3 ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last-{epoch}' @@ -257,10 +262,15 @@ def test_model_checkpoint_save_last(tmpdir): logger=False, ) trainer.fit(model) - last_filename = model_checkpoint._format_checkpoint_name(ModelCheckpoint.CHECKPOINT_NAME_LAST, epochs - 1, {}) + last_filename = model_checkpoint._format_checkpoint_name( + ModelCheckpoint.CHECKPOINT_NAME_LAST, trainer.current_epoch, trainer.global_step, {} + ) last_filename = last_filename + '.ckpt' assert str(tmpdir / last_filename) == model_checkpoint.last_model_path - assert set(os.listdir(tmpdir)) == set([f'epoch={i}.ckpt' for i in range(epochs)] + [last_filename]) + assert set(os.listdir(tmpdir)) == set( + [f"epoch={i}-step={j}.ckpt" for i, j in zip(range(epochs), [9, 19, 29])] + [last_filename] + ) + ModelCheckpoint.CHECKPOINT_NAME_LAST = 'last' @@ -295,6 +305,7 @@ def test_none_monitor_save_last(tmpdir): def test_model_checkpoint_none_monitor(tmpdir): """ Test that it is possible to save all checkpoints when monitor=None. 
""" + seed_everything() model = EvalModelTemplate() model.validation_step = model.validation_step_no_monitor model.validation_epoch_end = model.validation_epoch_end_no_monitor @@ -311,13 +322,13 @@ def test_model_checkpoint_none_monitor(tmpdir): # these should not be set if monitor is None assert checkpoint_callback.monitor is None - assert checkpoint_callback.best_model_path == checkpoint_callback.last_model_path == tmpdir / 'epoch=1.ckpt' + assert checkpoint_callback.best_model_path == checkpoint_callback.last_model_path == tmpdir / 'epoch=1-step=19.ckpt' assert checkpoint_callback.best_model_score == 0 assert checkpoint_callback.best_k_models == {} assert checkpoint_callback.kth_best_model_path == '' # check that the correct ckpts were created - expected = [f'epoch={e}.ckpt' for e in range(epochs)] + expected = [f'epoch={i}-step={j}.ckpt' for i, j in zip(range(epochs), [9, 19])] assert set(os.listdir(tmpdir)) == set(expected) @@ -325,13 +336,14 @@ def test_model_checkpoint_none_monitor(tmpdir): def test_model_checkpoint_period(tmpdir, period): model = EvalModelTemplate() epochs = 5 - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, save_top_k=-1, period=period) + checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, filename='{epoch}', save_top_k=-1, period=period) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, max_epochs=epochs, limit_train_batches=0.1, limit_val_batches=0.1, + val_check_interval=1.0, logger=False, ) trainer.fit(model) @@ -372,12 +384,19 @@ def validation_epoch_end(self, outputs): return {'epoch': self.current_epoch} model = CustomModel() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="epoch", mode='max', save_top_k=-1) + checkpoint_callback = ModelCheckpoint( + dirpath=tmpdir, + filename="{epoch}", + monitor="epoch", + mode='max', + save_top_k=-1, + ) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, max_epochs=epochs, logger=False, + val_check_interval=1.0, ) trainer.fit(model) @@ -439,7 +458,7 @@ def test_default_checkpoint_behavior(tmpdir): # make sure the checkpoint we saved has the metric in the name ckpts = os.listdir(os.path.join(tmpdir, 'lightning_logs', 'version_0', 'checkpoints')) assert len(ckpts) == 1 - assert ckpts[0] == 'epoch=2.ckpt' + assert ckpts[0] == 'epoch=2-step=14.ckpt' def test_ckpt_metric_names_results(tmpdir): @@ -497,7 +516,7 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): model = EvalModelTemplate() num_epochs = 3 model_checkpoint = ModelCheckpoint( - monitor='early_stop_on', dirpath=tmpdir, save_top_k=num_epochs, save_last=True + monitor='early_stop_on', dirpath=tmpdir, filename="{epoch}", save_top_k=num_epochs, save_last=True ) trainer = Trainer( default_root_dir=tmpdir, @@ -509,6 +528,7 @@ def test_model_checkpoint_save_last_checkpoint_contents(tmpdir): path_last_epoch = str(tmpdir / f"epoch={num_epochs - 1}.ckpt") path_last = str(tmpdir / "last.ckpt") assert path_last == model_checkpoint.last_model_path + assert os.path.isfile(path_last_epoch) ckpt_last_epoch = torch.load(path_last_epoch) ckpt_last = torch.load(path_last) @@ -791,3 +811,25 @@ def test_configure_model_checkpoint(tmpdir): with pytest.raises(MisconfigurationException, match="checkpoint_callback=False but found ModelCheckpoint"): Trainer(checkpoint_callback=False, callbacks=[callback1], **kwargs) + + +def test_val_check_interval_checkpoint_files(tmpdir): + """ Test correct checkpoint naming when validating/checkpointing multiple times per epoch. 
""" + model = EvalModelTemplate() + model_checkpoint = ModelCheckpoint( + dirpath=tmpdir, + save_top_k=-1, + monitor="val_acc", + mode="max", + verbose=True + ) + trainer = Trainer( + default_root_dir=tmpdir, + val_check_interval=0.2, + max_epochs=1, + limit_train_batches=10, + callbacks=[model_checkpoint] + ) + trainer.fit(model) + files = sorted([p.name for p in Path(tmpdir).glob("*.ckpt")]) + assert files == [f"epoch=0-step={s}.ckpt" for s in [1, 3, 5, 7, 9]] diff --git a/tests/loggers/test_comet.py b/tests/loggers/test_comet.py index 87af510e49219..fc61829645b6e 100644 --- a/tests/loggers/test_comet.py +++ b/tests/loggers/test_comet.py @@ -159,7 +159,7 @@ def test_comet_logger_dirs_creation(comet, comet_experiment, tmpdir, monkeypatch trainer.fit(model) assert trainer.checkpoint_callback.dirpath == (tmpdir / 'test' / "1" / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'} @patch('pytorch_lightning.loggers.comet.comet_ml') diff --git a/tests/loggers/test_mlflow.py b/tests/loggers/test_mlflow.py index db2c353dc4e2c..a200fbf549e6a 100644 --- a/tests/loggers/test_mlflow.py +++ b/tests/loggers/test_mlflow.py @@ -115,7 +115,7 @@ def test_mlflow_log_dir(client, mlflow, tmpdir): ) trainer.fit(model) assert trainer.checkpoint_callback.dirpath == (tmpdir / "exp-id" / "run-id" / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=0.ckpt'} def test_mlflow_logger_dirs_creation(tmpdir): @@ -143,7 +143,7 @@ def test_mlflow_logger_dirs_creation(tmpdir): assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics') assert set(os.listdir(tmpdir / exp_id / run_id / 'params')) == model.hparams.keys() assert trainer.checkpoint_callback.dirpath == (tmpdir / exp_id / run_id / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'} @mock.patch('pytorch_lightning.loggers.mlflow.mlflow') diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index a7fd986bad642..468ca819f91b1 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -116,7 +116,7 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir): trainer.fit(model) assert trainer.checkpoint_callback.dirpath == str(tmpdir / 'project' / version / 'checkpoints') - assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0-step=9.ckpt'} def test_wandb_sanitize_callable_params(tmpdir): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 35257e28704ba..6fceae4b5e59d 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -430,7 +430,7 @@ def mock_save_function(filepath, *args): losses = [10, 9, 2.8, 5, 2.5] checkpoint_callback = ModelCheckpoint( - dirpath=tmpdir, monitor='checkpoint_on', save_top_k=save_top_k, + dirpath=tmpdir, filename='{epoch}', monitor='checkpoint_on', save_top_k=save_top_k, save_last=save_last, prefix=file_prefix, verbose=1 ) checkpoint_callback.save_function = mock_save_function From 9b8102d1a53d3b395ed7e1670b7430bc59461f4e Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 2 Nov 2020 15:23:24 +0000 Subject: [PATCH 35/88] [DOC] Clarify `tpu_cores` training. 
(#4475) * better explanation around tpu_cores * more details on tpu training * Apply suggestions from code review Co-authored-by: Jeff Yang --- docs/source/tpu.rst | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst index ffebe41a9c856..5f4c48076d813 100644 --- a/docs/source/tpu.rst +++ b/docs/source/tpu.rst @@ -128,13 +128,27 @@ That's it! Your model will train on all 8 TPU cores. ---------------- -Single TPU core training +TPU core training + ------------------------ -Lightning supports training on a single TPU core. Just pass the TPU core ID [1-8] in a list. + +Lightning supports training on a single TPU core or on 8 TPU cores. + +The Trainer parameter ``tpu_cores`` defines how many TPU cores to train on (1 or 8), or which single TPU core to train on when given as a one-element list (e.g. [1]). + +For single TPU training, just pass the TPU core ID [1-8] in a list. + +Single TPU core training. The model will train on TPU core ID 5. .. code-block:: python - trainer = pl.Trainer(tpu_cores=[1]) + trainer = pl.Trainer(tpu_cores=[5]) + +8 TPU core training. The model will train on all 8 TPU cores. + +.. code-block:: python + + trainer = pl.Trainer(tpu_cores=8) ---------------- From 102fa9ee7dd087ee167b120ea7812360928408f7 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 2 Nov 2020 16:36:48 +0000 Subject: [PATCH 36/88] [BUGFIX] AMP + Precision unscale grad (#4441) * move unscale within Native plugin * remove gradient tracking from lightning backward * forgot trainer.fit * typo * update * cleanup * set to 1.6 * typo * skip if below 1.6 strict * update changelog * remove useless code * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren * Update tests/plugins/test_amp_plugin.py Co-authored-by: Sean Naren * update changelog * Update CHANGELOG.md Co-authored-by: Sean Naren Co-authored-by: Jeff Yang --- CHANGELOG.md | 10 ++- pytorch_lightning/accelerators/accelerator.py | 5 -- pytorch_lightning/core/lightning.py | 1 - pytorch_lightning/plugins/native_amp.py | 5 ++ pytorch_lightning/trainer/training_loop.py | 25 +++++--- tests/plugins/test_amp_plugin.py | 62 +++++++++++++++++++ 6 files changed, 90 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdb9ddc804d72..84d483dd03f2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
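As an aside on the checkpoint-naming change from #3807 shown earlier: the default filename template becomes `'{epoch}-{step}'`, and each placeholder is rewritten into a `name=value` pair. A standalone sketch of that interpolation, simplified relative to `_format_checkpoint_name` (which additionally handles prefixes and the join character); the expected outputs mirror the doctests in the patch:

```python
import re

def format_name(template, metrics):
    """Turn '{epoch}-{step}' plus a metrics dict into 'epoch=3-step=14'."""
    groups = re.findall(r"(\{.*?)[:\}]", template)
    for group in groups:
        name = group[1:]
        # prepend 'name=' so str.format labels each interpolated value
        template = template.replace(group, name + "={" + name)
        metrics.setdefault(name, 0)  # missing keys default to 0
    return template.format(**metrics)

print(format_name("{epoch}-{step}", {"epoch": 3, "step": 14}))      # epoch=3-step=14
print(format_name("{epoch:03d}-{acc}", {"epoch": 5, "acc": 0.03}))  # epoch=005-acc=0.03
print(format_name("{missing:d}", {}))                               # missing=0
```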
- Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236)) +- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) + - Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) ### Changed - W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) +- Hook `on_after_backward` is called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) + +- Moved `track_and_norm_grad` into `training loop` and called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) + ### Deprecated - Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) @@ -33,6 +39,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) +- Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) ## [1.0.4] - 2020-10-27 @@ -50,8 +57,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) -- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) - ### Changed - Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) @@ -78,7 +83,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) - ## [1.0.3] - 2020-10-20 ### Added diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8e1969cc9368e..8ece6c4ec1b10 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -131,11 +131,6 @@ def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): - - if self.trainer.amp_backend == AMPType.NATIVE: - self.trainer.scaler.unscale_(optimizer) - - # apply clip gradients # TODO: separate TPU case from here self._clip_gradients(optimizer, clip_val) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 22d63d0a03a74..d7125eb171a9c 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1101,7 +1101,6 @@ def backward(self, loss, optimizer, optimizer_idx): """ loss.backward(*args, **kwargs) - self.trainer.train_loop.track_and_norm_grad(optimizer=optimizer) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): """ diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 6506540bde6e1..b016b6c5d24fb 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -38,6 +38,11 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): # once backward has been applied, release graph closure_loss = closure_loss.detach() + + # unscale gradient to allow analyze within `on_after_backward` + if not self.trainer.train_loop.should_accumulate(): + self.trainer.scaler.unscale_(optimizer) + return closure_loss def training_step(self, fx, args): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index d1dfb3eec3733..0d269c333b269 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -652,11 +652,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): if response == -1: return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) - # checks if backward or backward + optimizer step (via closure) - accumulation_done = self._accumulated_batches_reached() - is_final_batch = self._num_training_batches_reached() - should_accumulate = not (accumulation_done or is_final_batch) - # lightning module hook splits = self.tbptt_split_batch(batch) @@ -676,7 +671,7 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): model = self.trainer.get_model() model.toggle_optimizer(optimizer, opt_idx) - if should_accumulate: + if self.should_accumulate(): # For gradient accumulation # ------------------- @@ -767,7 +762,7 @@ def train_step_and_backward_closure(): @contextmanager def block_ddp_sync_behaviour(self): if isinstance(self.trainer.model, torch.nn.parallel.DistributedDataParallel): - yield from self.trainer.model.no_sync() + yield self.trainer.model.no_sync() else: yield @@ -817,8 +812,10 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, with self.trainer.profiler.profile("model_backward"): self.backward(result, optimizer, opt_idx) - # hook - self.on_after_backward(result.training_step_output, batch_idx, result.loss) + # hook - call this hook only + # when gradients have 
finished to accumulate + if not self.should_accumulate(): + self.on_after_backward(result.training_step_output, batch_idx, result.loss) # check if loss or model weights are nan if self.trainer.terminate_on_nan: @@ -837,6 +834,10 @@ def backward(self, result, optimizer, opt_idx, *args, **kwargs): result.closure_loss, optimizer, opt_idx, *args, **kwargs ) + if not self.should_accumulate(): + # track gradients + self.track_and_norm_grad(optimizer=optimizer) + def update_train_loop_lr_schedulers(self, monitor_metrics=None): num_accumulated_batches_reached = self._accumulated_batches_reached() num_training_batches_reached = self._num_training_batches_reached() @@ -863,6 +864,12 @@ def _accumulated_batches_reached(self): def _num_training_batches_reached(self): return (self.trainer.batch_idx + 1) == self.trainer.num_training_batches + def should_accumulate(self): + # checks if backward or backward + optimizer step (via closure) + accumulation_done = self._accumulated_batches_reached() + is_final_batch = self._num_training_batches_reached() + return not (accumulation_done or is_final_batch) + def should_check_val_fx(self, batch_idx, is_last_batch): # decide if we should run validation is_val_check_batch = (batch_idx + 1) % self.trainer.val_check_batch == 0 diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index c0d5747b5fc7e..6fd000b61d97f 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -84,3 +84,65 @@ def on_fit_start(self, trainer, pl_module): with pytest.raises(SystemExit): trainer.fit(model) + + +@pytest.mark.skipif( + LooseVersion(torch.__version__) < LooseVersion("1.6.0"), + reason="Minimal PT version is set to 1.6") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_amp_gradient_unscale(tmpdir): + + class ExtendedBoringModel(BoringModel): + + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. + + model = ExtendedBoringModel() + + trainer = Trainer( + max_epochs=2, + default_root_dir=os.getcwd(), + limit_train_batches=2, + limit_test_batches=2, + limit_val_batches=2, + amp_backend='native', + distributed_backend='ddp_spawn', + gpus=2, + precision=16, + track_grad_norm=2, + log_every_n_steps=1 + ) + trainer.fit(model) + + +@pytest.mark.skipif( + LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir): + + class ExtendedBoringModel(BoringModel): + + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. 
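The assertion above only makes sense because the bugfix moves `scaler.unscale_` into the native AMP plugin, so gradients observed in `on_after_backward` (and by clipping) are in their true scale. As a standalone illustration of the underlying `torch.cuda.amp` ordering this enforces — unscale before inspecting or clipping — here is a minimal plain-PyTorch loop (not Lightning internals; requires a CUDA device):

```python
import torch

model = torch.nn.Linear(4, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler()

for _ in range(3):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = model(torch.randn(8, 4, device="cuda")).sum()
    scaler.scale(loss).backward()

    # unscale first, so gradient norms are meaningful for hooks/clipping
    scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)

    scaler.step(optimizer)  # skipped internally if inf/NaN grads were found
    scaler.update()
```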
+ + model = ExtendedBoringModel() + + trainer = Trainer( + max_epochs=2, + default_root_dir=os.getcwd(), + limit_train_batches=2, + limit_test_batches=2, + limit_val_batches=2, + amp_backend='native', + distributed_backend='ddp_spawn', + gpus=2, + precision=16, + track_grad_norm=2, + log_every_n_steps=1, + accumulate_grad_batches=2, + ) + trainer.fit(model) From 66ade19d564cc6420a39cb2a64203b5cd21a4b43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 2 Nov 2020 19:33:37 +0100 Subject: [PATCH 37/88] Rename conflicting test directories (#4451) * logging -> logging_tests * warnings -> warnings_tests Co-authored-by: Jeff Yang Co-authored-by: Rohit Gupta --- tests/trainer/{logging => logging_tests}/__init__.py | 0 .../{logging => logging_tests}/test_distributed_logging.py | 0 .../{logging => logging_tests}/test_eval_loop_logging_1_0.py | 0 .../{logging => logging_tests}/test_train_loop_logging_1_0.py | 0 tests/trainer/{warnings => warnings_tests}/__init__.py | 0 tests/trainer/{warnings => warnings_tests}/test_flow_warnings.py | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename tests/trainer/{logging => logging_tests}/__init__.py (100%) rename tests/trainer/{logging => logging_tests}/test_distributed_logging.py (100%) rename tests/trainer/{logging => logging_tests}/test_eval_loop_logging_1_0.py (100%) rename tests/trainer/{logging => logging_tests}/test_train_loop_logging_1_0.py (100%) rename tests/trainer/{warnings => warnings_tests}/__init__.py (100%) rename tests/trainer/{warnings => warnings_tests}/test_flow_warnings.py (100%) diff --git a/tests/trainer/logging/__init__.py b/tests/trainer/logging_tests/__init__.py similarity index 100% rename from tests/trainer/logging/__init__.py rename to tests/trainer/logging_tests/__init__.py diff --git a/tests/trainer/logging/test_distributed_logging.py b/tests/trainer/logging_tests/test_distributed_logging.py similarity index 100% rename from tests/trainer/logging/test_distributed_logging.py rename to tests/trainer/logging_tests/test_distributed_logging.py diff --git a/tests/trainer/logging/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py similarity index 100% rename from tests/trainer/logging/test_eval_loop_logging_1_0.py rename to tests/trainer/logging_tests/test_eval_loop_logging_1_0.py diff --git a/tests/trainer/logging/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py similarity index 100% rename from tests/trainer/logging/test_train_loop_logging_1_0.py rename to tests/trainer/logging_tests/test_train_loop_logging_1_0.py diff --git a/tests/trainer/warnings/__init__.py b/tests/trainer/warnings_tests/__init__.py similarity index 100% rename from tests/trainer/warnings/__init__.py rename to tests/trainer/warnings_tests/__init__.py diff --git a/tests/trainer/warnings/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py similarity index 100% rename from tests/trainer/warnings/test_flow_warnings.py rename to tests/trainer/warnings_tests/test_flow_warnings.py From 19187d38f936e213e1c2e35b8d2a86f16afff8cd Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 2 Nov 2020 20:44:49 +0100 Subject: [PATCH 38/88] [Metrics] Detach bugfix (#4313) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * detach on buffer * doc update * remove file * changelog * suggestions * Update docs/source/metrics.rst Co-authored-by: Teddy Koker * fix for 4266 * Update docs/source/metrics.rst 
Co-authored-by: Adrian Wälchli * Update CHANGELOG.md Co-authored-by: Teddy Koker Co-authored-by: chaton Co-authored-by: Adrian Wälchli Co-authored-by: Ananya Harsh Jha Co-authored-by: Roger Shieh Co-authored-by: Sean Naren Co-authored-by: Rohit Gupta --- CHANGELOG.md | 2 ++ docs/source/metrics.rst | 14 +++++++++++++- pytorch_lightning/core/step_result.py | 6 +++--- pytorch_lightning/metrics/metric.py | 3 ++- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84d483dd03f2c..95417dd8aa9ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) +- Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) + - Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) ## [1.0.4] - 2020-10-27 diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index de3cd01c33e9b..4fadfaa507168 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -150,6 +150,19 @@ Example implementation: def compute(self): return self.correct.float() / self.total +Metrics support backpropagation if all computations involved in the metric calculation +are differentiable. However, note that the cached state is detached from the computational +graph and cannot be backpropagated. Not doing this would mean storing the computational +graph for each update call, which can lead to out-of-memory errors. +In practice this means that: + +.. code-block:: python + + metric = MyMetric() + val = metric(pred, target) # this value can be backpropagated + val = metric.compute() # this value cannot be backpropagated + + ********** Metric API ********** @@ -453,4 +466,3 @@ embedding_similarity [func] .. autofunction:: pytorch_lightning.metrics.functional.self_supervised.embedding_similarity :noindex: - diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 650c1876d0cd0..a8224f45e3829 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -259,7 +259,7 @@ def get_batch_log_metrics(self, include_forked_originals=True) -> dict: if options['logger'] and options['on_step']: if isinstance(self[k], Metric): - result[k] = self[k]._forward_cache + result[k] = self[k]._forward_cache.detach() else: result[k] = self[k] @@ -281,7 +281,7 @@ def get_epoch_log_metrics(self) -> dict: if options['logger'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute() + result[k] = self[k].compute().detach() else: result[k] = self[k] @@ -307,7 +307,7 @@ def get_epoch_pbar_metrics(self): if options['prog_bar'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute() + result[k] = self[k].compute().detach() else: result[k] = self[k] diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index 3a853be0ebdd5..f003e0d3da72a 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -150,7 +150,8 @@ def forward(self, *args, **kwargs): Automatically calls ``update()``. Returns the metric value over inputs if ``compute_on_step`` is True.
""" # add current step - self.update(*args, **kwargs) + with torch.no_grad(): + self.update(*args, **kwargs) self._forward_cache = None if self.compute_on_step: From ac3f7393fd090484c8230ec375db503819710227 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 2 Nov 2020 20:51:43 +0000 Subject: [PATCH 39/88] [FEAT] logging refactors 1/n (#4439) * introducing new logging object * typo * typo * Update pytorch_lightning/trainer/logging.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * Update pytorch_lightning/trainer/logging.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * update on comments * update on comments * add more doctstring * Update pytorch_lightning/core/lightning.py Co-authored-by: Sean Naren * resolve on comments * solve pyright * Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * update on comments * Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py Co-authored-by: Sean Naren * update on comments Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Sean Naren --- pytorch_lightning/core/lightning.py | 29 +- pytorch_lightning/core/step_result.py | 77 ++- .../connectors/logger_connector/__init__.py | 1 + .../callback_hook_validator.py | 220 ++++++++ .../logger_connector/epoch_result_store.py | 528 ++++++++++++++++++ .../logger_connector.py | 95 +++- pytorch_lightning/trainer/evaluation_loop.py | 15 +- pytorch_lightning/trainer/logging.py | 6 +- pytorch_lightning/trainer/trainer.py | 21 +- pytorch_lightning/trainer/training_loop.py | 57 +- .../trainer/logging/test_logger_connector.py | 249 +++++++++ 11 files changed, 1229 insertions(+), 69 deletions(-) create mode 100644 pytorch_lightning/trainer/connectors/logger_connector/__init__.py create mode 100644 pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py create mode 100644 pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py rename pytorch_lightning/trainer/connectors/{ => logger_connector}/logger_connector.py (84%) create mode 100644 tests/trainer/logging/test_logger_connector.py diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d7125eb171a9c..3f2ca7ce27bfc 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -11,13 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os +import tempfile import collections import copy import inspect -import os import re -import tempfile from abc import ABC from argparse import Namespace from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, Mapping @@ -28,16 +27,17 @@ from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, PRIMITIVE_TYPES, ModelIO +from pytorch_lightning.core.step_result import Result from pytorch_lightning.utilities import rank_zero_warn, AMPType from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.xla_device_utils import XLADeviceUtils from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.core.step_result import Result from pytorch_lightning.utilities.parsing import ( AttributeDict, collect_init_args, get_init_args, ) +from pytorch_lightning.callbacks import Callback from torch import ScriptModule, Tensor from torch.nn import Module from torch.optim.optimizer import Optimizer @@ -111,6 +111,8 @@ def __init__(self, *args, **kwargs): self._datamodule = None self._results: Optional[Result] = None self._current_fx_name = '' + self._current_hook_fx_name = None + self._current_dataloader_idx = None def optimizers(self): opts = self.trainer.optimizers @@ -244,6 +246,18 @@ def log( on_step = self.__auto_choose_log_on_step(on_step) on_epoch = self.__auto_choose_log_on_epoch(on_epoch) + if self._current_hook_fx_name is not None: + self.trainer.logger_connector.check_logging_in_callbacks( + self._current_hook_fx_name, + on_step=on_step, + on_epoch=on_epoch + ) + + # make sure user doesn't introduce logic for multi-dataloaders + if "/dataloader_idx_" in name: + raise MisconfigurationException( + f"Logged key: {name} should not contain information about dataloader_idx.") + self._results.log( name, value, @@ -257,7 +271,8 @@ def log( enable_graph, sync_dist, sync_dist_op, - sync_dist_group + sync_dist_group, + self._current_dataloader_idx, ) def log_dict( @@ -1277,11 +1292,11 @@ def tbptt_split_batch(self, batch, split_size): batch_split = [] for i, x in enumerate(batch): if isinstance(x, torch.Tensor): - split_x = x[:, t : t + split_size] + split_x = x[:, t: t + split_size] elif isinstance(x, collections.Sequence): split_x = [None] * len(x) for batch_idx in range(len(x)): - split_x[batch_idx] = x[batch_idx][t : t + split_size] + split_x[batch_idx] = x[batch_idx][t: t + split_size] batch_split.append(split_x) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index a8224f45e3829..059c724aa75a9 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -124,6 +124,7 @@ def log( sync_dist: bool = False, sync_dist_op: Union[Any, str] = 'mean', sync_dist_group: Optional[Any] = None, + dataloader_idx: Optional[int] = None, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): @@ -144,6 +145,7 @@ def log( # set step version step_name = f'{name}_step' + self.__set_meta( step_name, value, @@ -154,12 +156,15 @@ def log( reduce_fx=reduce_fx, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=False + forked=False, + dataloader_idx=dataloader_idx, ) + self.__setitem__(step_name, value) # set epoch version epoch_name = f'{name}_epoch' + self.__set_meta( epoch_name, value, @@ -170,7 +175,8 @@ def 
log( reduce_fx=reduce_fx, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=False + forked=False, + dataloader_idx=dataloader_idx, ) self.__setitem__(epoch_name, value) @@ -185,7 +191,8 @@ def log( reduce_fx, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=was_forked + forked=was_forked, + dataloader_idx=dataloader_idx, ) # set the value @@ -202,7 +209,8 @@ def __set_meta( reduce_fx: Callable, tbptt_pad_token: int, tbptt_reduce_fx: Callable, - forked: bool + forked: bool, + dataloader_idx: Union[int, None] ): # set the meta for the item meta_value = value @@ -215,7 +223,8 @@ def __set_meta( value=meta_value, tbptt_reduce_fx=tbptt_reduce_fx, tbptt_pad_token=tbptt_pad_token, - forked=forked + forked=forked, + dataloader_idx=dataloader_idx, ) self['meta'][name] = meta @@ -225,13 +234,22 @@ def __set_meta( _internal['_reduce_on_epoch'] = max(_internal['_reduce_on_epoch'], on_epoch) def track_batch_size(self, batch): + batch_size = Result.extract_batch_size(batch) + Result.attach_batch_size(batch_size, self) + + @staticmethod + def extract_batch_size(batch): try: batch_size = Result.unpack_batch_size(batch) except RecursionError as re: batch_size = 1 + return batch_size - meta = self['meta'] - meta['_internal']['batch_sizes'].append(batch_size) + @staticmethod + def attach_batch_size(batch_size: Union[int, None], result: 'Result') -> None: + if batch_size is not None: + meta = result['meta'] + meta['_internal']['batch_sizes'].append(batch_size) def get_batch_sizes(self): meta = self['meta'] @@ -242,7 +260,12 @@ def get_callback_metrics(self) -> dict: return result - def get_batch_log_metrics(self, include_forked_originals=True) -> dict: + def _add_dataloader_idx(self, k: str, dataloader_idx: Union[int, None], add_dataloader_idx: bool) -> str: + if dataloader_idx is not None and add_dataloader_idx: + return f"{k}/dataloader_idx_{dataloader_idx}" + return k + + def get_batch_log_metrics(self, include_forked_originals=True, add_dataloader_idx=False) -> dict: """ Gets the metrics to log at the end of the batch step @@ -257,15 +280,17 @@ def get_batch_log_metrics(self, include_forked_originals=True) -> dict: if options['forked'] and not include_forked_originals: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['logger'] and options['on_step']: if isinstance(self[k], Metric): - result[k] = self[k]._forward_cache.detach() + result[dl_key] = self[k]._forward_cache.detach() else: - result[k] = self[k] + result[dl_key] = self[k] return result - def get_epoch_log_metrics(self) -> dict: + def get_epoch_log_metrics(self, add_dataloader_idx=False) -> dict: """ Gets the metrics to log at the end of epoch """ @@ -279,11 +304,13 @@ def get_epoch_log_metrics(self) -> dict: if options['forked']: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['logger'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute().detach() + result[dl_key] = self[k].compute().detach() else: - result[k] = self[k] + result[dl_key] = self[k] if k in self and not options['on_epoch'] and isinstance(self[k], Metric): # compute metric on epoch anyway so state does not accumulate @@ -291,7 +318,7 @@ def get_epoch_log_metrics(self) -> dict: return result - def get_epoch_pbar_metrics(self): + def get_epoch_pbar_metrics(self, add_dataloader_idx=False): """ Gets the metrics to log at the end of epoch """ @@ -305,11 +332,13 @@ def 
get_epoch_pbar_metrics(self): if options['forked']: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['prog_bar'] and options['on_epoch']: if isinstance(self[k], Metric): - result[k] = self[k].compute().detach() + result[dl_key] = self[k].compute().detach() else: - result[k] = self[k] + result[dl_key] = self[k] if k in self and not options['on_epoch'] and isinstance(self[k], Metric): # compute metric on epoch anyway so state does not accumulate @@ -317,7 +346,7 @@ def get_epoch_pbar_metrics(self): return result - def get_forked_metrics(self): + def get_forked_metrics(self, add_dataloader_idx=False): """ Gets the metrics to log at the end of epoch """ @@ -328,12 +357,14 @@ def get_forked_metrics(self): if k == '_internal': continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['forked']: - result[k] = self[k] + result[dl_key] = self[k] return result - def get_batch_pbar_metrics(self, include_forked_originals=True): + def get_batch_pbar_metrics(self, include_forked_originals=True, add_dataloader_idx=False): """ Gets the metrics to log at the end of the batch step """ @@ -347,11 +378,13 @@ def get_batch_pbar_metrics(self, include_forked_originals=True): if options['forked'] and not include_forked_originals: continue + dl_key = self._add_dataloader_idx(k, options["dataloader_idx"], add_dataloader_idx) + if options['prog_bar'] and options['on_step']: if isinstance(self[k], Metric): - result[k] = self[k]._forward_cache + result[dl_key] = self[k]._forward_cache else: - result[k] = self[k] + result[dl_key] = self[k] return result @@ -473,6 +506,8 @@ def reduce_on_epoch_end(cls, outputs): if option['on_epoch']: fx = option['reduce_fx'] if fx == torch.mean: + if isinstance(result[k], list): + result[k] = torch.tensor(result[k]).float() try: reduced_val = weighted_mean(result[k], batch_sizes) except Exception as e: diff --git a/pytorch_lightning/trainer/connectors/logger_connector/__init__.py b/pytorch_lightning/trainer/connectors/logger_connector/__init__.py new file mode 100644 index 0000000000000..4034840a09b97 --- /dev/null +++ b/pytorch_lightning/trainer/connectors/logger_connector/__init__.py @@ -0,0 +1 @@ +from pytorch_lightning.trainer.connectors.logger_connector.logger_connector import LoggerConnector diff --git a/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py new file mode 100644 index 0000000000000..3ce4b523545c3 --- /dev/null +++ b/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py @@ -0,0 +1,220 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
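The `_add_dataloader_idx` helper threaded through the getters above implements one small rule worth spelling out; a direct sketch of its behavior:

```python
def add_dataloader_idx(key, dataloader_idx, add_dataloader_idx):
    # mirrors _add_dataloader_idx: suffix only when an index exists
    # and the caller opted in
    if dataloader_idx is not None and add_dataloader_idx:
        return f"{key}/dataloader_idx_{dataloader_idx}"
    return key

print(add_dataloader_idx("val_loss", None, True))  # 'val_loss'
print(add_dataloader_idx("val_loss", 1, True))     # 'val_loss/dataloader_idx_1'
# this reserved suffix is also why LightningModule.log() above now rejects
# user-supplied keys containing "/dataloader_idx_"
```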
+
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+
+
+class CallbackHookNameValidator:
+
+    @staticmethod
+    def check_logging_in_callbacks(current_hook_fx_name: str = None, on_step: bool = None,
+                                   on_epoch: bool = None) -> None:
+        if current_hook_fx_name is None:
+            return
+
+        internal_func = getattr(CallbackHookNameValidator, f"_{current_hook_fx_name}_log", None)
+
+        if internal_func is None:
+            return
+
+        current_callback_hook_auth_args = internal_func()
+
+        if current_callback_hook_auth_args is not None:
+            m = "{} function supports only {} in {}. Provided {}"
+            if on_step not in current_callback_hook_auth_args["on_step"]:
+                msg = m.format(current_hook_fx_name, "on_step", current_callback_hook_auth_args["on_step"], on_step)
+                raise MisconfigurationException(msg)
+
+            if on_epoch not in current_callback_hook_auth_args["on_epoch"]:
+                msg = m.format(current_hook_fx_name, "on_epoch", current_callback_hook_auth_args["on_epoch"], on_epoch)
+                raise MisconfigurationException(msg)
+        else:
+            raise MisconfigurationException(
+                f"{current_hook_fx_name} function doesn't support logging using self.log() yet."
+            )
+
+    @staticmethod
+    def _setup_log():
+        """Called when fit or test begins"""
+        return None
+
+    @staticmethod
+    def _teardown_log():
+        """Called at the end of fit and test"""
+        return None
+
+    @staticmethod
+    def _on_init_start_log():
+        """Called when the trainer initialization begins, model has not yet been set."""
+        return None
+
+    @staticmethod
+    def _on_init_end_log():
+        """Called when the trainer initialization ends, model has not yet been set."""
+        return None
+
+    @staticmethod
+    def _on_fit_start_log():
+        """Called when fit begins."""
+        return None
+
+    @staticmethod
+    def _on_fit_end_log():
+        """Called when fit ends."""
+        return None
+
+    @staticmethod
+    def _on_sanity_check_start_log():
+        """Called when the validation sanity check starts."""
+        return None
+
+    @staticmethod
+    def _on_sanity_check_end_log():
+        """Called when the validation sanity check ends."""
+        return None
+
+    @staticmethod
+    def _on_train_epoch_start_log():
+        """Called when the epoch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_train_epoch_end_log():
+        """Called when the epoch ends."""
+        return {"on_step": [False], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_validation_epoch_start_log():
+        """Called when the epoch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_validation_epoch_end_log():
+        """Called when the epoch ends."""
+        return {"on_step": [False], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_test_epoch_start_log():
+        """Called when the epoch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_test_epoch_end_log():
+        """Called when the epoch ends."""
+        return {"on_step": [False], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_epoch_start_log():
+        """Called when the epoch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_epoch_end_log():
+        """Called when the epoch ends."""
+        return {"on_step": [False], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_train_start_log():
+        """Called when the train begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_train_end_log():
+        """Called when the train ends."""
+        return None
+
+    @staticmethod
+    def _on_pretrain_routine_start_log():
+        """Called when the pretrain routine begins."""
+        return None
+
+    @staticmethod
+    def _on_pretrain_routine_end_log():
+        """Called when the pretrain routine ends."""
+        return None
+
+    @staticmethod
+    def _on_batch_start_log():
+        """Called when the training batch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_batch_end_log():
+        """Called when the training batch ends."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_train_batch_start_log():
+        """Called when the training batch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_train_batch_end_log():
+        """Called when the training batch ends."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_validation_batch_start_log():
+        """Called when the validation batch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_validation_batch_end_log():
+        """Called when the validation batch ends."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_test_batch_start_log():
+        """Called when the test batch begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_test_batch_end_log():
+        """Called when the test batch ends."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_validation_start_log():
+        """Called when the validation loop begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_validation_end_log():
+        """Called when the validation loop ends."""
+        return {"on_step": [False], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_test_start_log():
+        """Called when the test begins."""
+        return {"on_step": [False, True], "on_epoch": [False, True]}
+
+    @staticmethod
+    def _on_test_end_log():
+        """Called when the test ends."""
+        return None
+
+    @staticmethod
+    def _on_keyboard_interrupt_log():
+        """Called when the training is interrupted by KeyboardInterrupt."""
+        return None
+
+    @staticmethod
+    def _on_save_checkpoint_log():
+        """Called when saving a model checkpoint."""
+        return None
+
+    @staticmethod
+    def _on_load_checkpoint_log():
+        """Called when loading a model checkpoint."""
+        return None
diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
new file mode 100644
index 0000000000000..2a9d68807e694
--- /dev/null
+++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py
@@ -0,0 +1,528 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from copy import deepcopy
+from enum import Enum
+from typing import Union, Tuple, Any, Mapping
+
+from pytorch_lightning.core.step_result import Result
+
+
+# used to map a boolean to the right LoggerStages values
+class FrozenDict(dict):
+    def __init__(self, *args, **kwargs):
+        self._hash = None
+        super(FrozenDict, self).__init__(*args, **kwargs)
+
+    def __hash__(self):
+        if self._hash is None:
+            self._hash = hash(tuple(sorted(self.items())))  # iteritems() on py2
+        return self._hash
+
+    def _immutable(self, *args, **kws):
+        raise TypeError('cannot change object - object is immutable')
+
+    __setitem__ = _immutable
+    __delitem__ = _immutable
+    pop = _immutable
+    popitem = _immutable
+    clear = _immutable
+    update = _immutable
+    setdefault = _immutable
+
+
+LOOKUP_TABLE = FrozenDict({"1": "test", "0": "validation", "True": "test", "False": "validation"})
+
+
+class LoggerStages(Enum):
+    TRAIN = "train"
+    VAL = "validation"
+    TEST = "test"
+
+
+class ResultStoreType(Enum):
+    INSIDE_BATCH_TRAIN_LOOP = "inside_batch_train_loop"
+    OUTSIDE_BATCH_TRAIN_LOOP = "outside_batch_train_loop"
+
+
+class HookResultStore:
+    """
+    This class is defined for internal usage.
+    It holds all metrics logged using the self.log function
+    in the scope of ModelHooks or Callback functions.
+
+    We need to differentiate between 3 different scenarios:
+        - (1): We are outside of a batch loop
+            * It means no dataloader_idx, no optimizer idx, etc.
+        - (2): We are inside the training batch loop
+            * We have an optimizer idx and split idx to track
+        - (3): We are inside the evaluation loop
+            * We have a dataloader_idx to track
+
+    The class stores `Result` objects for those 3 scenarios in `self._internals`.
+
+    (1): self._internals = {"dataloader_idx": [Result(), ..., Result()]}
+        * if dataloader_idx is not defined, it is set to 0 by default
+    (2): self._internals = {"dataloader_idx":
+                                {"optimizer_idx":
+                                    {"batch_idx":
+                                        [Result(), Result()]
+                                    }
+                                }
+                            }
+    (3): Same as (1) for simplicity
+
+    These data structures enable us to properly reduce Result objects when the batch loop is finished.
+ """ + def __init__(self, fx_name): + self._fx_name = fx_name + self._internals = {} + self._internals_reduced = {} + self._internal_type = None + self.has_reduced = False + + def get_reduced_metrics(self): + return self._internals_reduced + + def add_dataloader_idx(self): + return len(self._internals) > 1 + + @property + def num_dataloaders(self): + return len(self._internals) + + def get_latest_from_dict(self, dl_idx): + num_opt_idx = len(self._internals[dl_idx]) - 1 + assert num_opt_idx >= 0 + num_opt_idx = str(num_opt_idx) + num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1 + batch_indexes = [*self._internals[dl_idx][num_opt_idx].keys()] + # sort them by increasing order + batch_indexes.sort(key=float) + assert num_batch_idx >= 0 + return self._internals[dl_idx][num_opt_idx][batch_indexes[-1]][-1] + + def check_dataloader_idx(self, result: Result) -> bool: + add_dataloader_idx = False + try: + if len(result.keys()) > 1: + random_key = [*result.keys()][-1] + add_dataloader_idx = result["meta"][random_key]["dataloader_idx"] is not None + return add_dataloader_idx + return add_dataloader_idx + except Exception: + return add_dataloader_idx + + def get_lastest_from_func_name(self, func_name, *args, latest=True, **kwargs): + results = {} + if latest: + for dl_idx in range(self.num_dataloaders): + dl_idx = str(dl_idx) + if self._internal_type == ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP: + latest_result = self._internals[dl_idx][-1] + else: + latest_result = self.get_latest_from_dict(dl_idx) + add_dataloader_idx = self.check_dataloader_idx(latest_result) + func = getattr(latest_result, func_name) + results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) + return results + raise NotImplementedError + + def get_batch_pbar_metrics(self, latest=True, *args, **kwargs): + return self.get_lastest_from_func_name("get_batch_pbar_metrics", *args, latest=latest, **kwargs) + + def get_batch_log_metrics(self, latest=True, *args, **kwargs): + return self.get_lastest_from_func_name("get_batch_log_metrics", *args, latest=latest, **kwargs) + + def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> None: + if isinstance(opt_metric, Result): + func = getattr(opt_metric, func_name) + metrics_to_log = func( + *args, + add_dataloader_idx=self.add_dataloader_idx, + **kwargs) + results.update(metrics_to_log) + else: + raise Exception("The provided opt_metric should be a Result Object. 
Something is wrong") + + def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> Mapping: + results = {} + for dl_idx in range(self.num_dataloaders): + dl_idx = str(dl_idx) + opt_metrics = self._internals_reduced[dl_idx] + if isinstance(opt_metrics, defaultdict): + for opt_metric in opt_metrics.values(): + self.run_epoch_func(results, opt_metric, func_name, *args, **kwargs) + else: + self.run_epoch_func(results, opt_metrics, func_name, *args, **kwargs) + return results + + def get_epoch_pbar_metrics(self, *args, **kwargs) -> Mapping: + return self.get_epoch_from_func_name("get_epoch_pbar_metrics") + + def get_epoch_log_metrics(self, *args, **kwargs) -> Mapping: + return self.get_epoch_from_func_name("get_epoch_log_metrics") + + def get_forked_metrics(self, *args, **kwargs) -> Mapping: + return self.get_epoch_from_func_name("get_forked_metrics") + + @staticmethod + def _append_to_structure(primary_dict, opt_idx, batch_idx, result) -> None: + if opt_idx not in primary_dict: + primary_dict[opt_idx] = {} + + if batch_idx not in primary_dict[opt_idx]: + primary_dict[opt_idx][batch_idx] = [] + + primary_dict[opt_idx][batch_idx].append(result) + + def append(self, result, dataloader_idx=None, extra_info: dict = {}) -> None: + + assert isinstance(result, Result) + + if dataloader_idx is None: + dataloader_idx = 0 + + primary_key = f"{dataloader_idx}" + + # [dataloader_idx][optimizer_idx][training_step_idx] is a list + if len(extra_info) > 0: + self._internal_type = ResultStoreType.INSIDE_BATCH_TRAIN_LOOP + # initialize dictionary + if primary_key not in self._internals: + self._internals[primary_key] = {} + self._internals_reduced[primary_key] = defaultdict(dict) + + # extract infos + opt_idx = str(extra_info["opt_idx"]) + batch_idx = str(extra_info["batch_idx"]) + + self._append_to_structure(self._internals[primary_key], opt_idx, batch_idx, result) + + # [dataloader_idx] is a list + else: + self._internal_type = ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP + if primary_key not in self._internals: + self._internals[primary_key] = [] + self._internals[primary_key].append(result) + + def auto_reduce_results_on_epoch_end(self) -> None: + """ + This function is called to reduce `self._internals` Result object. + The reduced Result object will be saved into `self._internals_reduced` + The `self._internals` stored Result objects will be deleted to save memory. 
+ """ + if not self.has_reduced: + epoch_log_metrics = {} + epoch_progress_bar_metrics = {} + + for dl_idx in range(self.num_dataloaders): + dl_idx = str(dl_idx) + epoch_metrics = self._internals[dl_idx] + + if self._internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: + + num_opt_idx = len(self._internals[dl_idx]) - 1 + + # Make sure we didn't create key + assert num_opt_idx >= 0 + + for opt_idx in range(num_opt_idx + 1): + opt_idx = str(opt_idx) + # TODO: Figure out to reduce memory + # TODO: How to start training in middle of epoch + opt_outputs = epoch_metrics[opt_idx] + + num_batch_idx = len(self._internals[dl_idx][str(num_opt_idx)]) - 1 + assert num_batch_idx >= 0 + batch_indexes = self._internals[dl_idx][str(num_opt_idx)].keys() + + # reduce across time first + time_reduced_outputs = [] + for batch_idx in batch_indexes: + batch_idx = str(batch_idx) + tbptt_outs = opt_outputs[str(batch_idx)] + tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) + if len(tbptt_outs) > 1: + time_reduced_outputs.append(tbptt_outs) + + if len(time_reduced_outputs) == 0: + continue + + # reduce across training steps + opt_outputs = time_reduced_outputs[0].__class__.reduce_on_epoch_end(time_reduced_outputs) + + # with manual opt need 1 + metrics because meta is always there + if opt_outputs.minimize is not None: + opt_outputs.minimize = opt_outputs.minimize.mean() + + self._internals_reduced[dl_idx][str(opt_idx)] = opt_outputs + + # free memory + del self._internals[dl_idx] + else: + # no need to reduce as called only once + if len(epoch_metrics) == 1: + reduced_epoch_metrics = epoch_metrics[0] + else: + reduced_epoch_metrics = epoch_metrics[0].__class__.reduce_on_epoch_end(epoch_metrics) + + self._internals_reduced[dl_idx] = reduced_epoch_metrics + + # free memory + del self._internals[dl_idx] + + self.has_reduced = True + + def __getitem__(self, key: str) -> Any: + try: + if key in self._internals: + return self._internals[key] + return self[key] + except KeyError: + return None + + def __repr__(self): + return self._internals.__repr__() + + +class EpochResultStore: + """ + This class is defined for internal usage. + + It holds all metrics logged using the self.log function using `HookResultStore` object. + + The internal datastructure is as follow: + + self._internals = {"fx_name_0": HookResultStore(), ..., "fx_name_n": HookResultStore()} + + Pseudo Code Example: + ``` + model._current_fx_name = 'something' + model._results = Result() + model.log('a', ...) 
+    epoch_result_store.cache_result()
+    ```
+
+    """
+    def __init__(self, trainer, stage):
+        self.trainer = trainer
+        self._stage = stage
+        self.reset()
+
+    def __getitem__(self, key: str) -> Any:
+        try:
+            if key in self._internals:
+                return self._internals[key]
+            return None
+        except KeyError:
+            return None
+
+    @property
+    def has_split_and_opt_idx(self):
+        """
+        This function informs whether we are running within the training batch loop
+        """
+        if self._split_idx is not None and self._opt_idx is not None:
+            return True
+        return False
+
+    @property
+    def extra_info(self):
+        """
+        This function provides the necessary parameters to properly configure a HookResultStore object
+        """
+        return {"batch_idx": self.trainer.batch_idx,
+                "split_idx": self._split_idx,
+                "opt_idx": self._opt_idx}
+
+    def reset_model(self):
+        """
+        This function is used to reset model state at the end of the capture
+        """
+        model_ref = self.trainer.get_model()
+        model_ref._results = Result()
+        model_ref._current_hook_fx_name = None
+        model_ref._current_fx_name = ''
+
+    def current_model_info(self):
+        """
+        This function is used to extract
+        information related to the current function scoping the `self.log` call.
+        """
+        model_ref = self.trainer.get_model()
+        # extract hook information
+        fx_name = model_ref._current_hook_fx_name
+        if fx_name == '':
+            fx_name = model_ref._current_fx_name
+        dataloader_idx = model_ref._current_dataloader_idx
+        return fx_name, dataloader_idx
+
+    def cache_result(self) -> None:
+        """
+        This function is called after every hook
+        and stores the result object
+        """
+        model_ref = self.trainer.get_model()
+
+        # extract hook results
+        hook_result = model_ref._results
+
+        # extract model information
+        fx_name, dataloader_idx = self.current_model_info()
+
+        # add only if anything has been logged
+        # default len is 1 due to _internals
+        if len(hook_result) > 1:
+
+            if fx_name not in self._internals:
+                self._internals[fx_name] = HookResultStore(fx_name)
+
+            extra_info = {}
+            if self.has_split_and_opt_idx:
+                extra_info = self.extra_info
+
+            # attach the captured batch_size
+            Result.attach_batch_size(self._batch_size, hook_result)
+
+            self._internals[fx_name].append(
+                deepcopy(hook_result),
+                dataloader_idx=dataloader_idx,
+                extra_info=extra_info)
+
+            # update logged_metrics, progress_bar_metrics, callback_metrics
+            self.update_logger_connector(fx_name)
+
+        # reset _results, fx_name
+        self.reset_model()
+
+    def update_logger_connector(self, fx_name: str = None) -> None:
+        """
+        This function is called every time we capture a hook.
+        It automatically updates the following logger_connector attributes:
+            - progress_bar_metrics with pbar_metrics
+            - logged_metrics with log_metrics
+            - callback_metrics with progress_bar_metrics + logged_metrics
+        """
+
+        logger_connector = self.trainer.logger_connector
+
+        callback_metrics = {}
+
+        if not self._has_batch_loop_finished:
+            # get pbar
+            batch_pbar_metrics = self.get_latest_batch_pbar_metrics()
+            logger_connector.add_progress_bar_metrics(batch_pbar_metrics)
+
+            if self._stage in LoggerStages.TRAIN.value:
+                # batch-level log metrics are only tracked while training; evaluation and test log at epoch end
+ batch_log_metrics = self.get_latest_batch_log_metrics() + logger_connector.logged_metrics.update(batch_log_metrics) + + callback_metrics.update(batch_pbar_metrics) + callback_metrics.update(batch_log_metrics) + else: + epoch_dict = {"epoch": self.trainer.current_epoch} + + # get pbar + epoch_pbar_metrics = self.get_epoch_pbar_metrics() + logger_connector.add_progress_bar_metrics(epoch_pbar_metrics) + + # get logged_metrics + epoch_log_metrics = self.get_epoch_log_metrics() + logger_connector.logged_metrics.update(epoch_log_metrics) + logger_connector.logged_metrics.update(epoch_dict) + + # get forked_metrics + forked_metrics = self.get_forked_metrics() + + callback_metrics.update(epoch_pbar_metrics) + callback_metrics.update(epoch_log_metrics) + callback_metrics.update(forked_metrics) + + # update callback_metrics + logger_connector.callback_metrics.update(callback_metrics) + logger_connector.callback_metrics.pop("epoch", None) + + def run_batch_from_func_name(self, func_name) -> Mapping: + results = {} + for fx_name, hook_result in self._internals.items(): + func = getattr(hook_result, func_name) + results.update(func(latest=True, include_forked_originals=False)) + return results + + def get_latest_batch_log_metrics(self) -> Mapping: + return self.run_batch_from_func_name("get_batch_log_metrics") + + def get_latest_batch_pbar_metrics(self) -> Mapping: + return self.run_batch_from_func_name("get_batch_pbar_metrics") + + @property + def has_reduced(self) -> bool: + hook_results = self._internals.values() + return len(hook_results) == sum([h.has_reduced for h in hook_results]) + + def auto_reduce_results_on_epoch_end(self) -> None: + if not self.has_reduced: + for fx_name, hook_result in self._internals.items(): + hook_result.auto_reduce_results_on_epoch_end() + + @property + def has_batch_loop_finished(self) -> bool: + return self._has_batch_loop_finished + + @has_batch_loop_finished.setter + def has_batch_loop_finished(self, has_batch_loop_finished): + if has_batch_loop_finished: + # If batch loop has finished, reduce metrics + self.auto_reduce_results_on_epoch_end() + + # batch_size should be none as we finished batch loop + self._batch_size = None + + self._has_batch_loop_finished = has_batch_loop_finished + self.update_logger_connector() + + def run_epoch_by_func_name(self, func_name) -> Mapping: + if not self.has_reduced: + self.auto_reduce_results_on_epoch_end() + results = {} + for fx_name, hook_result in self._internals.items(): + func = getattr(hook_result, func_name) + results.update(func()) + return results + + def get_epoch_pbar_metrics(self) -> Mapping: + return self.run_epoch_by_func_name("get_epoch_pbar_metrics") + + def get_epoch_log_metrics(self) -> Mapping: + return self.run_epoch_by_func_name("get_epoch_log_metrics") + + def get_forked_metrics(self) -> Mapping: + return self.run_epoch_by_func_name("get_forked_metrics") + + def get_reduced_metrics(self) -> Mapping: + return self.run_epoch_by_func_name("get_reduced_metrics") + + def reset(self): + self._internals = {} + self._dataloader_idx: Union[int, None] = None + self._split_idx: Union[int, None] = None + self._opt_idx: Union[int, None] = None + self._batch_size: Union[int, None] = None + self._has_batch_loop_finished = False + + def __repr__(self): + return f"{self.__class__.__name__}(stage={self._stage}, internals={self._internals})" diff --git a/pytorch_lightning/trainer/connectors/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py similarity index 84% rename from 
pytorch_lightning/trainer/connectors/logger_connector.py rename to pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 893eab5a16a3d..5c699ecffa464 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from pprint import pprint +from typing import Iterable, Union, cast +from copy import deepcopy +from collections import ChainMap import torch from pytorch_lightning.core import memory from pytorch_lightning.loggers import TensorBoardLogger, LoggerCollection @@ -19,10 +23,12 @@ from pytorch_lightning.utilities.model_utils import is_overridden from pytorch_lightning.core.step_result import EvalResult, Result from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pprint import pprint -from typing import Iterable -from copy import deepcopy -from collections import ChainMap +from pytorch_lightning.trainer.connectors.logger_connector.callback_hook_validator import CallbackHookNameValidator +from pytorch_lightning.trainer.connectors.logger_connector.epoch_result_store import ( + EpochResultStore, + LoggerStages, + LOOKUP_TABLE +) class LoggerConnector: @@ -33,6 +39,70 @@ def __init__(self, trainer): self.logged_metrics = {} self.progress_bar_metrics = {} self.eval_loop_results = [] + self._stages = sorted([s.value for s in LoggerStages]) + self._cached_results = {stage: EpochResultStore(trainer, stage) for stage in self._stages} + self._callback_hook_validator = CallbackHookNameValidator() + self._current_stage = None + + def cached_results(self, stage_or_testing: Union[str, bool]) -> Union[EpochResultStore, None]: + """ Function to access cached_results using str or bool. 
Bool is used only for testing"""
+        stage_or_testing = str(stage_or_testing)
+        stages = self._stages
+        if stage_or_testing in stages:
+            return self._cached_results[stage_or_testing]
+        if stage_or_testing in LOOKUP_TABLE:
+            # Access using trainer.testing
+            stage = LOOKUP_TABLE[stage_or_testing]
+            return self._cached_results[stage]
+        raise MisconfigurationException(
+            f"Provided stage_or_testing {stage_or_testing} doesn't belong to either {stages}"
+            f" or {LOOKUP_TABLE.keys()}"
+        )
+
+    def set_stage(self, stage_or_testing: str, reset: bool = False) -> None:
+        self._current_stage = self._determine_stage(stage_or_testing)
+        if reset:
+            self.cached_results(stage_or_testing).reset()
+
+    def check_logging_in_callbacks(self, hook_fx_name, on_step: bool = None, on_epoch: bool = None) -> None:
+        self._callback_hook_validator.check_logging_in_callbacks(current_hook_fx_name=hook_fx_name,
+                                                                 on_step=on_step,
+                                                                 on_epoch=on_epoch)
+
+    def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders):
+        # reset the result of the PL module
+        model = self.trainer.get_model()
+        model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None
+
+        # track batch_size
+        self.cached_results(testing)._batch_size = Result.extract_batch_size(batch)
+
+    def on_batch_start(self, split_idx: int, opt_idx: int, split_batch) -> None:
+        self._cached_results["train"]._split_idx = split_idx
+        self._cached_results["train"]._opt_idx = opt_idx
+        self._cached_results["train"]._batch_size = Result.extract_batch_size(split_batch)
+
+    def on_train_batch_end(self) -> None:
+        self._cached_results["train"]._split_idx = None
+        self._cached_results["train"]._opt_idx = None
+        self._cached_results["train"]._batch_size = None
+
+    def _determine_stage(self, stage_or_testing: Union[str, bool]) -> str:
+        stage_or_testing = str(stage_or_testing)
+        stages = self._stages
+        if stage_or_testing in stages:
+            return stage_or_testing
+        if stage_or_testing in LOOKUP_TABLE:
+            # Access using trainer.testing
+            return LOOKUP_TABLE[stage_or_testing]
+        raise MisconfigurationException(
+            f"Provided stage_or_testing {stage_or_testing} doesn't belong to either {stages}"
+            f" or {LOOKUP_TABLE.keys()}"
+        )
+
+    def cache_logged_metrics(self) -> None:
+        if self._current_stage is not None:
+            self._cached_results[self._current_stage].cache_result()
 
     def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
         # logging
@@ -179,12 +249,15 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs):
             continue
         reduced_epoch_metrics = dl_metrics[0].__class__.reduce_on_epoch_end(dl_metrics)
-        # make the keys 'k/dl'
-        reduced_epoch_metrics = self.__rename_keys_by_dataloader_idx(reduced_epoch_metrics, dl_idx, num_loaders)
 
         # track the metrics
         logger_metrics = reduced_epoch_metrics.get_epoch_log_metrics()
         pbar_metrics = reduced_epoch_metrics.get_epoch_pbar_metrics()
+
+        # make the keys 'k/dl'
+        logger_metrics = self.__rename_keys_by_dataloader_idx(logger_metrics, dl_idx, num_loaders)
+        pbar_metrics = self.__rename_keys_by_dataloader_idx(pbar_metrics, dl_idx, num_loaders)
+
        self.logged_metrics.update(logger_metrics)
        self.add_progress_bar_metrics(pbar_metrics)
@@ -229,6 +302,7 @@ def _track_callback_metrics(self, eval_results, using_eval_result):
         else:
             self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics)
     else:
+        flat = {}
         if isinstance(eval_results, list):
             for eval_result in eval_results:
                 # with a scalar return, auto set it to "val_loss" for callbacks
@@ -451,8 
+525,7 @@ def __auto_reduce_results_on_epoch_end(self, epoch_output): for opt_outputs in epoch_output: # reduce across time first time_reduced_outputs = [] - for train_step_idx in range(len(opt_outputs)): - tbptt_outs = opt_outputs[train_step_idx] + for tbptt_outs in opt_outputs: tbptt_outs = tbptt_outs[0].__class__.reduce_across_time(tbptt_outs) if len(tbptt_outs) > 1: time_reduced_outputs.append(tbptt_outs) @@ -482,8 +555,7 @@ def __prepare_epoch_end_inputs(self, epoch_output): for opt_outputs in epoch_output: # gather across time first time_gathered_outputs = [] - for train_step_idx in range(len(opt_outputs)): - tbptt_outs = opt_outputs[train_step_idx] + for tbptt_outs in opt_outputs: result = [] for x in tbptt_outs: out = x.extra @@ -511,8 +583,7 @@ def __gather_result_across_time_and_optimizers(self, epoch_output): for opt_outputs in epoch_output: # gather across time first time_gathered_outputs = [] - for train_step_idx in range(len(opt_outputs)): - tbptt_outs = opt_outputs[train_step_idx] + for tbptt_outs in opt_outputs: tbptt_outs = tbptt_outs[0].__class__.gather(tbptt_outs) time_gathered_outputs.append(tbptt_outs) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index ffc72f8f0022e..89a242dbfd886 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -29,6 +29,7 @@ def __init__(self, trainer): self.predictions = None self.max_batches = None self.warning_cache = WarningCache() + self.num_dataloaders = None def on_trainer_init(self): self.trainer.num_val_batches = [] @@ -108,6 +109,9 @@ def on_evaluation_end(self, *args, **kwargs): else: self.trainer.call_hook('on_validation_end', *args, **kwargs) + # reset stage to train + self.trainer.logger_connector.set_stage("train") + def reload_evaluation_dataloaders(self): model = self.trainer.get_model() if self.testing: @@ -133,6 +137,7 @@ def setup(self, model, max_batches, dataloaders): max_batches = [max_batches] * len(dataloaders) self.max_batches = max_batches + self.num_dataloaders = self._get_num_dataloaders(dataloaders) def on_evaluation_epoch_start(self, *args, **kwargs): if self.testing: @@ -293,16 +298,20 @@ def __auto_reduce_result_objs(self, outputs): return eval_results - def on_evaluation_batch_start(self, *args, **kwargs): + def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx): # reset the result of the PL module model = self.trainer.get_model() model._results = Result() model._current_fx_name = 'evaluation_step' + # set dataloader_idx and track batch_size + self.trainer.logger_connector.on_evaluation_batch_start( + self.testing, batch, dataloader_idx, self.num_dataloaders) + if self.testing: - self.trainer.call_hook('on_test_batch_start', *args, **kwargs) + self.trainer.call_hook('on_test_batch_start', batch, batch_idx, dataloader_idx) else: - self.trainer.call_hook('on_validation_batch_start', *args, **kwargs) + self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx) def on_evaluation_batch_end(self, *args, **kwargs): if self.testing: diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py index b585647fb5a0e..ae4d280d54649 100644 --- a/pytorch_lightning/trainer/logging.py +++ b/pytorch_lightning/trainer/logging.py @@ -14,7 +14,7 @@ from abc import ABC import inspect -from typing import Union, Iterable +from typing import Union, Iterable, Mapping import torch @@ -92,7 +92,7 @@ def process_dict_result(self, output, train=False): # 
--------------- # all keys not progress_bar or log are candidates for callbacks callback_metrics = {} - if output: + if isinstance(output, Mapping): for k, v in output.items(): if k not in ['progress_bar', 'log', 'hiddens']: callback_metrics[k] = v @@ -156,7 +156,7 @@ def process_dict_result(self, output, train=False): # --------------- # EXTRACT HIDDEN # --------------- - hiddens = output.get('hiddens') if output else None + hiddens = output.get('hiddens', None) if isinstance(output, Mapping) else None # use every metric passed in as a candidate for callback callback_metrics.update(progress_bar_metrics) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 1f6b97e7892f9..d3cc2f2e7278f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -22,7 +22,8 @@ from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.core.step_result import EvalResult +from pytorch_lightning.core.memory import ModelSummary +from pytorch_lightning.core.step_result import Result, EvalResult from pytorch_lightning.loggers import LightningLoggerBase from pytorch_lightning.profiler import BaseProfiler from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin @@ -465,6 +466,9 @@ def fit( def train(self): self.run_sanity_check(self.get_model()) + # set stage for logging + self.logger_connector.set_stage("train") + self.checkpoint_connector.has_trained = False # enable train mode @@ -528,16 +532,25 @@ def train(self): self.train_loop.on_train_end() def run_evaluation(self, test_mode: bool = False, max_batches=None): + + # used to know if we are logging for val, test + reset cached results + self.logger_connector.set_stage(test_mode, reset=True) + # bookkeeping self.evaluation_loop.testing = test_mode + + # prepare dataloaders dataloaders, max_batches = self.evaluation_loop.get_evaluation_dataloaders(max_batches) + + # check if we want to skip this evaluation if self.evaluation_loop.should_skip_evaluation(dataloaders, max_batches): return [], [] - # enable eval mode + no grads + # ref model model = self.get_model() - self.evaluation_loop.on_evaluation_model_eval() + # enable eval mode + no grads + self.evaluation_loop.on_evaluation_model_eval() model.zero_grad() torch.set_grad_enabled(False) @@ -701,6 +714,8 @@ def test( # -------------------- self.verbose_test = verbose + self.logger_connector.set_stage("test") + # If you supply a datamodule you can't supply train_dataloader or val_dataloaders if test_dataloaders and datamodule: raise MisconfigurationException( diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0d269c333b269..10bab2843b7eb 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -656,20 +656,12 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): splits = self.tbptt_split_batch(batch) for split_idx, split_batch in enumerate(splits): - self.trainer.split_idx = split_idx - # in manual optimization we loop over all optimizers at once - optimizers = self.get_optimizers_iterable() - if not self.automatic_optimization: - optimizers = [optimizers[0]] + # create an iterable for optimizers and loop over them + for opt_idx, optimizer in self.prepare_optimizers(): - # loop over optimizers - for opt_idx, optimizer in optimizers: - # make 
sure only the gradients of the current optimizer's parameters are calculated
-                # in the training step to prevent dangling gradients in multiple-optimizer setup.
-                if self.automatic_optimization and len(self.trainer.optimizers) > 1:
-                    model = self.trainer.get_model()
-                    model.toggle_optimizer(optimizer, opt_idx)
+                # toggle model params + set info to logger_connector
+                self.run_train_split_start(split_idx, split_batch, opt_idx, optimizer)
 
                 if self.should_accumulate():
                     # For gradient accumulation
@@ -724,6 +716,7 @@ def train_step_and_backward_closure():
                         opt_idx=opt_idx,
                     )
 
+                    # TODO: properly aggregate grad_norm across opt_idx and split_idx
                     grad_norm_dic = self._cur_grad_norm_dict
                     self._cur_grad_norm_dict = None
@@ -733,14 +726,8 @@ def train_step_and_backward_closure():
                     # clear gradients
                     self.optimizer_zero_grad(batch_idx, optimizer, opt_idx)
 
-                    accumulated_loss = self.accumulated_loss.mean()
-
-                    if accumulated_loss is not None:
-                        # calculate running loss for display
-                        self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)
-
-                        # reset for next set of accumulated grads
-                        self.accumulated_loss.reset()
+                    # update running loss + reset accumulated loss
+                    self.update_running_loss()
 
         # collapse all metrics into one dict
         batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()}
@@ -941,3 +928,33 @@ def process_train_step_outputs(self, all_train_step_outputs, early_stopping_accu
             epoch_end_outputs.append(optimizer_idx_outputs)
 
         return epoch_end_outputs
+
+    def prepare_optimizers(self):
+        # in manual optimization we loop over all optimizers at once
+        optimizers = self.get_optimizers_iterable()
+        if not self.automatic_optimization:
+            optimizers = [optimizers[0]]
+        return optimizers
+
+    def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer):
+        # set split_idx to trainer for tracking
+        self.trainer.split_idx = split_idx
+
+        # make sure only the gradients of the current optimizer's parameters are calculated
+        # in the training step to prevent dangling gradients in multiple-optimizer setup.
+        if self.automatic_optimization and len(self.trainer.optimizers) > 1:
+            model = self.trainer.get_model()
+            model.toggle_optimizer(optimizer, opt_idx)
+
+        # used to track metrics internally
+        self.trainer.logger_connector.on_batch_start(split_idx, opt_idx, split_batch)
+
+    def update_running_loss(self):
+        accumulated_loss = self.accumulated_loss.mean()
+
+        if accumulated_loss is not None:
+            # calculate running loss for display
+            self.running_loss.append(self.accumulated_loss.mean() * self.trainer.accumulate_grad_batches)
+
+            # reset for next set of accumulated grads
+            self.accumulated_loss.reset()
diff --git a/tests/trainer/logging/test_logger_connector.py b/tests/trainer/logging/test_logger_connector.py
new file mode 100644
index 0000000000000..0f27f2ca4fef4
--- /dev/null
+++ b/tests/trainer/logging/test_logger_connector.py
@@ -0,0 +1,249 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
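The extracted `update_running_loss` above keeps the progress-bar loss meaningful under gradient accumulation: `accumulated_loss` holds one loss per accumulation step, and the mean is multiplied back by `accumulate_grad_batches` before being appended to the running window. A small sketch with made-up numbers; the values, and the assumption that each stored loss was already divided by `accumulate_grad_batches` elsewhere in the loop, are illustrative rather than taken from this patch:

```
import torch

accumulate_grad_batches = 4
# per-step losses, assumed already divided by accumulate_grad_batches
accumulated_loss = torch.tensor([0.9, 0.8, 0.7, 0.6])

# what update_running_loss() appends to the running-loss window
running_value = accumulated_loss.mean() * accumulate_grad_batches
print(running_value)  # tensor(3.)
```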
+""" +Tests to ensure that the training loop works with a dict (1.0) +""" +import os +import torch +import pytest + +from pytorch_lightning.trainer import Trainer +from pytorch_lightning.core.step_result import Result +from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector +from tests.base.boring_model import BoringModel, RandomDataset + + +class Helper: + def decorator_with_arguments(fx_name='', hook_fx_name=''): + def decorator(func): + def wrapper(self, *args, **kwargs): + # Set information + self._current_fx_name = fx_name + self._current_hook_fx_name = hook_fx_name + self._results = Result() + + result = func(self, *args, **kwargs) + + # cache metrics + self.trainer.logger_connector.cache_logged_metrics() + return result + return wrapper + + return decorator + + +def test__logger_connector__epoch_result_store__train(tmpdir): + """ + Tests that LoggerConnector will properly capture logged information + and reduce them + """ + + os.environ['PL_DEV_DEBUG'] = '1' + + class TestModel(BoringModel): + + train_losses = [] + + @Helper.decorator_with_arguments(fx_name="training_step") + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + + self.train_losses.append(loss) + + self.log("train_loss", loss, on_step=True, on_epoch=True) + return {"loss": loss} + + def val_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)), + torch.utils.data.DataLoader(RandomDataset(32, 64))] + + model = TestModel() + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=4, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + assert len(trainer.logger_connector.cached_results("train")['training_step']['0']['0']) == 2 + assert trainer.logger_connector.cached_results("train")['training_step']['0']['0']['0'][0]["train_loss"] == model.train_losses[0] + assert trainer.logger_connector.cached_results("train")['training_step']['0']['0']['1'][0]["train_loss"] == model.train_losses[1] + + # assert reduction didn't happen yet + assert trainer.logger_connector.cached_results("train").has_reduced is False + + # Launch reduction + trainer.logger_connector.cached_results("train").has_batch_loop_finished = True + + # assert reduction did happen + assert trainer.logger_connector.cached_results("train").has_reduced is True + + assert trainer.logger_connector.cached_results("train")["training_step"]\ + ._internals_reduced["0"]["0"]['train_loss_epoch'].item() == torch.stack(model.train_losses).mean().item() + + +def test__logger_connector__epoch_result_store__train__ttbt(tmpdir): + """ + Tests that LoggerConnector will properly capture logged information with ttbt + and reduce them + """ + truncated_bptt_steps = 2 + sequence_size = 30 + batch_size = 30 + + x_seq = torch.rand(batch_size, sequence_size, 1) + y_seq_list = torch.rand(batch_size, sequence_size, 1).tolist() + + class MockSeq2SeqDataset(torch.utils.data.Dataset): + def __getitem__(self, i): + return x_seq, y_seq_list + + def __len__(self): + return 1 + + class TestModel(BoringModel): + + train_losses = [] + + def __init__(self): + super().__init__() + self.test_hidden = None + self.layer = torch.nn.Linear(2, 2) + + @Helper.decorator_with_arguments(fx_name="training_step") + def training_step(self, batch, batch_idx, hiddens): + try: + assert hiddens == self.test_hidden, "Hidden state not persistent between tbptt steps" + except Exception as e: + print(e) + + 
self.test_hidden = torch.rand(1) + + x_tensor, y_list = batch + assert x_tensor.shape[1] == truncated_bptt_steps, "tbptt split Tensor failed" + + y_tensor = torch.tensor(y_list, dtype=x_tensor.dtype) + assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" + + pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) + loss = torch.nn.functional.mse_loss( + pred, y_tensor.view(batch_size, truncated_bptt_steps)) + + self.train_losses.append(loss) + + self.log('a', loss, on_epoch=True) + + return {'loss': loss, 'hiddens': self.test_hidden} + + def on_train_epoch_start(self) -> None: + self.test_hidden = None + + def train_dataloader(self): + return torch.utils.data.DataLoader( + dataset=MockSeq2SeqDataset(), + batch_size=batch_size, + shuffle=False, + sampler=None, + ) + + model = TestModel() + model.training_epoch_end = None + model.example_input_array = torch.randn(5, truncated_bptt_steps) + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=10, + limit_val_batches=0, + truncated_bptt_steps=truncated_bptt_steps, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.fit(model) + + assert len(trainer.logger_connector.cached_results("train")['training_step']['0']['0']['0']) == len(model.train_losses) + + # assert reduction didn't happen yet + assert trainer.logger_connector.cached_results("train").has_reduced is False + + # Launch reduction + trainer.logger_connector.cached_results("train").has_batch_loop_finished = True + + # assert reduction did happen + assert trainer.logger_connector.cached_results("train").has_reduced is True + + assert trainer.logger_connector.cached_results("train")['training_step']\ + ._internals_reduced['0']['0']["a_epoch"].item() == torch.stack(model.train_losses).mean().item() + + +@pytest.mark.parametrize('num_dataloaders', [1, 2]) +def test__logger_connector__epoch_result_store__test_multi_dataloaders(tmpdir, num_dataloaders): + """ + Tests that LoggerConnector will properly capture logged information in multi_dataloaders scenario + """ + + os.environ['PL_DEV_DEBUG'] = '1' + + class TestModel(BoringModel): + + test_losses = {} + + @Helper.decorator_with_arguments(fx_name="test_step") + def test_step(self, batch, batch_idx, dataloader_idx=0): + output = self.layer(batch) + loss = self.loss(batch, output) + + primary_key = str(dataloader_idx) + if primary_key not in self.test_losses: + self.test_losses[primary_key] = [] + + self.test_losses[primary_key].append(loss) + + self.log("test_loss", loss, on_step=True, on_epoch=True) + return {"test_loss": loss} + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] + + model = TestModel() + model.val_dataloader = None + model.test_epoch_end = None + + limit_test_batches = 4 + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0, + limit_val_batches=0, + limit_test_batches=limit_test_batches, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + trainer.test(model) + + assert len(trainer.logger_connector.cached_results("test")["test_step"]._internals) == num_dataloaders + for dl_idx in range(num_dataloaders): + assert len(trainer.logger_connector.cached_results("test")["test_step"]._internals[str(dl_idx)]) == limit_test_batches + trainer.logger_connector.cached_results("test").has_batch_loop_finished = True + for dl_idx in range(num_dataloaders): + expected = torch.stack(model.test_losses[str(dl_idx)]).mean() + generated = 
trainer.logger_connector.cached_results("test")["test_step"]._internals_reduced[str(dl_idx)]["test_loss_epoch"] + assert abs(expected.item() - generated.item()) < 1e-6 From 01ab2a933d1c3d77c13e52754a692011a45b3c3d Mon Sep 17 00:00:00 2001 From: Ananya Harsh Jha Date: Mon, 2 Nov 2020 17:13:34 -0500 Subject: [PATCH 40/88] [bug] [docs] Clearer optimizer_step override instructions (#4455) * fix * flags * remove defaults --- pytorch_lightning/accelerators/accelerator.py | 11 +++---- .../accelerators/tpu_accelerator.py | 10 ++++--- pytorch_lightning/core/lightning.py | 30 +++++++++++-------- 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 8ece6c4ec1b10..e69addf234a36 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -113,11 +113,12 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): # model hook model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, - optimizer, - opt_idx, - lambda_closure, + epoch=self.trainer.current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, + on_tpu=False, # TPUAccelerator class sets this as True using_native_amp=native_amp, using_lbfgs=is_lbfgs ) diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 1988f83601b8c..b60cd5a9dfc9c 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -242,11 +242,13 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): # model hook model_ref.optimizer_step( - self.trainer.current_epoch, - batch_idx, optimizer, - opt_idx, - lambda_closure, + epoch=self.trainer.current_epoch, + batch_idx=batch_idx, + optimizer=optimizer, + optimizer_idx=opt_idx, + optimizer_closure=lambda_closure, on_tpu=True, + using_native_amp=False, using_lbfgs=is_lbfgs ) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3f2ca7ce27bfc..f185626646803 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1143,10 +1143,10 @@ def optimizer_step( batch_idx: int, optimizer: Optimizer, optimizer_idx: int, - optimizer_closure: Optional[Callable] = None, - on_tpu: bool = False, - using_native_amp: bool = False, - using_lbfgs: bool = False, + optimizer_closure: Optional[Callable], + on_tpu: bool, + using_native_amp: bool, + using_lbfgs: bool, ) -> None: r""" Override this method to adjust the default way the @@ -1154,6 +1154,12 @@ def optimizer_step( By default, Lightning calls ``step()`` and ``zero_grad()`` as shown in the example once per optimizer. + Warning: + If you are overriding this method, make sure that you pass the ``optimizer_closure`` parameter + to ``optimizer.step()`` function as shown in the examples. This ensures that + ``train_step_and_backward_closure`` is called within + :meth:`~pytorch_lightning.trainer.training_loop.TrainLoop.run_training_batch`. + Args: epoch: Current epoch batch_idx: Index of current batch @@ -1168,23 +1174,23 @@ def optimizer_step( .. 
code-block:: python # DEFAULT - def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): - optimizer.step() + optimizer.step(closure=optimizer_closure) # Alternating schedule for optimizer steps (i.e.: GANs) - def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): # update generator opt every 2 steps if optimizer_idx == 0: if batch_idx % 2 == 0 : - optimizer.step() + optimizer.step(closure=optimizer_closure) optimizer.zero_grad() # update discriminator opt every 4 steps if optimizer_idx == 1: if batch_idx % 4 == 0 : - optimizer.step() + optimizer.step(closure=optimizer_closure) optimizer.zero_grad() # ... @@ -1197,8 +1203,8 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, optimizer_idx, .. code-block:: python # learning rate warm-up - def optimizer_step(self, current_epoch, batch_idx, optimizer, - optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, + optimizer_closure, on_tpu, using_native_amp, using_lbfgs): # warm up lr if self.trainer.global_step < 500: lr_scale = min(1., float(self.trainer.global_step + 1) / 500.) @@ -1206,7 +1212,7 @@ def optimizer_step(self, current_epoch, batch_idx, optimizer, pg['lr'] = lr_scale * self.learning_rate # update params - optimizer.step() + optimizer.step(closure=optimizer_closure) optimizer.zero_grad() Note: From 958aa1aee7c180703b5f9caae2ff4a0ca24e8a56 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 2 Nov 2020 23:44:11 +0000 Subject: [PATCH 41/88] [test] Accumulated gradient optimization tests (#4477) * adding tests * wip * update * Update tests/trainer/test_trainer.py Co-authored-by: Sean Naren Co-authored-by: Sean Naren --- pytorch_lightning/trainer/supporters.py | 2 +- pytorch_lightning/trainer/training_loop.py | 1 + pytorch_lightning/utilities/debugging.py | 1 - tests/loggers/test_tensorboard.py | 4 +-- tests/trainer/test_trainer.py | 29 +++++++++++----------- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pytorch_lightning/trainer/supporters.py b/pytorch_lightning/trainer/supporters.py index ee98e3614da67..84c7982700df0 100644 --- a/pytorch_lightning/trainer/supporters.py +++ b/pytorch_lightning/trainer/supporters.py @@ -43,7 +43,7 @@ class TensorRunningAccum(object): def __init__(self, window_length: int): self.window_length = window_length - self.memory = torch.Tensor(self.window_length) + self.memory = torch.zeros(self.window_length) self.current_idx: int = 0 self.last_idx: Optional[int] = None self.rotated: bool = False diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 10bab2843b7eb..0306bf43ec368 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -602,6 +602,7 @@ def run_training_epoch(self): # progress global step according to grads progress self.increment_accumulated_grad_global_step() + # used during checkpointing for current_epoch and global_step self.trainer.checkpoint_connector.has_trained = True # log epoch metrics diff --git a/pytorch_lightning/utilities/debugging.py b/pytorch_lightning/utilities/debugging.py index 242f3105d780c..f7b9e79b7f932 100644 --- a/pytorch_lightning/utilities/debugging.py +++ 
b/pytorch_lightning/utilities/debugging.py @@ -37,7 +37,6 @@ def wrapped_fn(self, *args, **kwargs): class InternalDebugger(object): def __init__(self, trainer): - self.enabled = os.environ.get('PL_DEV_DEBUG', '0') == '1' self.trainer = trainer self.logged_metrics = [] diff --git a/tests/loggers/test_tensorboard.py b/tests/loggers/test_tensorboard.py index bc2e198601ee9..b7688b781539c 100644 --- a/tests/loggers/test_tensorboard.py +++ b/tests/loggers/test_tensorboard.py @@ -21,9 +21,9 @@ from omegaconf import OmegaConf from tensorboard.backend.event_processing.event_accumulator import EventAccumulator -from pytorch_lightning import Trainer +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.loggers import TensorBoardLogger -from tests.base import EvalModelTemplate +from tests.base import EvalModelTemplate, BoringModel @pytest.mark.skipif( diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 6fceae4b5e59d..d45c1c50cc060 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -37,7 +37,7 @@ from pytorch_lightning.utilities.cloud_io import load as pl_load from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE -from tests.base import EvalModelTemplate +from tests.base import EvalModelTemplate, BoringModel @pytest.mark.parametrize("url_ckpt", [True, False]) @@ -305,28 +305,27 @@ def _optimizer_step( def test_gradient_accumulation_scheduling_last_batch(tmpdir, accumulate_grad_batches, limit_train_batches): """ Verify optimizer.step() applied to last batch while grad accumulation """ - class CurrentModel(EvalModelTemplate): - def on_after_backward(self): - self.loss_backward = deepcopy(self.state_dict()) + class CurrentModel(BoringModel): - def on_before_zero_grad(self, optimizer): - self.opt_step = self.state_dict() + def on_batch_start(self, batch, batch_idx, dataloader_idx): + self.on_train_batch_start_state_dict = self.state_dict() - def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): - _exclude_keys = ["num_batches_tracked", "running_mean", "running_var"] - - if (batch_idx + 1) == self.trainer.num_training_batches: - for key in self.loss_backward.keys(): - # exclude the check for batch_norm parameters - if not any([k in key for k in _exclude_keys]): - assert not torch.equal(self.loss_backward[key], self.opt_step[key]) + def on_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.on_train_batch_start_end_dict = self.state_dict() + for key in self.on_train_batch_start_end_dict.keys(): + if (batch_idx + 1) == self.trainer.num_training_batches: + assert torch.equal(self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key]) + else: + assert not torch.equal(self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key]) model = CurrentModel() trainer = Trainer( accumulate_grad_batches=accumulate_grad_batches, - max_epochs=4, + max_epochs=2, limit_train_batches=limit_train_batches, + limit_val_batches=0, + limit_test_batches=0, default_root_dir=tmpdir, ) From ad2556b669ea9fa555a5f2fb3ad070da4728c746 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 3 Nov 2020 11:38:32 +0530 Subject: [PATCH 42/88] Disable saving checkpoints if not trained (#4372) * Disable saving checkpoints if not trained * chlog * update test * fix Co-authored-by: chaton --- CHANGELOG.md | 3 + .../connectors/checkpoint_connector.py | 5 +- pytorch_lightning/trainer/training_loop.py | 6 +- 
tests/checkpointing/test_model_checkpoint.py | 121 +++++++++++------- 4 files changed, 79 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95417dd8aa9ae..cc430356191c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Disable saving checkpoints if not trained ([#4372](https://github.com/PyTorchLightning/pytorch-lightning/pull/4372)) + - Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) - Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) @@ -88,6 +90,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [1.0.3] - 2020-10-20 ### Added + - Added persistent flag to `Metric.add_state` ([#4195](https://github.com/PyTorchLightning/pytorch-lightning/pull/4195)) ### Changed diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 1bbdb4abac282..8d670e9388284 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -255,9 +255,8 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: has_reached_max_steps = self.trainer.max_steps and self.trainer.max_steps <= global_step global_step += 1 - if self.has_trained: - if not has_reached_max_steps: - current_epoch += 1 + if not has_reached_max_steps: + current_epoch += 1 checkpoint = { 'epoch': current_epoch, diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0306bf43ec368..0a931257f560f 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -203,7 +203,7 @@ def on_train_end(self): def check_checkpoint_callback(self, should_save, is_last=False): # TODO bake this logic into the checkpoint callback - if should_save: + if should_save and self.trainer.checkpoint_connector.has_trained: checkpoint_callbacks = [c for c in self.trainer.callbacks if isinstance(c, ModelCheckpoint)] if is_last and any(c.save_last for c in checkpoint_callbacks): rank_zero_info("Saving latest checkpoint...") @@ -579,6 +579,7 @@ def run_training_epoch(self): monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics) monitor_metrics.update(batch_output.batch_log_metrics) self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics) + self.trainer.checkpoint_connector.has_trained = True # max steps reached, end training if self.trainer.max_steps is not None and self.trainer.max_steps == self.trainer.global_step + 1: @@ -602,9 +603,6 @@ def run_training_epoch(self): # progress global step according to grads progress self.increment_accumulated_grad_global_step() - # used during checkpointing for current_epoch and global_step - self.trainer.checkpoint_connector.has_trained = True - # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics( epoch_output, self.checkpoint_accumulator, self.early_stopping_accumulator, self.num_optimizers diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index d3d5f67bcfeaa..0e5eb0997b57d 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -693,66 +693,89 @@ def 
validation_step(self, batch, batch_idx): loss = self.loss(batch, output) return {"val_loss": loss} - model = ExtendedBoringModel() - model.validation_step_end = None - model.validation_epoch_end = None - trainer = pl.Trainer(default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - ) - - assert trainer.checkpoint_connector.has_trained is not True - assert trainer.current_epoch == 0 - trainer.fit(model) - assert trainer.checkpoint_connector.has_trained is True - assert trainer.global_step == 2 - assert trainer.current_epoch == 0 - trainer.test(model) - assert trainer.current_epoch == 0 - assert str(os.listdir(osp.join(tmpdir, 'lightning_logs'))) == "['version_0']" - - def get_last_checkpoint(): - logs_dir = osp.join(tmpdir, 'lightning_logs') - versions = os.listdir(logs_dir) - versions.sort() - - last_version = versions[-1] - ckpt_dir = osp.join(logs_dir, last_version, "checkpoints") + def assert_trainer_init(trainer): + assert not trainer.checkpoint_connector.has_trained + assert trainer.global_step == 0 + assert trainer.current_epoch == 0 + def get_last_checkpoint(ckpt_dir): ckpts = os.listdir(ckpt_dir) ckpts.sort() - return osp.join(ckpt_dir, ckpts[-1]) - def assert_checkpoint_content(): - chk = pl_load(get_last_checkpoint()) - assert chk["epoch"] == 1 - assert chk["global_step"] == 2 + def assert_checkpoint_content(ckpt_dir): + chk = pl_load(get_last_checkpoint(ckpt_dir)) + assert chk["epoch"] == epochs + assert chk["global_step"] == 4 + + def assert_checkpoint_log_dir(idx): + lightning_logs_path = osp.join(tmpdir, 'lightning_logs') + assert sorted(os.listdir(lightning_logs_path)) == [f'version_{i}' for i in range(idx + 1)] + assert len(os.listdir(ckpt_dir)) == epochs + + def get_model(): + model = ExtendedBoringModel() + model.validation_step_end = None + model.validation_epoch_end = None + return model + + ckpt_dir = osp.join(tmpdir, 'checkpoints') + checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1) + epochs = 2 + limit_train_batches = 2 + + model = get_model() + + trainer_config = dict( + default_root_dir=tmpdir, + max_epochs=epochs, + limit_train_batches=limit_train_batches, + limit_val_batches=3, + limit_test_batches=4, + ) + + trainer = pl.Trainer( + **trainer_config, + checkpoint_callback=checkpoint_cb, + ) + assert_trainer_init(trainer) + + trainer.fit(model) + assert trainer.checkpoint_connector.has_trained + assert trainer.global_step == epochs * limit_train_batches + assert trainer.current_epoch == epochs - 1 + assert_checkpoint_log_dir(0) + + trainer.test(model) + assert trainer.current_epoch == epochs - 1 - assert_checkpoint_content() + assert_checkpoint_content(ckpt_dir) for idx in range(1, 5): + chk = get_last_checkpoint(ckpt_dir) + assert_checkpoint_content(ckpt_dir) + + checkpoint_cb = ModelCheckpoint(dirpath=ckpt_dir, save_top_k=-1) + model = get_model() + # load from checkpoint - chk = get_last_checkpoint() - assert_checkpoint_content() - model = BoringModel.load_from_checkpoint(chk) - trainer = pl.Trainer(default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, - limit_val_batches=2, - limit_test_batches=2, - resume_from_checkpoint=chk) - assert trainer.checkpoint_connector.has_trained is not True - assert trainer.global_step == 0 + trainer = pl.Trainer( + **trainer_config, + resume_from_checkpoint=chk, + checkpoint_callback=checkpoint_cb, + ) + assert_trainer_init(trainer) + trainer.test(model) - assert trainer.global_step == 2 + assert not trainer.checkpoint_connector.has_trained + 
assert trainer.global_step == epochs * limit_train_batches + assert trainer.current_epoch == epochs + trainer.fit(model) - assert trainer.global_step == 2 - assert trainer.checkpoint_connector.has_trained is not True - lightning_logs_path = osp.join(tmpdir, 'lightning_logs') - assert sorted(os.listdir(lightning_logs_path)) == [f"version_{i}" for i in range(idx + 1)] + assert not trainer.checkpoint_connector.has_trained + assert trainer.global_step == epochs * limit_train_batches + assert trainer.current_epoch == epochs + assert_checkpoint_log_dir(idx) @pytest.mark.parametrize( From 360b3d88440d455e2235da33616014f4f45d8bd0 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 3 Nov 2020 12:10:35 +0530 Subject: [PATCH 43/88] Disable training when limit_train_batches=0 (#4371) * Disable training when limit_train_batches=0 * chlog * pep * limit_train_batches * BoringModel Co-authored-by: Roger Shieh --- CHANGELOG.md | 3 ++ pytorch_lightning/trainer/trainer.py | 4 ++ pytorch_lightning/trainer/training_loop.py | 11 +++- tests/callbacks/test_progress_bar.py | 2 +- tests/trainer/test_trainer.py | 62 ++++++++++++++++++++++ 5 files changed, 80 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc430356191c3..8d9c3d8a1f186 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,10 +41,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) +- Disabled training when `limit_train_batches=0` ([#4371](https://github.com/PyTorchLightning/pytorch-lightning/pull/4371)) + - Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) - Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) + ## [1.0.4] - 2020-10-27 ### Added diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d3cc2f2e7278f..49cf232f76ac7 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -482,6 +482,10 @@ def train(self): # hook self.train_loop.on_train_start() + if self.train_loop.should_skip_training(): + self.train_loop.on_train_end() + return + try: # run all epochs for epoch in range(self.current_epoch, self.max_epochs): diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 0a931257f560f..3845b7eb728ac 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -77,6 +77,15 @@ def num_optimizers(self): num_optimizers = len(self.get_optimizers_iterable()) return num_optimizers + def should_skip_training(self): + if self.trainer.current_epoch >= self.trainer.max_epochs: + return True + + if self.trainer.limit_train_batches == 0: + return True + + return False + def on_train_start(self): # clear cache before training if self.trainer.on_gpu and self.trainer.root_gpu is not None: @@ -597,7 +606,7 @@ def run_training_epoch(self): self.trainer.total_batch_idx += 1 # stop epoch if we limited the number of training batches - if batch_idx + 1 >= self.trainer.num_training_batches: + if (batch_idx + 1) >= self.trainer.num_training_batches: break # progress global step according to grads progress diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py index d354b59682240..221844244ad75 100644 --- 
a/tests/callbacks/test_progress_bar.py +++ b/tests/callbacks/test_progress_bar.py @@ -231,7 +231,7 @@ def on_validation_epoch_end(self, trainer, pl_module): default_root_dir=tmpdir, max_epochs=1, num_sanity_val_steps=2, - limit_train_batches=0, + limit_train_batches=1, limit_val_batches=limit_val_batches, callbacks=[progress_bar], logger=False, diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index d45c1c50cc060..51fa89ee5539b 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -747,6 +747,68 @@ def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k): assert trainer.tested_ckpt_path == ckpt_path +def test_disabled_training(tmpdir): + """Verify that `limit_train_batches=0` disables the training loop unless `fast_dev_run=True`.""" + + class CurrentModel(BoringModel): + + training_step_invoked = False + training_epoch_end_invoked = False + + def training_step(self, *args, **kwargs): + self.training_step_invoked = True + return super().training_step(*args, **kwargs) + + def training_epoch_end(self, *args, **kwargs): + self.training_epoch_end_invoked = True + return super().training_epoch_end(*args, **kwargs) + + model = CurrentModel() + + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=2, + limit_train_batches=0.0, + limit_val_batches=0.2, + fast_dev_run=False, + ) + + before_state_dict = deepcopy(model.state_dict()) + + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + + after_state_dict = model.state_dict() + + for key in before_state_dict.keys(): + assert torch.all(torch.eq(before_state_dict[key], after_state_dict[key])) + + # check that limit_train_batches=0 turns off training + assert result == 1, "training failed to complete" + assert trainer.current_epoch == 0 + assert not model.training_step_invoked, "`training_step` should not run when `limit_train_batches=0`" + assert not model.training_epoch_end_invoked, "`training_epoch_end` should not run when `limit_train_batches=0`" + + # check that limit_train_batches has no influence when fast_dev_run is turned on + model = CurrentModel() + trainer_options.update(fast_dev_run=True) + before_state_dict = deepcopy(model.state_dict()) + + trainer = Trainer(**trainer_options) + result = trainer.fit(model) + + after_state_dict = model.state_dict() + + for key in before_state_dict.keys(): + assert not torch.all(torch.eq(before_state_dict[key], after_state_dict[key])) + + assert result == 1, "training failed to complete" + assert trainer.current_epoch == 0 + assert model.training_step_invoked, "did not run `training_step` with `fast_dev_run=True`" + assert model.training_epoch_end_invoked, "did not run `training_epoch_end` with `fast_dev_run=True`" + + def test_disabled_validation(tmpdir): """Verify that `limit_val_batches=0` disables the validation loop unless `fast_dev_run=True`.""" From 1396321b4d779dba837749dfba5e12fe5dac00d0 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 3 Nov 2020 15:09:40 +0530 Subject: [PATCH 44/88] Add fsspec to tuner (#4458) * Add fsspec to tuner * suggestions * pathlib * pep * missed pep --- pytorch_lightning/tuner/batch_size_scaling.py | 10 ++++-- pytorch_lightning/tuner/lr_finder.py | 10 ++++-- pytorch_lightning/tuner/tuning.py | 33 ++++++++++++------- tests/trainer/test_lr_finder.py | 3 ++ tests/trainer/test_trainer_tricks.py | 3 ++ 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py 
index 10de8a2d289e5..b468209171393 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -22,6 +22,7 @@ from pytorch_lightning.utilities.memory import is_oom_error, garbage_collection_cuda from pytorch_lightning.loggers.base import DummyLogger from pytorch_lightning import _logger as log +from pytorch_lightning.utilities.cloud_io import get_filesystem def scale_batch_size(trainer, @@ -90,7 +91,7 @@ def scale_batch_size(trainer, __scale_batch_reset_params(trainer, model, steps_per_trial) # Save initial model, that is loaded after batch size is found - save_path = os.path.join(trainer.default_root_dir, 'temp_model.ckpt') + save_path = os.path.join(trainer.default_root_dir, 'scale_batch_size_temp_model.ckpt') trainer.save_checkpoint(str(save_path)) if trainer.progress_bar_callback: @@ -109,8 +110,11 @@ def scale_batch_size(trainer, log.info(f'Finished batch size finder, will continue with full run using batch size {new_size}') # Restore initial state of model - trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) - os.remove(save_path) + if trainer.is_global_zero: + trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) + fs = get_filesystem(str(save_path)) + if fs.exists(save_path): + fs.rm(save_path) # Finish by resetting variables so trainer is ready to fit model __scale_batch_restore_params(trainer) diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py index 3107f9f44824a..54e7e082bc0a8 100644 --- a/pytorch_lightning/tuner/lr_finder.py +++ b/pytorch_lightning/tuner/lr_finder.py @@ -29,6 +29,7 @@ from pytorch_lightning.loggers.base import DummyLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import lightning_hasattr, lightning_setattr +from pytorch_lightning.utilities.cloud_io import get_filesystem # check if ipywidgets is installed before importing tqdm.auto # to ensure it won't fail and a progress bar is displayed @@ -130,7 +131,7 @@ def lr_find( trainer.fit(model) """ - save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp.ckpt') + save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp_model.ckpt') __lr_finder_dump_params(trainer, model) @@ -181,8 +182,11 @@ def lr_find( lr_finder._total_batch_idx = trainer.total_batch_idx # for debug purpose # Reset model state - trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) - os.remove(save_path) + if trainer.is_global_zero: + trainer.checkpoint_connector.restore(str(save_path), on_gpu=trainer.on_gpu) + fs = get_filesystem(str(save_path)) + if fs.exists(save_path): + fs.rm(save_path) # Finish by resetting variables so trainer is ready to fit model __lr_finder_restore_params(trainer, model) diff --git a/pytorch_lightning/tuner/tuning.py b/pytorch_lightning/tuner/tuning.py index 9929249804309..88009427bc3c6 100644 --- a/pytorch_lightning/tuner/tuning.py +++ b/pytorch_lightning/tuner/tuning.py @@ -54,17 +54,19 @@ def tune(self, model, train_dataloader, val_dataloaders, datamodule): # Run learning rate finder: if self.trainer.auto_lr_find: - self.internal_find_lr(self.trainer, model) + self.internal_find_lr(model) model.logger = self.trainer.logger # reset logger binding - def scale_batch_size(self, - model, - mode: str = 'power', - steps_per_trial: int = 3, - init_val: int = 2, - max_trials: int = 25, - batch_arg_name: str = 'batch_size', - **fit_kwargs): + def scale_batch_size( + self, + 
model, + mode: str = 'power', + steps_per_trial: int = 3, + init_val: int = 2, + max_trials: int = 25, + batch_arg_name: str = 'batch_size', + **fit_kwargs + ): r""" Will iteratively try to find the largest batch size for a given model that does not give an out of memory (OOM) error. @@ -102,7 +104,14 @@ def scale_batch_size(self, """ return scale_batch_size( - self.trainer, model, mode, steps_per_trial, init_val, max_trials, batch_arg_name, **fit_kwargs + self.trainer, + model, + mode, + steps_per_trial, + init_val, + max_trials, + batch_arg_name, + **fit_kwargs, ) def lr_find( @@ -130,8 +139,8 @@ def lr_find( datamodule, ) - def internal_find_lr(self, trainer, model: LightningModule): - return _run_lr_finder_internally(trainer, model) + def internal_find_lr(self, model: LightningModule): + return _run_lr_finder_internally(self.trainer, model) def pick_multiple_gpus(self, num_gpus: int): return pick_multiple_gpus(num_gpus) diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py index 7c128dc42c673..401584d920c9b 100755 --- a/tests/trainer/test_lr_finder.py +++ b/tests/trainer/test_lr_finder.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os from copy import deepcopy import pytest import torch @@ -58,6 +59,8 @@ def test_model_reset_correctly(tmpdir): assert torch.all(torch.eq(before_state_dict[key], after_state_dict[key])), \ 'Model was not reset correctly after learning rate finder' + assert not os.path.exists(tmpdir / 'lr_find_temp_model.ckpt') + def test_trainer_reset_correctly(tmpdir): """ Check that all trainer parameters are reset correctly after lr_find() """ diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py index 8743ba4e5ba7b..dd8b2689d8ed5 100755 --- a/tests/trainer/test_trainer_tricks.py +++ b/tests/trainer/test_trainer_tricks.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
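# --- Editorial aside (not part of the patch above or below) -----------------
# A minimal sketch of the cleanup pattern this fsspec tuner change introduces:
# resolve an fsspec filesystem from the checkpoint path, so the temporary
# checkpoint can be removed whether it lives on local disk or remote storage.
# `get_filesystem` is the helper the patch itself imports; the function name
# `remove_temp_checkpoint` below is illustrative only.
from pytorch_lightning.utilities.cloud_io import get_filesystem

def remove_temp_checkpoint(save_path: str) -> None:
    # resolves e.g. a local filesystem for plain paths, S3 for s3:// paths
    fs = get_filesystem(str(save_path))
    if fs.exists(save_path):
        fs.rm(save_path)
# -----------------------------------------------------------------------------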
+import os from copy import deepcopy import pytest import torch @@ -229,6 +230,8 @@ def test_auto_scale_batch_size_trainer_arg(tmpdir, scale_arg): assert before_batch_size != after_batch_size, \ 'Batch size was not altered after running auto scaling of batch size' + assert not os.path.exists(tmpdir / 'scale_batch_size_temp_model.ckpt') + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.parametrize('use_hparams', [True, False]) From 9b7f01654a96e72df29cbae36ffa430a8baa8f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 3 Nov 2020 12:13:10 +0100 Subject: [PATCH 45/88] Update old "module_arguments" and "hparams" references in docs (#4417) * replace module_arguments references * update hparams docs * add missing save_hyperparameters in example * deprecate instead of remove * Update docs/source/hyperparameters.rst Co-authored-by: chaton * Update docs/source/hyperparameters.rst Co-authored-by: Teddy Koker Co-authored-by: chaton Co-authored-by: Sean Naren --- docs/source/hyperparameters.rst | 106 +++++++++--------- docs/source/logging.rst | 4 +- docs/source/weights_loading.rst | 7 +- pytorch_lightning/core/lightning.py | 2 +- pytorch_lightning/core/saving.py | 8 +- .../connectors/checkpoint_connector.py | 6 +- tests/models/test_restore.py | 2 +- tests/trainer/test_trainer.py | 2 +- 8 files changed, 69 insertions(+), 68 deletions(-) diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst index 4d1d7ee45583d..7e5f349ba0ca8 100644 --- a/docs/source/hyperparameters.rst +++ b/docs/source/hyperparameters.rst @@ -112,87 +112,87 @@ Often times we train many versions of a model. You might share that model or com at which point it is very useful to know how that model was trained (i.e.: what learning rate, neural network, etc...). Lightning has a few ways of saving that information for you in checkpoints and yaml files. The goal here is to -improve readability and reproducibility +improve readability and reproducibility. -1. The first way is to ask lightning to save the values of anything in the __init__ for you to the checkpoint. This also -makes those values available via `self.hparams`. +1. The first way is to ask lightning to save the values of anything in the __init__ for you to the checkpoint. This also + makes those values available via `self.hparams`. -.. code-block:: python + .. code-block:: python - class LitMNIST(LightningModule): + class LitMNIST(LightningModule): - def __init__(self, layer_1_dim=128, learning_rate=1e-2, **kwargs): - super().__init__() - # call this to save (layer_1_dim=128, learning_rate=1e-4) to the checkpoint - self.save_hyperparameters() + def __init__(self, layer_1_dim=128, learning_rate=1e-2, **kwargs): + super().__init__() + # call this to save (layer_1_dim=128, learning_rate=1e-2) to the checkpoint + self.save_hyperparameters() - # equivalent - self.save_hyperparameters('layer_1_dim', 'learning_rate') + # equivalent + self.save_hyperparameters('layer_1_dim', 'learning_rate') - # this now works - self.hparams.layer_1_dim + # Now possible to access layer_1_dim from hparams + self.hparams.layer_1_dim -2. Sometimes your init might have objects or other parameters you might not want to save. -In that case, choose only a few +2. Sometimes your init might have objects or other parameters you might not want to save. + In that case, choose only a few -.. code-block:: python + .. code-block:: python - class LitMNIST(LightningModule): + class LitMNIST(LightningModule): - def __init__(self, loss_fx, generator_network, layer_1_dim=128 **kwargs): - super().__init__() - self.layer_1_dim = layer_1_dim - self.loss_fx = loss_fx + def __init__(self, loss_fx, generator_network, layer_1_dim=128, **kwargs): + super().__init__() + self.layer_1_dim = layer_1_dim + self.loss_fx = loss_fx - # call this to save (layer_1_dim=128) to the checkpoint - self.save_hyperparameters('layer_1_dim') + # call this to save (layer_1_dim=128) to the checkpoint + self.save_hyperparameters('layer_1_dim') - # to load specify the other args - model = LitMNIST.load_from_checkpoint(PATH, loss_fx=torch.nn.SomeOtherLoss, generator_network=MyGenerator()) + # to load specify the other args + model = LitMNIST.load_from_checkpoint(PATH, loss_fx=torch.nn.SomeOtherLoss, generator_network=MyGenerator()) -3. Assign to `self.hparams`. Anything assigned to `self.hparams` will also be saved automatically +3. Assign to `self.hparams`. Anything assigned to `self.hparams` will also be saved automatically. -.. code-block:: python + .. code-block:: python - # using a argparse.Namespace - class LitMNIST(LightningModule): + # using an argparse.Namespace + class LitMNIST(LightningModule): + def __init__(self, hparams, *args, **kwargs): + super().__init__() + self.hparams = hparams + self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) + self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) + self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) + def train_dataloader(self): + return DataLoader(mnist_train, batch_size=self.hparams.batch_size) + .. warning:: Deprecated. This method of assigning hyperparameters to the LightningModule is no longer + recommended and will not be supported in future versions of Lightning. - def __init__(self, hparams, *args, **kwargs): - super().__init__() - self.hparams = hparams + +4. You can also save full objects such as `dict` or `Namespace` to the checkpoint. - self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) - self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) - self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) - def train_dataloader(self): - return DataLoader(mnist_train, batch_size=self.hparams.batch_size) + .. code-block:: python -4. You can also save full objects such as `dict` or `Namespace` to the checkpoint. + # using an argparse.Namespace + class LitMNIST(LightningModule): -.. code-block:: python + def __init__(self, conf, *args, **kwargs): + super().__init__() + self.save_hyperparameters(conf) - # using a argparse.Namespace - class LitMNIST(LightningModule): + self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) + self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) + self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) - def __init__(self, conf, *args, **kwargs): - super().__init__() - self.hparams = conf + conf = OmegaConf.create(...) + model = LitMNIST(conf) + # Now possible to access any stored variables from hparams + model.hparams.anything - # equivalent - self.save_hyperparameters(conf) - self.layer_1 = torch.nn.Linear(28 * 28, self.hparams.layer_1_dim) - self.layer_2 = torch.nn.Linear(self.hparams.layer_1_dim, self.hparams.layer_2_dim) - self.layer_3 = torch.nn.Linear(self.hparams.layer_2_dim, 10) + conf = OmegaConf.create(...)
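# --- Editorial aside (not part of the patch above or below) -----------------
# A self-contained sketch of the behaviour the updated docs describe:
# `save_hyperparameters()` captures the __init__ arguments on `self.hparams`
# and stores them in any later checkpoint under the "hyper_parameters" key.
# `DemoModel` is illustrative only and not part of the patched docs.
from pytorch_lightning import LightningModule

class DemoModel(LightningModule):
    def __init__(self, layer_1_dim: int = 128, learning_rate: float = 1e-2):
        super().__init__()
        # records both init args; they end up in checkpoints as well
        self.save_hyperparameters()

model = DemoModel(layer_1_dim=256)
assert model.hparams.layer_1_dim == 256      # explicit value is kept
assert model.hparams.learning_rate == 1e-2   # default is captured too
# -----------------------------------------------------------------------------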
- model = LitMNIST(conf) - # this works - model.hparams.anything ---------- diff --git a/docs/source/logging.rst b/docs/source/logging.rst index a1d16a4ddc771..ae1a0487a468b 100644 --- a/docs/source/logging.rst +++ b/docs/source/logging.rst @@ -260,12 +260,12 @@ Logging hyperparameters *********************** When training a model, it's useful to know what hyperparams went into that model. -When Lightning creates a checkpoint, it stores a key "hparams" with the hyperparams. +When Lightning creates a checkpoint, it stores a key "hyper_parameters" with the hyperparams. .. code-block:: python lightning_checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) - hyperparams = lightning_checkpoint['hparams'] + hyperparams = lightning_checkpoint['hyper_parameters'] Some loggers also allow logging the hyperparams used in the experiment. For instance, when using the TestTubeLogger or the TensorBoardLogger, all hyperparams will show diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst index e3e4349253a54..cddee7e1feb3f 100644 --- a/docs/source/weights_loading.rst +++ b/docs/source/weights_loading.rst @@ -111,7 +111,7 @@ You can disable checkpointing by passing The Lightning checkpoint also saves the arguments passed into the LightningModule init -under the `module_arguments` key in the checkpoint. +under the `hyper_parameters` key in the checkpoint. .. code-block:: python @@ -119,10 +119,11 @@ under the `module_arguments` key in the checkpoint. def __init__(self, learning_rate, *args, **kwargs): super().__init__() + self.save_hyperparameters() # all init args were saved to the checkpoint checkpoint = torch.load(CKPT_PATH) - print(checkpoint['module_arguments']) + print(checkpoint['hyper_parameters']) # {'learning_rate': the_value} Manual saving @@ -140,7 +141,7 @@ You can manually save checkpoints and restore your model from the checkpointed s Checkpoint loading ****************** -To load a model along with its weights, biases and `module_arguments` use the following method: +To load a model along with its weights, biases and hyperparameters use the following method: .. code-block:: python diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index f185626646803..78b9c7025711a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1415,7 +1415,7 @@ def _auto_collect_arguments(cls, frame=None) -> Tuple[Dict, Dict]: frame_args = collect_init_args(frame.f_back, []) self_arguments = frame_args[-1] - # set module_arguments in child + # set hyper_parameters in child self_arguments = self_arguments parents_arguments = {} diff --git a/pytorch_lightning/core/saving.py b/pytorch_lightning/core/saving.py index 7124007cd3f2a..2662aa6758332 100644 --- a/pytorch_lightning/core/saving.py +++ b/pytorch_lightning/core/saving.py @@ -60,9 +60,9 @@ def load_from_checkpoint( ): r""" Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint - it stores the arguments passed to `__init__` in the checkpoint under `module_arguments` + it stores the arguments passed to `__init__` in the checkpoint under `hyper_parameters` - Any arguments specified through \*args and \*\*kwargs will override args stored in `hparams`. + Any arguments specified through \*args and \*\*kwargs will override args stored in `hyper_parameters`. Args: checkpoint_path: Path to checkpoint. This can also be a URL, or file-like object @@ -89,8 +89,8 @@ def load_from_checkpoint( `hparams` as :class:`~dict`. 
strict: Whether to strictly enforce that the keys in :attr:`checkpoint_path` match the keys returned by this module's state dict. Default: `True`. - hparam_overrides: A dictionary with keys to override in the hparams - kwargs: Any keyword args needed to init the model. + kwargs: Any extra keyword args needed to init the model. Can also be used to override saved + hyperparameter values. Return: :class:`LightningModule` with loaded weights and hyperparameters (if available). diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 8d670e9388284..1689ad245eb78 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -234,7 +234,7 @@ def hpc_save(self, folderpath: str, logger): if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] rank_zero_warn( - 'warning, `module_arguments` dropped from checkpoint.' f' An attribute is not picklable {err}' + 'warning, `hyper_parameters` dropped from checkpoint.' f' An attribute is not picklable {err}' ) atomic_save(checkpoint, filepath) @@ -288,7 +288,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: elif self.trainer.amp_backend == AMPType.APEX: checkpoint['amp_scaling_state'] = amp.state_dict() - # add the module_arguments and state_dict from the model + # add the hyper_parameters and state_dict from the model model = self.trainer.get_model() checkpoint['state_dict'] = model.state_dict() @@ -366,6 +366,6 @@ def save_checkpoint(self, filepath, weights_only: bool = False): if LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in checkpoint: del checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] rank_zero_warn( - 'Warning, `module_arguments` dropped from checkpoint.' f' An attribute is not picklable {err}' + 'Warning, `hyper_parameters` dropped from checkpoint.' 
f' An attribute is not picklable {err}' ) atomic_save(checkpoint, filepath) diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py index 848d6127c4cdb..024a0ee531580 100644 --- a/tests/models/test_restore.py +++ b/tests/models/test_restore.py @@ -300,7 +300,7 @@ def test_load_model_from_checkpoint(tmpdir, model_template): # Since `EvalModelTemplate` has `_save_hparams = True` by default, check that ckpt has hparams ckpt = torch.load(last_checkpoint) - assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'module_arguments missing from checkpoints' + assert model_template.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), 'hyper_parameters missing from checkpoints' # Ensure that model can be correctly restored from checkpoint pretrained_model = model_template.load_from_checkpoint(last_checkpoint) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 51fa89ee5539b..801e3df73068c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -68,7 +68,7 @@ def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt): # assert ckpt has hparams ckpt = torch.load(new_weights_path) - assert LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), "module_arguments missing from checkpoints" + assert LightningModule.CHECKPOINT_HYPER_PARAMS_KEY in ckpt.keys(), "hyper_parameters missing from checkpoints" # load new model hparams_path = tutils.get_data_path(logger, path_dir=tmpdir) From 7b375ed1d3ce5f55c1021038d2892dce6ae8bd66 Mon Sep 17 00:00:00 2001 From: tarepan Date: Wed, 4 Nov 2020 01:39:29 +0900 Subject: [PATCH 46/88] Add CheckpointConnector internal commentaries (#4421) * Add CheckpointConnector commentaries * Fix comment format * Fix save/load schema as function comments Co-authored-by: chaton --- .../connectors/checkpoint_connector.py | 62 ++++++++++++++----- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 1689ad245eb78..3bf76e4c30630 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -54,10 +54,10 @@ def __init__(self, trainer): def restore_weights(self, model: LightningModule): """ - We attempt to restore weights in this order: - 1. HPC weights. - 2. if no HPC weights restore checkpoint_path weights - 3. otherwise don't restore weights + Attempt to restore a checkpoint (e.g. weights) in this priority: + 1. from HPC weights + 2. from `resume_from_checkpoint` file + 3. don't restore """ # clear cache before restore if self.trainer.on_gpu: @@ -83,20 +83,23 @@ def restore_weights(self, model: LightningModule): def restore(self, checkpoint_path: str, on_gpu: bool): """ - Restore training state from checkpoint. + Load model/training states from the checkpoint file through file-read and state-restore. 
Also restores all training state like: - epoch - callbacks - schedulers - optimizer + In detail, check return value description of `dump_checkpoint` """ # if on_gpu: # checkpoint = torch.load(checkpoint_path) # else: # load on CPU first + # read a checkpoint dictionary object from the checkpoint file at `checkpoint_path` checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage) + # restore states from the checkpoint dictionary object # load model state model = self.trainer.get_model() @@ -104,10 +107,10 @@ def restore(self, checkpoint_path: str, on_gpu: bool): if self.trainer.datamodule is not None: self.trainer.datamodule.on_load_checkpoint(checkpoint) - # give model a chance to load something + # give model a chance to restore something model.on_load_checkpoint(checkpoint) - # load the state_dict on the model automatically + # restore the state_dict on the model model.load_state_dict(checkpoint['state_dict']) if on_gpu: @@ -129,6 +132,7 @@ def restore_training_state(self, checkpoint): :param checkpoint: :return: """ + # validation if 'optimizer_states' not in checkpoint or 'lr_schedulers' not in checkpoint: raise KeyError( 'Trying to restore training state but checkpoint contains only the model.' @@ -143,7 +147,7 @@ def restore_training_state(self, checkpoint): " where `model.ckpt` is your checkpoint file." ) - # load callback states + # restore callback states self.trainer.on_load_checkpoint(checkpoint) self.trainer.global_step = checkpoint['global_step'] @@ -241,15 +245,31 @@ def hpc_save(self, folderpath: str, logger): return filepath def dump_checkpoint(self, weights_only: bool = False) -> dict: - """Creating model checkpoint. + """Creating a model checkpoint dictionary object from various component states. Args: weights_only: saving model weights only Return: - structured dictionary + structured dictionary: { + 'epoch': training epoch + 'global_step': training global step + 'pytorch-lightning_version': PyTorch Lightning's version + 'callbacks': "callback specific state"[] # if not weights_only + 'optimizer_states': "PT optim's state_dict"[] # if not weights_only + 'lr_schedulers': "PT sched's state_dict"[] # if not weights_only + 'native_amp_scaling_state': PT amp's state_dict # if not weights_only and use native amp + 'amp_scaling_state': Apex's state_dict # if not weights_only and use apex amp + 'state_dict': Model's state_dict (e.g. 
network weights) + CHECKPOINT_HYPER_PARAMS_NAME: + CHECKPOINT_HYPER_PARAMS_KEY: + CHECKPOINT_HYPER_PARAMS_TYPE: + something_cool_i_want_to_save: anything you define through model.on_save_checkpoint + LightningDataModule.__class__.__name__: pl DataModule's state + } """ + # dump epoch/global_step/pytorch-lightning_version current_epoch = self.trainer.current_epoch global_step = self.trainer.global_step has_reached_max_steps = self.trainer.max_steps and self.trainer.max_steps <= global_step @@ -266,23 +286,23 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: if not weights_only: - # save callbacks + # dump callbacks callback_states = self.trainer.on_save_checkpoint() checkpoint['callbacks'] = callback_states - # save optimizers + # dump optimizers optimizer_states = [] for i, optimizer in enumerate(self.trainer.optimizers): optimizer_states.append(optimizer.state_dict()) checkpoint['optimizer_states'] = optimizer_states - # save lr schedulers + # dump lr schedulers lr_schedulers = [] for scheduler in self.trainer.lr_schedulers: lr_schedulers.append(scheduler['scheduler'].state_dict()) checkpoint['lr_schedulers'] = lr_schedulers - # save native amp scaling + # dump amp scaling if self.trainer.amp_backend == AMPType.NATIVE and not self.trainer.use_tpu and self.trainer.scaler is not None: checkpoint['native_amp_scaling_state'] = self.trainer.scaler.state_dict() elif self.trainer.amp_backend == AMPType.APEX: @@ -291,12 +311,13 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: # add the hyper_parameters and state_dict from the model model = self.trainer.get_model() + # dump the module_arguments and state_dict from the model checkpoint['state_dict'] = model.state_dict() if model.hparams: if hasattr(model, '_hparams_name'): checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_NAME] = model._hparams_name - # add arguments to the checkpoint + # dump arguments if OMEGACONF_AVAILABLE: checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = model.hparams if isinstance(model.hparams, Container): @@ -304,7 +325,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: else: checkpoint[LightningModule.CHECKPOINT_HYPER_PARAMS_KEY] = dict(model.hparams) - # give the model a chance to add a few things + # give the model a chance to dump a few things model.on_save_checkpoint(checkpoint) if self.trainer.datamodule is not None: self.trainer.datamodule.on_save_checkpoint(checkpoint) @@ -356,10 +377,17 @@ def max_ckpt_in_folder(self, path, name_key='ckpt_'): return max(ckpt_vs) def save_checkpoint(self, filepath, weights_only: bool = False): + """Save model/training states as a checkpoint file through state-dump and file-write. + + Args: + filepath: write-target file's path + weights_only: saving model weights only + """ + # dump states as a checkpoint dictionary object checkpoint = self.dump_checkpoint(weights_only) if self.trainer.is_global_zero: - # do the actual save + # write the checkpoint dictionary on the file try: atomic_save(checkpoint, filepath) except AttributeError as err: From a32bffcdea3c12fb369f2cb87cbcf8b7e91396e6 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 3 Nov 2020 17:10:51 +0000 Subject: [PATCH 47/88] feature/ Add note about Argparse. 
(#4321) * add a note about argparse * update Co-authored-by: Jeff Yang Co-authored-by: Rohit Gupta --- pl_examples/bug_report_model.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pl_examples/bug_report_model.py b/pl_examples/bug_report_model.py index 27ecf774623af..dbea2013d1110 100644 --- a/pl_examples/bug_report_model.py +++ b/pl_examples/bug_report_model.py @@ -105,7 +105,14 @@ def configure_optimizers(self): return [optimizer], [lr_scheduler] +# NOTE: If you are using a cmd line to run your script, +# provide the cmd line as below. +# opt = "--max_epochs 1 --limit_train_batches 1".split(" ") +# parser = ArgumentParser() +# args = parser.parse_args(opt) + def run_test(): + class TestModel(BoringModel): def on_train_epoch_start(self) -> None: From ee414d25be003aba40b193c766b6105efad4d20c Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Wed, 4 Nov 2020 00:31:51 +0630 Subject: [PATCH 48/88] Switch to PyTorch 1.6 in Drone CI (#4393) * switch to 1.6 * readme * 1.7 * back to normal [ci skip] * horovodrun --verbose * try with apex * add apex test * change base * description * test with 1.7 * back to 1.6 * no gradient_clip_val * re-add gradient_clip_val * no amp * temp skip torch.cuda.amp + horovod test * Apply suggestion from code review Co-authored-by: Sean Naren * Fix formatting * ddp * Moved extended model outside of function to prevent pickling issue for drone * typo * resolve bug * extract automatic_automization Co-authored-by: Sean Naren Co-authored-by: SeanNaren Co-authored-by: chaton --- .drone.yml | 2 +- README.md | 2 +- pytorch_lightning/plugins/native_amp.py | 6 +++-- tests/models/test_horovod.py | 29 +++++++++++++++++++- tests/plugins/test_amp_plugin.py | 35 ++++++++++++------------- 5 files changed, 51 insertions(+), 23 deletions(-) diff --git a/.drone.yml b/.drone.yml index bb4d8a74b28f5..84e97150752e7 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,7 +20,7 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 environment: CODECOV_TOKEN: diff --git a/README.md b/README.md index 33ee544802914..30079df931759 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ Lightning can automatically export to ONNX or TorchScript for those cases. | System / PyTorch ver. | 1.3 (min. 
req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | | OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index b016b6c5d24fb..98bc8dfc87d25 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -29,8 +29,10 @@ def connect(self, model, optimizers): def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = self.trainer.scaler.scale(closure_loss) + automatic_optimization = self.trainer.train_loop.automatic_optimization + # do backward pass - if self.trainer.train_loop.automatic_optimization: + if 
automatic_optimization: model = self.trainer.get_model() model.backward(closure_loss, optimizer, opt_idx) else: @@ -40,7 +42,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): closure_loss = closure_loss.detach() # unscale gradient to allow analyze within `on_after_backward` - if not self.trainer.train_loop.should_accumulate(): + if not self.trainer.train_loop.should_accumulate() and automatic_optimization: self.trainer.scaler.unscale_(optimizer) return closure_loss diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 449bc8c712ddb..d09d9387ea485 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -25,6 +25,7 @@ import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE from tests.base import EvalModelTemplate from tests.base.models import BasicGAN @@ -126,8 +127,33 @@ def test_horovod_multi_gpu(tmpdir): @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not APEX_AVAILABLE, reason="test requires apex") +def test_horovod_apex(tmpdir): + """Test Horovod with multi-GPU support using apex amp.""" + trainer_options = dict( + default_root_dir=str(tmpdir), + weights_save_path=str(tmpdir), + gradient_clip_val=1.0, + progress_bar_refresh_rate=0, + max_epochs=1, + limit_train_batches=0.4, + limit_val_batches=0.2, + gpus=2, + deterministic=True, + distributed_backend='horovod', + amp_backend='apex', + precision=16, + ) + _run_horovod(trainer_options, on_gpu=True) + + +@pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not NATIVE_AMP_AVALAIBLE, reason="test requires torch.cuda.amp") def test_horovod_amp(tmpdir): - """Test Horovod with multi-GPU support.""" + """Test Horovod with multi-GPU support using native amp.""" trainer_options = dict( default_root_dir=str(tmpdir), weights_save_path=str(tmpdir), @@ -139,6 +165,7 @@ def test_horovod_amp(tmpdir): gpus=2, deterministic=True, distributed_backend='horovod', + amp_backend='native', precision=16, ) _run_horovod(trainer_options, on_gpu=True) diff --git a/tests/plugins/test_amp_plugin.py b/tests/plugins/test_amp_plugin.py index 6fd000b61d97f..6d7f0252d94c3 100644 --- a/tests/plugins/test_amp_plugin.py +++ b/tests/plugins/test_amp_plugin.py @@ -86,20 +86,19 @@ def on_fit_start(self, trainer, pl_module): trainer.fit(model) +class GradientUnscaleBoringModel(BoringModel): + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. 
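# --- Editorial aside (not part of the patch above or below) -----------------
# Why the test models above were hoisted to module scope: distributed
# launchers pickle the model, and pickle can only serialize instances of
# classes importable from a module's top level. A minimal illustration
# (class and function names are illustrative only):
import pickle

class ModuleLevel:
    pass

def build_local_class():
    class Local:
        pass
    return Local

pickle.dumps(ModuleLevel())  # fine: resolvable as <module>.ModuleLevel

try:
    pickle.dumps(build_local_class()())
except (pickle.PicklingError, AttributeError) as err:
    # CPython reports: Can't pickle local object 'build_local_class.<locals>.Local'
    print(err)
# -----------------------------------------------------------------------------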
+ + @pytest.mark.skipif( LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_gradient_unscale(tmpdir): - - class ExtendedBoringModel(BoringModel): - - def on_after_backward(self): - norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) - if not (torch.isinf(norm) or torch.isnan(norm)): - assert norm.item() < 15. - - model = ExtendedBoringModel() + model = GradientUnscaleBoringModel() trainer = Trainer( max_epochs=2, @@ -117,19 +116,19 @@ def on_after_backward(self): trainer.fit(model) +class UnscaleAccumulateGradBatchesBoringModel(BoringModel): + + def on_after_backward(self): + norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) + if not (torch.isinf(norm) or torch.isnan(norm)): + assert norm.item() < 15. + + @pytest.mark.skipif( LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Minimal PT version is set to 1.6") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") def test_amp_gradient_unscale_accumulate_grad_batches(tmpdir): - - class ExtendedBoringModel(BoringModel): - - def on_after_backward(self): - norm = torch.nn.utils.clip_grad_norm_(self.parameters(), 2) - if not (torch.isinf(norm) or torch.isnan(norm)): - assert norm.item() < 15. - - model = ExtendedBoringModel() + model = UnscaleAccumulateGradBatchesBoringModel() trainer = Trainer( max_epochs=2, From 5d08559c0331ae700c3de4f4df9ecfd2f8782c83 Mon Sep 17 00:00:00 2001 From: ananthsub Date: Tue, 3 Nov 2020 14:02:02 -0800 Subject: [PATCH 49/88] Avoid torchscript export for Metric forward (#4428) * Update metric.py * add test * Update CHANGELOG.md * Update test_metric_lightning.py * Update test_metric_lightning.py Co-authored-by: Jirka Borovec --- CHANGELOG.md | 3 +- pytorch_lightning/metrics/metric.py | 1 + tests/metrics/test_metric_lightning.py | 41 +++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d9c3d8a1f186..194d50ef37d48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) -- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) +- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) ### Changed @@ -47,6 +47,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) +- Fixed TorchScript export when module includes Metrics ([#4428](https://github.com/PyTorchLightning/pytorch-lightning/pull/4428)) ## [1.0.4] - 2020-10-27 diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index f003e0d3da72a..b716817427230 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -145,6 +145,7 @@ def add_state( self._defaults[name] = deepcopy(default) self._reductions[name] = dist_reduce_fx + @torch.jit.unused def forward(self, *args, **kwargs): """ Automatically calls ``update()``. Returns the metric value over inputs if ``compute_on_step`` is True. diff --git a/tests/metrics/test_metric_lightning.py b/tests/metrics/test_metric_lightning.py index 7a860ea6c16fd..3c6938734be10 100644 --- a/tests/metrics/test_metric_lightning.py +++ b/tests/metrics/test_metric_lightning.py @@ -1,5 +1,6 @@ -import torch +import os +import torch from pytorch_lightning import Trainer from pytorch_lightning.metrics import Metric from tests.base.boring_model import BoringModel @@ -78,3 +79,41 @@ def training_step(self, batch, batch_idx): logged = trainer.logged_metrics assert torch.allclose(torch.tensor(logged["sum"]), model.sum) + + +def test_scriptable(tmpdir): + class TestModel(BoringModel): + def __init__(self): + super().__init__() + # the metric is not used in the module's `forward` + # so the module should be exportable to TorchScript + self.metric = SumMetric() + self.sum = 0.0 + + def training_step(self, batch, batch_idx): + x = batch + self.metric(x.sum()) + self.sum += x.sum() + self.log("sum", self.metric, on_epoch=True, on_step=False) + return self.step(x) + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + logger=False, + checkpoint_callback=False, + ) + trainer.fit(model) + rand_input = torch.randn(10, 32) + + script_model = model.to_torchscript() + + # test that we can still do inference + output = model(rand_input) + script_output = script_model(rand_input) + assert torch.allclose(output, script_output) From fc78ffa622fbcc91015d5f669fcff9c2e8bbf530 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 4 Nov 2020 10:08:37 +0100 Subject: [PATCH 50/88] extend release testing (#4506) * extend release testing * Drone * also PR to release * actions versions --- .drone.yml | 1 + .github/workflows/ci_dockers.yml | 12 ++++++------ .github/workflows/ci_pkg-install.yml | 6 +++--- .github/workflows/ci_test-base.yml | 6 +++--- .github/workflows/ci_test-conda.yml | 6 +++--- .github/workflows/ci_test-full.yml | 6 +++--- .github/workflows/ci_test-tpu.yml | 4 ++-- .github/workflows/code-formatting.yml | 4 ++-- .github/workflows/docker-builds.yml | 2 +- .github/workflows/docs-checks.yml | 8 ++++---- .github/workflows/nightly.yml | 8 ++++---- .github/workflows/pypi-release.yml | 8 ++++---- 12 files changed, 36 insertions(+), 35 deletions(-) diff --git a/.drone.yml b/.drone.yml index 84e97150752e7..5e6c08f7a8256 100644 --- a/.drone.yml +++ b/.drone.yml @@ -46,6 +46,7 @@ steps: trigger: branch: - master + - release/* event: include: - push diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 1cbc99d1b68b8..81800b325e075 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -4,9 +4,9 @@ name: CI build Docker # 
see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: build-PL: @@ -24,7 +24,7 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build PL Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -49,7 +49,7 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build XLA Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -86,7 +86,7 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -129,7 +129,7 @@ jobs: # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index aea8d6509db95..4d70beddf3f1b 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -3,9 +3,9 @@ name: Install pkg # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: @@ -20,7 +20,7 @@ jobs: python-version: [3.6, 3.8] steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci_test-base.yml b/.github/workflows/ci_test-base.yml index 9b490dec76ee6..de88e94914292 100644 --- a/.github/workflows/ci_test-base.yml +++ b/.github/workflows/ci_test-base.yml @@ -3,9 +3,9 @@ name: CI base testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: doctest: @@ -72,7 +72,7 @@ jobs: coverage run --source pytorch_lightning -m pytest pytorch_lightning -v --color=yes --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index b165b0a604344..c86806785880b 100644 --- a/.github/workflows/ci_test-conda.yml +++ 
b/.github/workflows/ci_test-conda.yml @@ -3,9 +3,9 @@ name: PyTorch & Conda # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: conda: @@ -46,7 +46,7 @@ jobs: shell: bash -l {0} - name: Upload pytest test results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index 05db07c2a6c34..cd07c655094f3 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -3,9 +3,9 @@ name: CI complete testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: pytest: @@ -122,7 +122,7 @@ jobs: coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --color=yes --durations=0 --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml - name: Upload pytest test results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }} path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml diff --git a/.github/workflows/ci_test-tpu.yml b/.github/workflows/ci_test-tpu.yml index 3d58542c490f3..ffedd9728bb06 100644 --- a/.github/workflows/ci_test-tpu.yml +++ b/.github/workflows/ci_test-tpu.yml @@ -2,7 +2,7 @@ name: TPU tests on: push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x # TODO: temporal disable TPU testing until we find way how to pass credentials to forked PRs # pull_request: # branches: @@ -119,7 +119,7 @@ jobs: # pycobertura show coverage.xml - name: Upload coverage results - uses: actions/upload-artifact@master + uses: actions/upload-artifact@v2 with: name: coverage-TPU path: coverage diff --git a/.github/workflows/code-formatting.yml b/.github/workflows/code-formatting.yml index 598b7a5df9aa8..e549f5b8f1cfb 100644 --- a/.github/workflows/code-formatting.yml +++ b/.github/workflows/code-formatting.yml @@ -2,9 +2,9 @@ name: "Check Code Format" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: code-black: diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 673e65d5ccfae..b8ca5d8723b39 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -3,7 +3,7 @@ name: Publish Docker Releases # https://github.com/docker/build-push-action on: push: - branches: [master] + branches: [master, "release/*"] # include release branches like 
release/1.0.x release: types: [created] diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index fa7580e0f9726..b060d1ace6600 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -3,9 +3,9 @@ name: "Docs check" on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x pull_request: - branches: [master] + branches: [master, "release/*"] jobs: check-docs: @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: 3.7 @@ -68,7 +68,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@master + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: 3.7 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f60ecd09dfb84..e4d21efae8944 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -29,7 +29,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.4 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -57,7 +57,7 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} - name: Publish XLA to Docker Hub - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -104,7 +104,7 @@ jobs: id: extend - name: Publish CUDA to Docker Hub - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | @@ -119,7 +119,7 @@ jobs: timeout-minutes: 55 - name: Publish Conda to Docker Hub - # publish master + # publish master/release uses: docker/build-push-action@v2 with: build-args: | diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 7579fc0b199a4..e3e766d1c3db9 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -3,9 +3,9 @@ name: PyPI Release # https://help.github.com/en/actions/reference/events-that-trigger-workflows on: # Trigger the workflow on push or pull request, but only for the master branch push: - branches: [master] + branches: [master, "release/*"] # include release branches like release/1.0.x release: - types: [created] + types: [created, "release/*"] jobs: @@ -30,7 +30,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.4 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -39,7 +39,7 @@ jobs: - name: Publish distribution 📦 to PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@v1.4 with: user: __token__ password: ${{ secrets.pypi_password }} From db3cf2e1f4ced299c9e2afb2e0c92c40a2a7d2c0 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 4 Nov 2020 15:37:42 +0530 Subject: [PATCH 51/88] Update default filename (#4500) Co-authored-by: Jirka Borovec Co-authored-by: Jeff Yang --- pytorch_lightning/callbacks/model_checkpoint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py 
b/pytorch_lightning/callbacks/model_checkpoint.py index f3eabf5611cf0..413e96a39afcd 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -366,7 +366,8 @@ def _format_checkpoint_name( ) -> str: if not filename: # filename is not set, use default name - filename = "{epoch}-{step}" + filename = "{epoch}" + cls.CHECKPOINT_JOIN_CHAR + "{step}" + # check and parse user passed keys in the string groups = re.findall(r"(\{.*?)[:\}]", filename) if len(groups) >= 0: From f79fb3716cfd4ba27052879b38381a05eb3db526 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 4 Nov 2020 16:55:30 +0100 Subject: [PATCH 52/88] update contrib notes (#4514) Co-authored-by: chaton --- .github/BECOMING_A_CORE_CONTRIBUTOR.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index 1c1a4136ea557..3fa357ef062ca 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -53,7 +53,10 @@ different fields contributing! The first 5 core contributors will fit this profile. Thus if you overlap strongly with experiences and expertise as someone else on the team, you might have to wait until the next set of contributors are added. #### Summary: Requirements to apply -- Solve 10 Github issues. The goal is to be inline with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. -- Do 10 PR reviews. The goal is to be inline with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. +The goal is to be in line with expectations for solving issues by the last one so you can do them on your own. If not, I might ask you to solve a few more specific ones. -If you want to be considered, ping me on gitter and start [tracking your progress here](https://docs.google.com/spreadsheets/d/15D58gp8DvI0Z6qbbYVRuaWioiwzafcP58-UlbuO_CMU/edit?usp=sharing). +- Solve 10+ GitHub issues. +- Create 5+ meaningful PRs which solve a reported issue (e.g., a bug). +- Perform 10+ PR reviews from other contributors. + +If you want to be considered, ping me on [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A).
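For readers following [PATCH 51/88] above: the change swaps the hard-coded "-" in the default checkpoint filename for the `CHECKPOINT_JOIN_CHAR` class attribute, so the separator can be changed in one place. Below is a minimal, self-contained sketch of the name-formatting behavior; `format_checkpoint_name` is a hypothetical helper written for illustration, not the `ModelCheckpoint` method itself, and it assumes the default join character `-`.

```python
import re

CHECKPOINT_JOIN_CHAR = "-"  # assumed default; a subclass could override this once

def format_checkpoint_name(filename=None, metrics=None):
    """Hypothetical stand-in for the patched default-name logic."""
    metrics = dict(metrics or {})
    if not filename:
        # the patched default: the join character comes from the class attribute
        filename = "{epoch}" + CHECKPOINT_JOIN_CHAR + "{step}"
    # rewrite "{epoch}" -> "epoch={epoch}" so each value is labeled in the name
    for group in re.findall(r"(\{.*?)[:\}]", filename):
        name = group[1:]
        filename = filename.replace(group, name + "={" + name)
        metrics.setdefault(name, 0)
    return filename.format(**metrics)

print(format_checkpoint_name(metrics={"epoch": 3, "step": 220}))
# -> epoch=3-step=220
```

With `CHECKPOINT_JOIN_CHAR = "_"` the same default template would yield `epoch=3_step=220`, without touching the formatting logic.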
From 0d1365c442860f7429f4b1c7eea6b1436273ea0c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 4 Nov 2020 18:06:31 +0100 Subject: [PATCH 53/88] release v1 (#4516) --- .github/workflows/nightly.yml | 2 +- .github/workflows/pypi-release.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index e4d21efae8944..621713beb28a5 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -29,7 +29,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1.4 + uses: pypa/gh-action-pypi-publish@v1 with: user: __token__ password: ${{ secrets.test_pypi_password }} diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index e3e766d1c3db9..fb99ae2284f76 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -30,7 +30,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@v1.4 + uses: pypa/gh-action-pypi-publish@v1 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -39,7 +39,7 @@ jobs: - name: Publish distribution 📦 to PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@v1.4 + uses: pypa/gh-action-pypi-publish@v1 with: user: __token__ password: ${{ secrets.pypi_password }} From 945d6f5f317c92dcc490f66137cda6861ef2b97d Mon Sep 17 00:00:00 2001 From: Jon Tamir Date: Wed, 4 Nov 2020 12:10:57 -0600 Subject: [PATCH 54/88] is_picklable: catch AttributeError (addresses #3771) (#4508) * is_picklable: catch AttributeError (addresses #3771) * edit Co-authored-by: Jirka Borovec Co-authored-by: chaton --- pytorch_lightning/utilities/parsing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py index c562f780e88ee..c8230205752d4 100644 --- a/pytorch_lightning/utilities/parsing.py +++ b/pytorch_lightning/utilities/parsing.py @@ -61,7 +61,7 @@ def is_picklable(obj: object) -> bool: try: pickle.dumps(obj) return True - except pickle.PicklingError: + except (pickle.PicklingError, AttributeError): return False From 11dc5264cdb888a576ea17fc2d4b6ff743fe403d Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 4 Nov 2020 19:35:07 +0000 Subject: [PATCH 55/88] Bugfix/4449 dict attribute error (#4480) * resolve a bug * resolve a bug * remove todo * resolve more bugs * update tests * Update pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py Co-authored-by: Sean Naren * Update pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py Co-authored-by: Sean Naren * resolve pyright Co-authored-by: Teddy Koker Co-authored-by: Sean Naren --- .../logger_connector/logger_connector.py | 67 +++++++++++++------ tests/base/__init__.py | 2 +- .../test_eval_loop_dict_return.py | 6 +- .../test_eval_loop_logging_1_0.py | 63 ++++++++++++++++- 4 files changed, 113 insertions(+), 25 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 5c699ecffa464..0a1cb836eda6d 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ 
b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -180,11 +180,12 @@ def add_progress_bar_metrics(self, metrics): def on_evaluation_epoch_end(self, deprecated_eval_results, epoch_logs, using_eval_result, test_mode): self._track_callback_metrics(deprecated_eval_results, using_eval_result) - self._log_on_evaluation_epoch_end_metrics(epoch_logs) # TODO: deprecate parts of this for 1.0 (when removing results) self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results, test_mode) + self._log_on_evaluation_epoch_end_metrics(epoch_logs) + # get the final loop results eval_loop_results = self._get_evaluate_epoch_results(test_mode) return eval_loop_results @@ -249,14 +250,15 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): continue reduced_epoch_metrics = dl_metrics[0].__class__.reduce_on_epoch_end(dl_metrics) - # track the metrics logger_metrics = reduced_epoch_metrics.get_epoch_log_metrics() pbar_metrics = reduced_epoch_metrics.get_epoch_pbar_metrics() + forked_metrics = reduced_epoch_metrics.get_forked_metrics() # make the keys 'k/dl' logger_metrics = self.__rename_keys_by_dataloader_idx(logger_metrics, dl_idx, num_loaders) pbar_metrics = self.__rename_keys_by_dataloader_idx(pbar_metrics, dl_idx, num_loaders) + forked_metrics = self.__rename_keys_by_dataloader_idx(forked_metrics, dl_idx, num_loaders) self.logged_metrics.update(logger_metrics) self.add_progress_bar_metrics(pbar_metrics) @@ -266,11 +268,10 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): self.callback_metrics.update(pbar_metrics) # forked metrics were dropped, enable them for callbacks - forked_metrics = reduced_epoch_metrics.get_forked_metrics() self.callback_metrics.update(forked_metrics) # track the final results for the dataloader - self.eval_loop_results.append(deepcopy(self.callback_metrics)) + self.add_to_eval_loop_results(dl_idx, num_loaders) # actually log if len(logger_metrics) > 0: @@ -281,6 +282,22 @@ def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): if len(metrics_to_log) > 0: self.log_metrics(metrics_to_log, {}) + def add_to_eval_loop_results(self, dl_idx, num_loaders): + callback_metrics = deepcopy(self.callback_metrics) + if num_loaders == 1: + if len(self.eval_loop_results) > 0: + self.eval_loop_results[0].update(callback_metrics) + else: + self.eval_loop_results.append(callback_metrics) + return + + for key in list(callback_metrics.keys()): + if "dataloader_idx" in key: + if f"dataloader_idx_{dl_idx}" not in key: + # remove dl_idx from self.callback_metrics not belonging to this dataset. 
+ del callback_metrics[key] + self.eval_loop_results.append(callback_metrics) + def __rename_keys_by_dataloader_idx(self, metrics, dataloader_idx, num_loaders): if num_loaders == 1: return metrics @@ -329,6 +346,25 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) + def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): + # eval loop returns all metrics + dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics} + + # add metrics to prog bar + self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) + + # log metrics + if len(log_metrics) > 0: + self.trainer.logger_connector.log_metrics(log_metrics, {}) + + # track metrics for callbacks (all prog bar, logged and callback metrics) + self.trainer.logger_connector.callback_metrics.update(callback_metrics) + self.trainer.logger_connector.callback_metrics.update(log_metrics) + self.trainer.logger_connector.callback_metrics.update(prog_bar_metrics) + + if len(dataloader_result_metrics) > 0: + self.eval_loop_results.append(dataloader_result_metrics) + def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mode): if self.trainer.running_sanity_check: return @@ -339,6 +375,9 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod if not isinstance(eval_results, list): eval_results = [eval_results] + num_loaders: int = self.trainer.evaluation_loop.num_dataloaders + prog_bar_metrics, log_metrics, callback_metrics = {}, {}, {} + for result_idx, result in enumerate(eval_results): if isinstance(result, EvalResult): prog_bar_metrics = result.epoch_pbar_metrics @@ -351,23 +390,11 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod else: _, prog_bar_metrics, log_metrics, callback_metrics, _ = self.trainer.process_dict_result(result) - # eval loop returns all metrics - dataloader_result_metrics = {**prog_bar_metrics, **log_metrics, **callback_metrics} - - # add metrics to prog bar - self.trainer.logger_connector.add_progress_bar_metrics(prog_bar_metrics) - - # log metrics - if len(log_metrics) > 0: - self.trainer.logger_connector.log_metrics(log_metrics, {}) - - # track metrics for callbacks (all prog bar, logged and callback metrics) - self.trainer.logger_connector.callback_metrics.update(callback_metrics) - self.trainer.logger_connector.callback_metrics.update(log_metrics) - self.trainer.logger_connector.callback_metrics.update(prog_bar_metrics) + if num_loaders > 1: + self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics, callback_metrics) - if len(dataloader_result_metrics) > 0: - self.eval_loop_results.append(dataloader_result_metrics) + if num_loaders == 1: + self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics, callback_metrics) def on_train_epoch_end(self, epoch_output): pass diff --git a/tests/base/__init__.py b/tests/base/__init__.py index a337d443b4384..faefa623dfee7 100644 --- a/tests/base/__init__.py +++ b/tests/base/__init__.py @@ -3,4 +3,4 @@ from tests.base.datasets import TrialMNIST from tests.base.model_template import EvalModelTemplate, GenericEvalModelTemplate from tests.base.simple_model import SimpleModule -from tests.base.boring_model import BoringModel +from tests.base.boring_model import BoringModel, RandomDataset diff --git 
a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py index 47356e4bd684c..31b60e1d0be8b 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py @@ -106,7 +106,7 @@ def test_validation_step_arbitrary_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 2 + assert len(callback_metrics) == 1 assert len(eval_results) == 2 assert eval_results[0]['some'] == 171 assert eval_results[1]['some'] == 171 @@ -143,7 +143,7 @@ def test_validation_step_dict_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 2 + assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 5 assert len(eval_results) == 2 assert eval_results[0]['log']['log_acc1'] == 12 @@ -215,7 +215,7 @@ def test_val_step_step_end(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 2 + assert len(callback_metrics) == 1 assert len(callback_metrics[0]) == 6 callback_metrics = callback_metrics[0] diff --git a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py index bce4a23dda157..0f3217b3f004c 100644 --- a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py @@ -18,7 +18,7 @@ from pytorch_lightning import Trainer from pytorch_lightning import callbacks, seed_everything from tests.base.deterministic_model import DeterministicModel -from tests.base import SimpleModule, BoringModel +from tests.base import SimpleModule, BoringModel, RandomDataset import os import torch import pytest @@ -358,3 +358,64 @@ def test_monitor_val_epoch_end(tmpdir): checkpoint_callback=checkpoint_callback, ) trainer.fit(model) + + +def test_multi_dataloaders_add_suffix_properly(tmpdir): + class TestModel(BoringModel): + + def test_step(self, batch, batch_idx, dataloader_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log("test_loss", loss, on_step=True, on_epoch=True) + return {"y": loss} + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)), + torch.utils.data.DataLoader(RandomDataset(32, 64))] + + model = TestModel() + model.test_epoch_end = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0, + limit_val_batches=0, + limit_test_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + results = trainer.test(model) + assert len(results[0]) == len(results[1]) + assert "test_loss_epoch/dataloader_idx_0" in results[0] + assert "test_loss_epoch/dataloader_idx_1" in results[1] + + +def test_single_dataloader_no_suffix_added(tmpdir): + class TestModel(BoringModel): + + def test_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log("test_loss", loss, on_step=True, on_epoch=True) + return {"y": loss} + + def test_dataloader(self): + return torch.utils.data.DataLoader(RandomDataset(32, 64)) + + model = TestModel() + 
model.test_epoch_end = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=0, + limit_val_batches=0, + limit_test_batches=5, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + results = trainer.test(model) + assert len(results) == 1 + # error: this assertion is wrong here; `y` should equal `test_loss_epoch` + assert results[0]['test_loss'] == results[0]['y'] From 41c6a1307bccdff6825eee90067843e361528423 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 4 Nov 2020 21:56:47 +0100 Subject: [PATCH 56/88] update changelog after 1.0.5 (#4505) * update changelog * update Co-authored-by: chaton Co-authored-by: Sean Naren --- CHANGELOG.md | 64 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 194d50ef37d48..42d78b40b0332 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,87 +9,101 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) - - Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) + - Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) + - Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236)) -- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) - Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) + +- Added optimizer hooks in callbacks ([#4379](https://github.com/PyTorchLightning/pytorch-lightning/pull/4379)) + + +- Added option to log momentum ([#4384](https://github.com/PyTorchLightning/pytorch-lightning/pull/4384)) + + +- Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) + + ### Changed -- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) -- Hook `on_after_backward` is called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) +### Deprecated + + + +### Removed + + + +### Fixed + + + +## [1.0.5] - 2020-11-03 + +### Added + +- Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) +- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) + +### Changed + +- W&B log in sync with `Trainer` step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) +- Hook `on_after_backward` is called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) - Moved `track_and_norm_grad` into `training loop` and called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) +- Changed type checker with explicit cast of `ref_model` object ([#4457](https://github.com/PyTorchLightning/pytorch-lightning/pull/4457)) ### Deprecated - Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument 
([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) -### Removed - ### Fixed - Disable saving checkpoints if not trained ([#4372](https://github.com/PyTorchLightning/pytorch-lightning/pull/4372)) - - Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) - - Disabled training when `limit_train_batches=0` ([#4371](https://github.com/PyTorchLightning/pytorch-lightning/pull/4371)) - - Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) - - Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) - - Fixed TorchScript export when module includes Metrics ([#4428](https://github.com/PyTorchLightning/pytorch-lightning/pull/4428)) +- Fixed CSV logger warning ([#4419](https://github.com/PyTorchLightning/pytorch-lightning/pull/4419)) +- Fixed skip DDP parameter sync ([#4301](https://github.com/PyTorchLightning/pytorch-lightning/pull/4301)) ## [1.0.4] - 2020-10-27 ### Added - Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) - - Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) - - Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) - - Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) - - Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) - - Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) ### Changed - Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) - - Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) - - Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) - - Fixed santized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) ### Deprecated - Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) - - Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) - - Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) ### Fixed - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) - - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) - -- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) +- Fixed `WandbLogger` not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) ## [1.0.3] - 2020-10-20 From 
e81707ba0242f12f47d742e86a982f529a7ae65b Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Thu, 5 Nov 2020 04:38:17 +0630 Subject: [PATCH 57/88] [dockers] use inline cache (#4511) Co-authored-by: Jirka Borovec Co-authored-by: Nicki Skafte Co-authored-by: Sean Naren --- .github/workflows/ci_dockers.yml | 6 +++--- .github/workflows/nightly.yml | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 81800b325e075..ef410a9afd9a8 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -55,7 +55,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-cache-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} file: dockers/base-xla/Dockerfile push: false timeout-minutes: 50 @@ -93,7 +93,7 @@ jobs: PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 @@ -137,7 +137,7 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-conda/Dockerfile push: false timeout-minutes: 50 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 621713beb28a5..b14f40a6c4339 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -63,8 +63,8 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} XLA_VERSION=${{ matrix.xla_version }} - cache-from: pytorchlightning/pytorch_lightning:base-xla-cache-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} - cache-to: pytorchlightning/pytorch_lightning:base-xla-cache-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + cache-from: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} + cache-to: type=inline file: dockers/base-xla/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} @@ -111,8 +111,8 @@ jobs: PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: type=inline file: dockers/base-cuda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ 
matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -127,8 +127,8 @@ jobs: PYTORCH_VERSION=${{ matrix.pytorch_version }} PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} CUDA_VERSION=${{ steps.extend.outputs.CUDA }} - cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - cache-to: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: type=inline file: dockers/base-conda/Dockerfile push: true tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} From 51cc7a89ee9f1c0ad35c147ce37a5825ec6d77e5 Mon Sep 17 00:00:00 2001 From: Travis Addair Date: Thu, 5 Nov 2020 09:52:02 -0800 Subject: [PATCH 58/88] Horovod: fixed early stopping and added metrics aggregation (#3775) * Fixed early stopping for Horovod * Refactored to sync_dist_if_available * Bump min Horovod version to support hvd.is_initialized * Changelog * Added back change for Horovod * Removed redundant checks for initialization * Implement metrics gathering for Horovod * Added test for EvalResult * Renamed ddp_sync_on_step -> dist_sync_on_step * Added metric test for Horovod * Added option pass callable allgather function to metric base class * Added dist_sync_fn * Fixed calls to private _sync_dist * Fixed Horovod test * Added sync_tensor to the distributed backend * Skip Windows * Insert test path * Removed redundant import * Updated drone * Unset HOROVOD_GPU_ALLREDUCE * Unset * No cache dir * No uninstall * Unset variables * Uninstall Horovod during initialization * Replaced more references to ddp_sync_on_step * Fixed imports * Fixed attribute * Added back default * Lint * Added back docstring * Made gather_all_tensors default * Added whitespace * Update tests/models/test_horovod.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/metrics/metric.py Co-authored-by: Jirka Borovec * Update CHANGELOG.md Co-authored-by: Teddy Koker Co-authored-by: Sean Naren Co-authored-by: Jirka Borovec --- CHANGELOG.md | 3 + pytorch_lightning/accelerators/accelerator.py | 24 +++- .../accelerators/ddp_accelerator.py | 11 +- .../accelerators/ddp_cpu_slurm_accelerator.py | 11 +- .../accelerators/ddp_cpu_spawn_accelerator.py | 12 +- .../ddp_cpu_torchelastic_accelerator.py | 11 +- .../accelerators/ddp_slurm_accelerator.py | 12 +- .../accelerators/ddp_spawn_accelerator.py | 11 +- .../ddp_torchelastic_accelerator.py | 11 +- .../accelerators/horovod_accelerator.py | 42 ++++++- pytorch_lightning/core/lightning.py | 3 + pytorch_lightning/core/step_result.py | 6 +- .../metrics/classification/accuracy.py | 5 + pytorch_lightning/metrics/metric.py | 28 +++-- .../metrics/regression/explained_variance.py | 4 +- .../metrics/regression/mean_absolute_error.py | 2 + .../metrics/regression/mean_squared_error.py | 4 +- .../regression/mean_squared_log_error.py | 4 +- pytorch_lightning/utilities/distributed.py | 67 ++++++----- requirements/extra.txt | 3 +- tests/README.md | 2 +- tests/models/test_horovod.py | 113 +++++++++++++++++- 22 files changed, 324 insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42d78b40b0332..1f4defbd5cc30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) +- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) + + ### Changed diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index e69addf234a36..408c979d1ff2e 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -14,7 +14,7 @@ import os import math from enum import Enum -from typing import Any, Optional +from typing import Any, Optional, Union import torch @@ -30,6 +30,12 @@ except ImportError: amp = None +if torch.distributed.is_available(): + from torch.distributed import ReduceOp +else: + class ReduceOp: + SUM = None + EPSILON = 1e-6 EPSILON_FP16 = 1e-5 @@ -209,6 +215,22 @@ def init_ddp_connection( torch_backend, rank=global_rank, world_size=world_size ) + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + Function to reduce a tensor from several distributed processes to one aggregated tensor. + Args: + tensor: the tensor to sync and reduce + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + Return: + reduced value + """ + raise NotImplementedError() + def __getstate__(self): return { 'trainer': self.trainer, diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index b9f01b5ddc167..b127fdd40c934 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -18,17 +18,18 @@ import sys from os.path import abspath from time import sleep -from typing import Optional, List +from typing import Any, Optional, List, Union import numpy as np from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import find_free_network_port from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.seed import seed_everything from torch.nn.parallel import DistributedDataParallel @@ -298,3 +299,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py index 2aad005a07847..c80e8a4ec355c 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py 
@@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -20,10 +20,11 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.distributed.dist import LightningDistributed @@ -199,3 +200,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index f1813361c5eec..64e326b7ee0fc 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -21,11 +21,11 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn -from pytorch_lightning.utilities.distributed import find_free_network_port +from pytorch_lightning.utilities.distributed import find_free_network_port, sync_ddp_if_available from pytorch_lightning.distributed.dist import LightningDistributed try: @@ -229,3 +229,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py index 6b27e7da330ea..a90d7750eaeea 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ 
-20,11 +20,12 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import sync_ddp_if_available try: from hydra.utils import to_absolute_path, get_original_cwd @@ -198,3 +199,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py index 8a6326d3d5cb8..4960445edd27d 100644 --- a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -20,11 +20,11 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available from pytorch_lightning.utilities.seed import seed_everything try: @@ -205,3 +205,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index b204494773362..2e0bac46c4c20 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -13,7 +13,7 @@ # limitations under the License import os import re -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.multiprocessing as mp @@ -22,11 +22,12 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import AMPType from 
pytorch_lightning.utilities.cloud_io import atomic_save, load as pl_load from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, find_free_network_port +from pytorch_lightning.utilities.distributed import sync_ddp_if_available from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning.distributed.dist import LightningDistributed @@ -254,3 +255,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py index 8a9e6ac77e574..e54ad905de80e 100644 --- a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License import os -from typing import List, Optional +from typing import Any, List, Optional, Union import torch import torch.distributed as torch_distrib @@ -20,11 +20,12 @@ from torch.nn.parallel import DistributedDataParallel from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import sync_ddp_if_available try: @@ -201,3 +202,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py index 91a5400999f6e..e5314a983f9db 100644 --- a/pytorch_lightning/accelerators/horovod_accelerator.py +++ b/pytorch_lightning/accelerators/horovod_accelerator.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from contextlib import ExitStack -from typing import Optional +from typing import Any, Optional, Union import torch from torch.optim.lr_scheduler import _LRScheduler -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only @@ -161,3 +161,41 @@ def barrier(self, name: Optional[str] = None): def broadcast(self, obj, src=0): obj = hvd.broadcast_object(obj, src) return obj + + def gather_all_tensors(self, result: Union[torch.Tensor], group: Optional[Any] = None): + if group is not None: + raise ValueError( + "Horovod does not support allgather using a subcommunicator at this time. " + "Unset `group`." + ) + + if len(result.shape) == 0: + # Convert scalars to single dimension tensors + result = result.reshape(1) + + # sync and gather all + hvd.join() + gathered = hvd.allgather(result) + gathered_result = list(gathered.split(1, dim=0)) + return gathered_result + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + if group is not None: + raise ValueError( + "Horovod does not support allreduce using a subcommunicator at this time. " + "Unset `group`." + ) + + if reduce_op is None or reduce_op == "sum": + reduce_op = hvd.Sum + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = hvd.Average + else: + raise ValueError(f"unrecognized `reduce_op`: {reduce_op}") + + # sync all processes before reduction + hvd.join() + return hvd.allreduce(tensor, op=reduce_op) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 78b9c7025711a..05eb8ee86be63 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -258,6 +258,8 @@ def log( raise MisconfigurationException( f"Logged key: {name} should not contain information about dataloader_idx.") + accelerator = self.trainer.accelerator_backend + self._results.log( name, value, @@ -272,6 +274,7 @@ def log( sync_dist, sync_dist_op, sync_dist_group, + accelerator.sync_tensor, self._current_dataloader_idx, ) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 059c724aa75a9..0eca72095e0e0 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -124,15 +124,17 @@ def log( sync_dist: bool = False, sync_dist_op: Union[Any, str] = 'mean', sync_dist_group: Optional[Any] = None, + sync_fn: Callable = None, dataloader_idx: Optional[int] = None, ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): value = value.detach() - # sync across ddp + # sync across workers when using distributed training + sync_fn = sync_fn or sync_ddp_if_available if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): - value = sync_ddp_if_available(value, group=sync_dist_group, reduce_op=sync_dist_op) + value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: self.__setitem__('meta', {}) diff --git a/pytorch_lightning/metrics/classification/accuracy.py b/pytorch_lightning/metrics/classification/accuracy.py index c7255c6e4497e..0f01fb9813407 100644 --- a/pytorch_lightning/metrics/classification/accuracy.py +++ b/pytorch_lightning/metrics/classification/accuracy.py @@ -50,6 +50,9 @@ class Accuracy(Metric): before 
returning the value at the step. default: False process_group: Specify the process group on which synchronization is called. default: None (which selects the entire world) + dist_sync_fn: + Callback that performs the allgather operation on the metric state. When `None`, DDP + will be used to perform the allgather. default: None Example: @@ -67,11 +70,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("correct", default=torch.tensor(0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py index b716817427230..1a568bab37209 100644 --- a/pytorch_lightning/metrics/metric.py +++ b/pytorch_lightning/metrics/metric.py @@ -24,7 +24,7 @@ from torch import nn from pytorch_lightning.utilities.apply_func import apply_to_collection -from pytorch_lightning.utilities.distributed import gather_all_tensors_if_available +from pytorch_lightning.utilities.distributed import gather_all_tensors from pytorch_lightning.metrics.utils import _flatten, dim_zero_cat, dim_zero_mean, dim_zero_sum @@ -53,21 +53,26 @@ class Metric(nn.Module, ABC): Forward only calls ``update()`` and returns None if this is set to False. default: True dist_sync_on_step: Synchronize metric state across processes at each ``forward()`` - before returning the value at the step. default: False + before returning the value at the step. process_group: Specify the process group on which synchronization is called. default: None (which selects the entire world) + dist_sync_fn: + Callback that performs the allgather operation on the metric state. When `None`, DDP + will be used to perform the allgather. 
default: None """ def __init__( self, compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__() self.dist_sync_on_step = dist_sync_on_step self.compute_on_step = compute_on_step self.process_group = process_group + self.dist_sync_fn = dist_sync_fn self._to_sync = True self.update = self._wrap_update(self.update) @@ -174,12 +179,12 @@ def forward(self, *args, **kwargs): return self._forward_cache - def _sync_dist(self): + def _sync_dist(self, dist_sync_fn=gather_all_tensors): input_dict = {attr: getattr(self, attr) for attr in self._reductions.keys()} output_dict = apply_to_collection( input_dict, torch.Tensor, - gather_all_tensors_if_available, + dist_sync_fn, group=self.process_group, ) @@ -208,12 +213,15 @@ def wrapped_func(*args, **kwargs): if self._computed is not None: return self._computed - if ( - self._to_sync - and torch.distributed.is_available() # noqa: W503 - and torch.distributed.is_initialized() # noqa: W503 - ): - self._sync_dist() + dist_sync_fn = self.dist_sync_fn + if (dist_sync_fn is None + and torch.distributed.is_available() + and torch.distributed.is_initialized()): + # User provided a bool, so we assume DDP if available + dist_sync_fn = gather_all_tensors + + if self._to_sync and dist_sync_fn is not None: + self._sync_dist(dist_sync_fn) self._computed = compute(*args, **kwargs) self.reset() diff --git a/pytorch_lightning/metrics/regression/explained_variance.py b/pytorch_lightning/metrics/regression/explained_variance.py index 79fc8b4c4e183..f59ce0b67de62 100644 --- a/pytorch_lightning/metrics/regression/explained_variance.py +++ b/pytorch_lightning/metrics/regression/explained_variance.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch -from typing import Any, Optional +from typing import Any, Callable, Optional from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.utilities import rank_zero_warn @@ -74,11 +74,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) allowed_multioutput = ('raw_values', 'uniform_average', 'variance_weighted') if multioutput not in allowed_multioutput: diff --git a/pytorch_lightning/metrics/regression/mean_absolute_error.py b/pytorch_lightning/metrics/regression/mean_absolute_error.py index 89cb56d431ad4..ba6d2c6d79a08 100644 --- a/pytorch_lightning/metrics/regression/mean_absolute_error.py +++ b/pytorch_lightning/metrics/regression/mean_absolute_error.py @@ -49,11 +49,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("sum_abs_error", default=torch.tensor(0.0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/metrics/regression/mean_squared_error.py b/pytorch_lightning/metrics/regression/mean_squared_error.py index 87c1fddf2674c..6da6d55d5dd1c 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_error.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import torch -from typing import Any, Optional +from typing import Any, Callable, Optional from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.metrics.functional.mean_squared_error import ( @@ -50,11 +50,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("sum_squared_error", default=torch.tensor(0.0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/metrics/regression/mean_squared_log_error.py b/pytorch_lightning/metrics/regression/mean_squared_log_error.py index 256fac20365af..696ad01ca829d 100644 --- a/pytorch_lightning/metrics/regression/mean_squared_log_error.py +++ b/pytorch_lightning/metrics/regression/mean_squared_log_error.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import torch -from typing import Any, Optional +from typing import Any, Callable, Optional from pytorch_lightning.metrics.metric import Metric from pytorch_lightning.metrics.functional.mean_squared_log_error import ( @@ -50,11 +50,13 @@ def __init__( compute_on_step: bool = True, dist_sync_on_step: bool = False, process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, ): super().__init__( compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + dist_sync_fn=dist_sync_fn, ) self.add_state("sum_squared_log_error", default=torch.tensor(0.0), dist_reduce_fx="sum") diff --git a/pytorch_lightning/utilities/distributed.py b/pytorch_lightning/utilities/distributed.py index a29fd3e5a1059..98d322ce0a3a2 100644 --- a/pytorch_lightning/utilities/distributed.py +++ b/pytorch_lightning/utilities/distributed.py @@ -73,7 +73,7 @@ def find_free_network_port() -> int: return port -def gather_all_tensors_if_available(result: Union[torch.Tensor], group: Optional[Any] = None): +def gather_all_tensors(result: Union[torch.Tensor], group: Optional[Any] = None): """ Function to gather all tensors from several ddp processes onto a list that is broadcasted to all processes @@ -85,26 +85,41 @@ def gather_all_tensors_if_available(result: Union[torch.Tensor], group: Optional Return: gathered_result: list with size equal to the process group where gathered_result[i] corresponds to result tensor from process i - """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - if group is None: - group = torch.distributed.group.WORLD + if group is None: + group = torch.distributed.group.WORLD - world_size = torch.distributed.get_world_size(group) + world_size = torch.distributed.get_world_size(group) - gathered_result = [torch.zeros_like(result) for _ in range(world_size)] + gathered_result = [torch.zeros_like(result) for _ in range(world_size)] - # sync and broadcast all - torch.distributed.barrier(group=group) - torch.distributed.all_gather(gathered_result, result, group) + # sync and broadcast all + torch.distributed.barrier(group=group) + torch.distributed.all_gather(gathered_result, result, group) - result = gathered_result - return result + return gathered_result def sync_ddp_if_available( result: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None +) -> torch.Tensor: + """ + Function to reduce a tensor across worker processes during distributed training + Args: + result: the value to sync and reduce (typically tensor or number) + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. 
+ Return: + reduced value + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return sync_ddp(result, group=group, reduce_op=reduce_op) + return result + + +def sync_ddp( + result: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None ) -> torch.Tensor: """ Function to reduce the tensors from several ddp processes to one master process @@ -118,24 +133,22 @@ def sync_ddp_if_available( Return: reduced value """ + divide_by_world_size = False - if torch.distributed.is_available() and torch.distributed.is_initialized(): - divide_by_world_size = False - - if group is None: - group = torch.distributed.group.WORLD + if group is None: + group = torch.distributed.group.WORLD - if reduce_op is None: - reduce_op = torch.distributed.ReduceOp.SUM - elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): - reduce_op = torch.distributed.ReduceOp.SUM - divide_by_world_size = True + if reduce_op is None: + reduce_op = torch.distributed.ReduceOp.SUM + elif isinstance(reduce_op, str) and reduce_op in ("avg", "mean"): + reduce_op = torch.distributed.ReduceOp.SUM + divide_by_world_size = True - # sync all processes before reduction - torch.distributed.barrier(group=group) - torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) + # sync all processes before reduction + torch.distributed.barrier(group=group) + torch.distributed.all_reduce(result, op=reduce_op, group=group, async_op=False) - if divide_by_world_size: - result = result / torch.distributed.get_world_size(group) + if divide_by_world_size: + result = result / torch.distributed.get_world_size(group) return result diff --git a/requirements/extra.txt b/requirements/extra.txt index dbd5f7515109e..be21317a1d826 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -1,8 +1,7 @@ # extended list of package dependencies to reach full functionality matplotlib>=3.1.1 -# no need to install with [pytorch] as pytorch is already installed and torchvision is required only for Horovod examples -horovod>=0.20.1 # v0.20.0 has problem with building the wheel/installation +horovod>=0.20.2 # no need to install with [pytorch] as pytorch is already installed omegaconf>=2.0.0 # scipy>=0.13.3 scikit-learn>=0.22.2 diff --git a/tests/README.md b/tests/README.md index 7fd3c90c0241e..8ef006c4d879a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -30,7 +30,7 @@ To test models that require GPU make sure to run the above command on a GPU mach The GPU machine must have: 1. At least 2 GPUs. 2. [NVIDIA-apex](https://github.com/NVIDIA/apex#linux) installed. -3. [Horovod with NCCL](https://horovod.readthedocs.io/en/stable/gpus_include.html) support: `HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install horovod` +3. 
[Horovod with NCCL](https://horovod.readthedocs.io/en/stable/gpus_include.html) support: `HOROVOD_GPU_OPERATIONS=NCCL pip install horovod` ## Running Coverage diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index d09d9387ea485..d0ae17d8fee5d 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -17,19 +17,25 @@ import shlex import subprocess import sys -from unittest.mock import patch +import numpy as np import pytest import torch +from sklearn.metrics import accuracy_score + import tests.base.develop_pipelines as tpipes import tests.base.develop_utils as tutils from pytorch_lightning import Trainer +from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator +from pytorch_lightning.core.step_result import Result, TrainResult, EvalResult +from pytorch_lightning.metrics.classification.accuracy import Accuracy from pytorch_lightning.utilities import APEX_AVAILABLE, NATIVE_AMP_AVALAIBLE from tests.base import EvalModelTemplate from tests.base.models import BasicGAN try: + import horovod from horovod.common.util import nccl_built except ImportError: HOROVOD_AVAILABLE = False @@ -235,6 +241,111 @@ def get_optimizer_params(optimizer): assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0]) assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1]) + +@pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +def test_result_reduce_horovod(tmpdir): + """Make sure result logging works with Horovod. + + This test mirrors tests/core/test_results.py::_ddp_test_fn + """ + tutils.reset_seed() + tutils.set_random_master_port() + + def hvd_test_fn(): + path_here = os.path.abspath(os.path.dirname(__file__)) + path_root = os.path.abspath(os.path.join(path_here, '..', '..')) + sys.path.insert(0, os.path.abspath(path_root)) + + from tests.base.boring_model import BoringModel + + import horovod.torch as hvd + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + self.training_step_called = True + + tensor = torch.tensor([1.0]) + self.log("test_tensor", tensor, sync_dist=True, sync_dist_op='sum', + on_step=True, on_epoch=True) + + res = self._results + + # Check that `tensor` is summed across all ranks automatically + assert res["test_tensor"].item() == hvd.size(), \ + "Result-Log does not work properly with Horovod and Tensors" + + def training_epoch_end(self, outputs) -> None: + assert len(outputs) == 0 + + model = TestModel() + model.val_dataloader = None + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + ) + + trainer.fit(model) + + horovod.run(hvd_test_fn, np=2) + + +@pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod is unavailable") +@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") +def test_accuracy_metric_horovod(): + num_batches = 10 + batch_size = 16 + threshold = 0.5 + + def sk_metric(preds, target): + sk_preds = (preds.view(-1).numpy() >= threshold).astype(np.uint8) + sk_target = target.view(-1).numpy() + return accuracy_score(y_true=sk_target, y_pred=sk_preds) + + preds = torch.rand(num_batches, batch_size) + target = torch.randint(high=2, size=(num_batches, batch_size)) + + def _compute_batch(): + import horovod.torch as hvd + + trainer = Trainer( + 
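# Editor's note (sketch, not part of the original test): `test_result_reduce_horovod`
# above relies on `self.log(..., sync_dist=True, sync_dist_op='sum')` summing the
# logged tensor across ranks, so a constant 1.0 logged on every worker reads back
# as hvd.size(). The accuracy test below shards batches across workers with:
#
#     for i in range(hvd.rank(), num_batches, hvd.size()):
#         batch_result = metric(preds[i], target[i])  # rank 0: batches 0, 2, ...; rank 1: 1, 3, ...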
fast_dev_run=True,
+ distributed_backend='horovod',
+ )
+
+ accelerator_backend = trainer.accelerator_connector.select_accelerator()
+ assert isinstance(accelerator_backend, HorovodAccelerator)
+
+ metric = Accuracy(compute_on_step=True,
+ dist_sync_on_step=True,
+ dist_sync_fn=accelerator_backend.gather_all_tensors,
+ threshold=threshold)
+
+ for i in range(hvd.rank(), num_batches, hvd.size()):
+ batch_result = metric(preds[i], target[i])
+ if hvd.rank() == 0:
+ dist_preds = torch.stack([preds[i + r] for r in range(hvd.size())])
+ dist_target = torch.stack([target[i + r] for r in range(hvd.size())])
+ sk_batch_result = sk_metric(dist_preds, dist_target)
+ assert np.allclose(batch_result.numpy(), sk_batch_result)
+
+ # check on all batches on all ranks
+ result = metric.compute()
+ assert isinstance(result, torch.Tensor)
+
+ total_preds = torch.stack([preds[i] for i in range(num_batches)])
+ total_target = torch.stack([target[i] for i in range(num_batches)])
+ sk_result = sk_metric(total_preds, total_target)
+
+ assert np.allclose(result.numpy(), sk_result)
+
+ horovod.run(_compute_batch, np=2)
+
# @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
# def test_horovod_multi_optimizer_with_scheduling_stepping(tmpdir):
# hparams = EvalModelTemplate.get_default_hparams()

From 62ea4614f3b6e77ea2a05581796eb64d7780858e Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Thu, 5 Nov 2020 22:05:27 +0100
Subject: [PATCH 59/88] update PR template (#4523)

* update PR template

* Update .github/PULL_REQUEST_TEMPLATE.md

Co-authored-by: Roger Shieh

* Apply suggestions from code review

Co-authored-by: edenlightning <66261195+edenlightning@users.noreply.github.com>

Co-authored-by: chaton
Co-authored-by: Roger Shieh
Co-authored-by: edenlightning <66261195+edenlightning@users.noreply.github.com>
---
 .github/PULL_REQUEST_TEMPLATE.md | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 6e6a6af863d27..78c89cdae7e05 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -4,6 +4,8 @@
 Please include a summary of the change and which issue is fixed.
 Please also include relevant motivation and context.
 List any dependencies that are required for this change.
+
+If we didn't discuss your PR in Github issues there's a high chance it will not be merged.
 -->
 Fixes # (issue)
@@ -20,10 +22,14 @@ Fixes # (issue)

 ## PR review
- - [ ] Is this pull request ready for review? (if not, please submit in draft mode)
+Anyone in the community is free to review the PR once the tests have passed.
+Before you start reviewing make sure you have read [Review guidelines](https://github.com/PyTorchLightning/pytorch-lightning/wiki/Review-guidelines). In short, see the following bullet list:

-Anyone in the community is free to review the PR once the tests have passed.
-If we didn't discuss your PR in Github issues there's a high chance it will not be merged.
+ - [ ] Is this pull request ready for review? (if not, please submit in draft mode)
+ - [ ] Check that all items from **Before submitting** are resolved
+ - [ ] Make sure the title is self-explanatory and the description concisely explains the PR
+ - [ ] Add labels and milestones (and optionally projects) to the PR so it can be classified; _Bugfixes should be included in bug-fix release milestones (m.f.X) and features should be included in (m.X.b) releases._
+

 ## Did you have fun?
Make sure you had fun coding 🙃 From 9c8701f2e2c09a984ac0322942c8f742a7cc6bc9 Mon Sep 17 00:00:00 2001 From: chaton Date: Thu, 5 Nov 2020 22:27:04 +0000 Subject: [PATCH 60/88] [feat] Logging refactor 2/n - train (#4495) * update logging * solve more bugs * replace Mapping by Dict * update on comments * resolve pep8 * Apply suggestions from code review Co-authored-by: ananthsub * Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py Co-authored-by: Jirka Borovec * update on comments * typo * update for coverage * update test * update * Update tests/models/test_hooks.py Co-authored-by: Sean Naren * Update tests/models/test_hooks.py Co-authored-by: Sean Naren * update on comments * remove deepcopy * remove useless look for * another small optim * extra optim * remove lastest optim, can be source of bug * resolve bug * add docstring * optimize coverage * Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py Co-authored-by: Jirka Borovec * Update tests/trainer/logging_tests/test_distributed_logging.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/evaluation_loop.py Co-authored-by: Jirka Borovec * Update tests/trainer/logging/test_logger_connector.py Co-authored-by: Jirka Borovec * Update tests/trainer/logging_tests/test_train_loop_logging_1_0.py Co-authored-by: Jirka Borovec * update on comments * update * update on comments * update parity speed * get it down to 0.65 * update * 0.8 max_dif Co-authored-by: Jirka Borovec Co-authored-by: ananthsub Co-authored-by: Sean Naren Co-authored-by: William Falcon --- benchmarks/test_parity.py | 2 +- .../callback_hook_validator.py | 2 +- .../logger_connector/epoch_result_store.py | 252 ++++++++++++------ .../logger_connector/logger_connector.py | 137 +++++++--- pytorch_lightning/trainer/evaluation_loop.py | 3 + pytorch_lightning/trainer/trainer.py | 24 +- pytorch_lightning/trainer/training_loop.py | 113 +++----- tests/base/develop_utils.py | 2 +- tests/models/test_hooks.py | 8 +- .../test_eval_loop_dict_return.py | 15 ++ .../test_trainer_steps_dict_return.py | 17 +- .../test_trainer_steps_scalar_return.py | 9 +- .../trainer/logging/test_logger_connector.py | 192 +++++++++++-- .../logging_tests/test_distributed_logging.py | 5 +- .../test_train_loop_logging_1_0.py | 209 ++++++++++++++- 15 files changed, 733 insertions(+), 257 deletions(-) diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py index d2b30afb23946..d2bc97deff598 100644 --- a/benchmarks/test_parity.py +++ b/benchmarks/test_parity.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize('cls_model,max_diff', [ (ParityModuleRNN, 0.05), - (ParityModuleMNIST, 0.70) + (ParityModuleMNIST, 0.8) ]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") def test_pytorch_parity(tmpdir, cls_model, max_diff): diff --git a/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py b/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py index 3ce4b523545c3..e9c33cea70b8a 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/callback_hook_validator.py @@ -192,7 +192,7 @@ def _on_validation_start_log(): @staticmethod def _on_validation_end_log(): """Called when the validation loop ends.""" - return {"on_step": [False], "on_epoch": [False, 
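# Editor's note: returning None below (instead of the permission dict) marks
# `on_validation_end` as a hook where `self.log` is not supported; a hypothetical
# check then raises, as exercised by test_call_back_validator further down:
#
#     validator = CallbackHookNameValidator()
#     validator.check_logging_in_callbacks("on_validation_end", on_step=False, on_epoch=True)
#     # -> MisconfigurationException ("... function doesn't support ...")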
True]} + return None @staticmethod def _on_test_start_log(): diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 2a9d68807e694..2980b037c95f7 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -11,12 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from collections import defaultdict -from copy import deepcopy +import os +from collections import defaultdict, ChainMap from enum import Enum -from typing import Union, Tuple, Any, Mapping - +from typing import Union, Tuple, Any, Dict, Optional, List +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.core.step_result import Result @@ -92,73 +91,70 @@ def __init__(self, fx_name): self._internals_reduced = {} self._internal_type = None self.has_reduced = False + self._latest_ref = {} - def get_reduced_metrics(self): - return self._internals_reduced - - def add_dataloader_idx(self): - return len(self._internals) > 1 + @property + def has_several_dataloaders(self) -> bool: + return self.num_dataloaders > 1 @property - def num_dataloaders(self): - return len(self._internals) - - def get_latest_from_dict(self, dl_idx): - num_opt_idx = len(self._internals[dl_idx]) - 1 - assert num_opt_idx >= 0 - num_opt_idx = str(num_opt_idx) - num_batch_idx = len(self._internals[dl_idx][num_opt_idx]) - 1 - batch_indexes = [*self._internals[dl_idx][num_opt_idx].keys()] - # sort them by increasing order - batch_indexes.sort(key=float) - assert num_batch_idx >= 0 - return self._internals[dl_idx][num_opt_idx][batch_indexes[-1]][-1] + def num_dataloaders(self) -> int: + _inter = self._internals_reduced if self.has_reduced else self._internals + return len(_inter) def check_dataloader_idx(self, result: Result) -> bool: - add_dataloader_idx = False - try: - if len(result.keys()) > 1: - random_key = [*result.keys()][-1] - add_dataloader_idx = result["meta"][random_key]["dataloader_idx"] is not None - return add_dataloader_idx - return add_dataloader_idx - except Exception: - return add_dataloader_idx - - def get_lastest_from_func_name(self, func_name, *args, latest=True, **kwargs): + random_key = [*result.keys()][-1] + add_dataloader_idx = result["meta"][random_key]["dataloader_idx"] is not None + return add_dataloader_idx + + def get_lastest_from_func_name(self, latest_result, func_name: str, *args, **kwargs) -> Dict: results = {} - if latest: - for dl_idx in range(self.num_dataloaders): - dl_idx = str(dl_idx) - if self._internal_type == ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP: - latest_result = self._internals[dl_idx][-1] - else: - latest_result = self.get_latest_from_dict(dl_idx) - add_dataloader_idx = self.check_dataloader_idx(latest_result) - func = getattr(latest_result, func_name) - results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) - return results - raise NotImplementedError + add_dataloader_idx = self.check_dataloader_idx(latest_result) + func = getattr(latest_result, func_name) + results.update(func(*args, add_dataloader_idx=add_dataloader_idx, **kwargs)) + return results - def get_batch_pbar_metrics(self, latest=True, *args, **kwargs): - return self.get_lastest_from_func_name("get_batch_pbar_metrics", *args, 
latest=latest, **kwargs)

+ def run_lastest_batch_metrics_with_func_name(self, func_name, *args, **kwargs) -> List[Dict]:
+ """
+ This function uses cached references and results to optimize loading metrics
- def get_batch_log_metrics(self, latest=True, *args, **kwargs):
- return self.get_lastest_from_func_name("get_batch_log_metrics", *args, latest=latest, **kwargs)
+ Context: As we update the logger_connector metrics on every `self.log` call,
+ it can be pretty time consuming, especially when logging outside the batch loop.
+
+ HookResultStore keeps track of its latest added result object,
+ and caches its pbar and log metrics once they have been computed.
+ """
+ results = []
+ for dl_idx in range(self.num_dataloaders):
+ dl_idx = str(dl_idx)
+ latest_result = self._latest_ref[dl_idx]
+ result = self.get_lastest_from_func_name(latest_result, func_name, *args, **kwargs)
+ results.append(result)
+ return results
+
+ def get_batch_pbar_metrics(self, *args, **kwargs):
+ return self.run_lastest_batch_metrics_with_func_name("get_batch_pbar_metrics",
+ *args,
+ **kwargs)
+
+ def get_batch_log_metrics(self, *args, **kwargs):
+ return self.run_lastest_batch_metrics_with_func_name("get_batch_log_metrics",
+ *args,
+ **kwargs)

 def run_epoch_func(self, results, opt_metric, func_name, *args, **kwargs) -> None:
 if isinstance(opt_metric, Result):
 func = getattr(opt_metric, func_name)
 metrics_to_log = func(
 *args,
- add_dataloader_idx=self.add_dataloader_idx,
+ add_dataloader_idx=self.has_several_dataloaders,
 **kwargs)
- results.update(metrics_to_log)
+ results.append(metrics_to_log)
 else:
 raise Exception("The provided opt_metric should be a Result Object. Something is wrong")

- def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> Mapping:
- results = {}
+ def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> List[Dict]:
+ results = []
 for dl_idx in range(self.num_dataloaders):
 dl_idx = str(dl_idx)
 opt_metrics = self._internals_reduced[dl_idx]
@@ -169,13 +165,13 @@ def get_epoch_from_func_name(self, func_name, *args, **kwargs) -> Mapping:
 self.run_epoch_func(results, opt_metrics, func_name, *args, **kwargs)
 return results

- def get_epoch_pbar_metrics(self, *args, **kwargs) -> Mapping:
+ def get_epoch_pbar_metrics(self, *args, **kwargs) -> List[Dict]:
 return self.get_epoch_from_func_name("get_epoch_pbar_metrics")

- def get_epoch_log_metrics(self, *args, **kwargs) -> Mapping:
+ def get_epoch_log_metrics(self, *args, **kwargs) -> List[Dict]:
 return self.get_epoch_from_func_name("get_epoch_log_metrics")

- def get_forked_metrics(self, *args, **kwargs) -> Mapping:
+ def get_forked_metrics(self, *args, **kwargs) -> List[Dict]:
 return self.get_epoch_from_func_name("get_forked_metrics")

 @staticmethod
@@ -211,6 +207,8 @@ def append(self, result, dataloader_idx=None, extra_info: dict = {}) -> None:

 self._append_to_structure(self._internals[primary_key], opt_idx, batch_idx, result)

+ self._latest_ref[primary_key] = result
+
 # [dataloader_idx] is a list
 else:
 self._internal_type = ResultStoreType.OUTSIDE_BATCH_TRAIN_LOOP
@@ -218,6 +216,8 @@ def append(self, result, dataloader_idx=None, extra_info: dict = {}) -> None:
 self._internals[primary_key] = []
 self._internals[primary_key].append(result)

+ self._latest_ref[primary_key] = result
+
 def auto_reduce_results_on_epoch_end(self) -> None:
 """
 This function is called to reduce `self._internals` Result object.
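Editor's aside, a hypothetical standalone sketch of the caching idea introduced above
(none of these names beyond `_latest_ref` come from the patch): keeping a per-dataloader
reference to the most recent Result lets batch-level metrics be re-read in O(1) instead
of walking the nested internals on every `self.log` call.

    _latest_ref = {}

    def remember(result, dl_idx):
        _latest_ref[str(dl_idx)] = result  # refreshed on every appended batch result

    def latest_batch_log_metrics():
        return [r.get_batch_log_metrics() for r in _latest_ref.values()]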
@@ -271,7 +271,7 @@ def auto_reduce_results_on_epoch_end(self) -> None: self._internals_reduced[dl_idx][str(opt_idx)] = opt_outputs # free memory - del self._internals[dl_idx] + del self._internals[dl_idx][opt_idx] else: # no need to reduce as called only once if len(epoch_metrics) == 1: @@ -301,13 +301,9 @@ def __repr__(self): class EpochResultStore: """ This class is defined for internal usage. - It holds all metrics logged using the self.log function using `HookResultStore` object. - The internal datastructure is as follow: - self._internals = {"fx_name_0": HookResultStore(), ..., "fx_name_n": HookResultStore()} - Pseudo Code Example: ``` model._current_fx_name = 'something' @@ -315,7 +311,6 @@ class EpochResultStore: model.log('a', ...) epoch_result_store.cache_result() ``` - """ def __init__(self, trainer, stage): self.trainer = trainer @@ -365,7 +360,7 @@ def current_model_info(self): model_ref = self.trainer.get_model() # extract hook information fx_name = model_ref._current_hook_fx_name - if fx_name == '': + if fx_name is None: fx_name = model_ref._current_fx_name dataloader_idx = model_ref._current_dataloader_idx return fx_name, dataloader_idx @@ -398,7 +393,7 @@ def cache_result(self) -> None: Result.attach_batch_size(self._batch_size, hook_result) self._internals[fx_name].append( - deepcopy(hook_result), + hook_result, dataloader_idx=dataloader_idx, extra_info=extra_info) @@ -456,18 +451,22 @@ def update_logger_connector(self, fx_name: str = None) -> None: logger_connector.callback_metrics.update(callback_metrics) logger_connector.callback_metrics.pop("epoch", None) - def run_batch_from_func_name(self, func_name) -> Mapping: - results = {} + def run_batch_from_func_name(self, func_name) -> Dict: + results = [] for fx_name, hook_result in self._internals.items(): func = getattr(hook_result, func_name) - results.update(func(latest=True, include_forked_originals=False)) - return results + results.append(func(include_forked_originals=False)) + return dict(ChainMap(*sum(results, []))) - def get_latest_batch_log_metrics(self) -> Mapping: - return self.run_batch_from_func_name("get_batch_log_metrics") + def get_latest_batch_log_metrics(self) -> Dict: + batch_log_metrics = self.run_batch_from_func_name("get_batch_log_metrics") + batch_log_metrics.update(self.legacy_batch_log_metrics) + return batch_log_metrics - def get_latest_batch_pbar_metrics(self) -> Mapping: - return self.run_batch_from_func_name("get_batch_pbar_metrics") + def get_latest_batch_pbar_metrics(self) -> Dict: + batch_pbar_metrics = self.run_batch_from_func_name("get_batch_pbar_metrics") + batch_pbar_metrics.update(self.legacy_batch_pbar_metrics) + return batch_pbar_metrics @property def has_reduced(self) -> bool: @@ -495,27 +494,24 @@ def has_batch_loop_finished(self, has_batch_loop_finished): self._has_batch_loop_finished = has_batch_loop_finished self.update_logger_connector() - def run_epoch_by_func_name(self, func_name) -> Mapping: + def run_epoch_by_func_name(self, func_name) -> Dict: if not self.has_reduced: self.auto_reduce_results_on_epoch_end() - results = {} + results = [] for fx_name, hook_result in self._internals.items(): func = getattr(hook_result, func_name) - results.update(func()) - return results + results.append(func()) + return dict(ChainMap(*sum(results, []))) - def get_epoch_pbar_metrics(self) -> Mapping: + def get_epoch_pbar_metrics(self) -> Dict: return self.run_epoch_by_func_name("get_epoch_pbar_metrics") - def get_epoch_log_metrics(self) -> Mapping: + def get_epoch_log_metrics(self) -> Dict: 
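# Editor's note (standalone sketch of the merge idiom used in run_batch_from_func_name
# above): `dict(ChainMap(*sum(results, [])))` flattens a list of per-hook lists of
# dicts and merges them, earlier entries winning on duplicate keys:
#
#     from collections import ChainMap
#     results = [[{'a': 1}], [{'b': 2}, {'a': 0}]]
#     assert dict(ChainMap(*sum(results, []))) == {'a': 1, 'b': 2}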
return self.run_epoch_by_func_name("get_epoch_log_metrics")

- def get_forked_metrics(self) -> Mapping:
+ def get_forked_metrics(self) -> Dict:
 return self.run_epoch_by_func_name("get_forked_metrics")

- def get_reduced_metrics(self) -> Mapping:
- return self.run_epoch_by_func_name("get_reduced_metrics")
-
 def reset(self):
 self._internals = {}
 self._dataloader_idx: Union[int, None] = None
@@ -523,6 +519,96 @@ def reset(self):
 self._opt_idx: Union[int, None] = None
 self._batch_size: Union[int, None] = None
 self._has_batch_loop_finished = False
+ self.legacy_batch_log_metrics = {}
+ self.legacy_batch_pbar_metrics = {}
+
+ def __call__(
+ self,
+ fx_name: Optional[Union[str, int]] = None,
+ dl_idx: Optional[Union[str, int]] = None,
+ opt_idx: Optional[Union[str, int]] = None,
+ batch_idx: Optional[Union[str, int]] = None,
+ split_idx: Optional[Union[str, int]] = None,
+ reduced: bool = False,
+ ):
+ """
+ This function is a helper to access stored data.
+
+ It accesses data from the HookResultStore; please
+ check its data structure for a better understanding.
+
+ Data can be accessed with the following chains:
+
+ IF REDUCED:
+ * IF accessing a fx_name defined in batch training loop:
+ fx_name -> dl_idx -> opt_idx
+ * ELSE fx_name -> dl_idx
+ ELSE:
+ * IF accessing a fx_name defined in batch training loop:
+ fx_name -> dl_idx -> opt_idx -> batch_idx -> split_idx
+ * ELSE fx_name -> dl_idx -> batch_idx
+
+ Note:
+ As soon as a param is None, it breaks the chain and returns the associated stored data.
+
+ Example::
+
+ result: Result = self(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True)
+ result['train_loss_epoch'] # aggregated train_loss over one epoch.
+
+ Args:
+
+ fx_name: Hook name from ModelHooks or Callback. Example: `training_step`
+
+ dl_idx: Dataloader index, from 0 to num_dataloaders - 1
+
+ opt_idx: Optimizer index, from 0 to num_optimizers - 1
+
+ batch_idx: Index of the batch seen during batch training or evaluation.
+ Works only with reduced=False
+
+ split_idx: Index of the split in the training loop when truncated
+ backpropagation through time (tbptt) is used.
+
+ reduced: Data are aggregated at the end of the epoch.
+ Indicates whether to access the aggregated Result or not.
+ """ + + hook_result = self[str(fx_name)] + + dl_idx = str(dl_idx) if dl_idx is not None else None + opt_idx = str(opt_idx) if opt_idx is not None else None + batch_idx = str(batch_idx) if batch_idx is not None else None + split_idx = int(split_idx) if split_idx is not None else None + + internal_type = hook_result._internal_type + + if reduced: + result = hook_result._internals_reduced + else: + result = hook_result._internals + + if internal_type == ResultStoreType.INSIDE_BATCH_TRAIN_LOOP: + if not reduced: + if dl_idx is not None: + result = result[dl_idx] + if opt_idx is not None: + result = result[opt_idx] + if batch_idx is not None: + result = result[batch_idx] + if split_idx is not None: + result = result[split_idx] + else: + if dl_idx is not None: + result = result[dl_idx] + if opt_idx is not None: + result = result[opt_idx] + else: + if dl_idx is not None: + result = result[dl_idx] + if batch_idx and not reduced: + result = result[batch_idx] + + return result def __repr__(self): return f"{self.__class__.__name__}(stage={self._stage}, internals={self._internals})" diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 0a1cb836eda6d..946064660f818 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -44,25 +44,14 @@ def __init__(self, trainer): self._callback_hook_validator = CallbackHookNameValidator() self._current_stage = None - def cached_results(self, stage_or_testing: Union[str, bool]) -> Union[EpochResultStore, None]: - """ Function to access cached_results using str or bool. Bool is used only for testing""" - stage_or_testing = str(stage_or_testing) - stages = self._stages - if stage_or_testing in self._stages: - return self._cached_results[stage_or_testing] - if stage_or_testing in LOOKUP_TABLE: - # Acces using trainer.testing - stage = LOOKUP_TABLE[stage_or_testing] - return self._cached_results[stage] - raise MisconfigurationException( - f"Provide stage_or_testing {stage_or_testing} doesn't belong either to {self._stages}" - f" or {LOOKUP_TABLE.keys()}" - ) + @property + def cached_results(self) -> Union[EpochResultStore, None]: + return self._cached_results[self._current_stage] def set_stage(self, stage_or_testing: str, reset:bool = False) -> None: self._current_stage = self._determine_stage(stage_or_testing) if reset: - self.cached_results(stage_or_testing).reset() + self.cached_results.reset() def check_logging_in_callbacks(self, hook_fx_name, on_step: bool = None, on_epoch: bool = None) -> None: self._callback_hook_validator.check_logging_in_callbacks(current_hook_fx_name=hook_fx_name, @@ -75,17 +64,17 @@ def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataload model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None # track batch_size - self.cached_results(testing)._batch_size = Result.extract_batch_size(batch) + self.cached_results._batch_size = Result.extract_batch_size(batch) - def on_batch_start(self, split_idx: int, opt_idx: int, split_batch) -> None: - self._cached_results["train"]._split_idx = split_idx - self._cached_results["train"]._opt_idx = opt_idx - self._cached_results["train"]._batch_size = Result.extract_batch_size(split_batch) + def on_train_split_start(self, split_idx: int, opt_idx: int, split_batch) -> None: + self.cached_results._split_idx = split_idx + 
self.cached_results._opt_idx = opt_idx
+ self.cached_results._batch_size = Result.extract_batch_size(split_batch)

 def on_train_batch_end(self) -> None:
- self._cached_results["train"]._split_idx = None
- self._cached_results["train"]._opt_idx = None
- self._cached_results["train"]._batch_size = None
+ self.cached_results._split_idx = None
+ self.cached_results._opt_idx = None
+ self.cached_results._batch_size = None

 def _determine_stage(self, stage_or_testing: Union[str, bool]) -> str:
 stage_or_testing = str(stage_or_testing)
@@ -112,6 +101,16 @@ def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
 self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
 self.trainer.log_every_n_steps = log_every_n_steps

+ @property
+ def should_flush_logs(self):
+ should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0
+ return should_flush or self.trainer.should_stop
+
+ @property
+ def should_update_logs(self):
+ should_log_every_n_steps = (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0
+ return should_log_every_n_steps or self.trainer.should_stop
+
 def configure_logger(self, logger):
 if logger is True:
 version = os.environ.get('PL_EXP_VERSION', self.trainer.slurm_job_id)
@@ -130,6 +129,53 @@ def configure_logger(self, logger):
 else:
 self.trainer.logger = logger

+ def cache_training_step_metrics(self, opt_closure_result):
+ """
+ This function is responsible for updating the
+ logger_connector's internal metrics holder used for deprecated logging
+ """
+ using_results_obj = isinstance(opt_closure_result.training_step_output, Result)
+
+ # temporary dict to collect metrics
+ logged_metrics_tmp = {}
+ pbar_metrics_tmp = {}
+ callback_metrics_tmp = {}
+
+ if using_results_obj:
+ batch_log_metrics = opt_closure_result.training_step_output.get_batch_log_metrics(
+ include_forked_originals=False
+ )
+ logged_metrics_tmp.update(batch_log_metrics)
+
+ batch_pbar_metrics = opt_closure_result.training_step_output.get_batch_pbar_metrics(
+ include_forked_originals=False
+ )
+ pbar_metrics_tmp.update(batch_pbar_metrics)
+
+ forked_metrics = opt_closure_result.training_step_output.get_forked_metrics()
+ callback_metrics_tmp.update(forked_metrics)
+ callback_metrics_tmp.update(logged_metrics_tmp)
+
+ else:
+ batch_log_metrics = opt_closure_result.training_step_output.log_metrics
+ logged_metrics_tmp.update(batch_log_metrics)
+
+ callback_metrics = opt_closure_result.training_step_output.callback_metrics
+ callback_metrics_tmp.update(callback_metrics)
+
+ batch_pbar_metrics = opt_closure_result.training_step_output.pbar_on_batch_end
+ pbar_metrics_tmp.update(batch_pbar_metrics)
+
+ # track progress bar metrics
+ if len(pbar_metrics_tmp) > 0:
+ self.add_progress_bar_metrics(pbar_metrics_tmp)
+
+ self.callback_metrics.update(callback_metrics_tmp)
+
+ # save legacy log metrics
+ self.logged_metrics.update(logged_metrics_tmp)
+ self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)
+
 def log_metrics(self, metrics, grad_norm_dic, step=None):
 """Logs the metric dict passed in.
If `step` parameter is None and `step` key is presented is metrics, @@ -396,8 +442,9 @@ def __process_eval_epoch_end_results_and_log_legacy(self, eval_results, test_mod if num_loaders == 1: self.__process_eval_epoch_end_results_and_log_legacy_update(prog_bar_metrics, log_metrics, callback_metrics) - def on_train_epoch_end(self, epoch_output): - pass + def on_train_epoch_end(self): + # inform cached logger connector epoch finished + self.cached_results.has_batch_loop_finished = True def log_train_epoch_end_metrics(self, epoch_output, @@ -441,12 +488,10 @@ def log_train_epoch_end_metrics(self, # ------------------ if is_1_0_result: # lightning module hook - epoch_end_log_result = self.training_epoch_end(model, epoch_output, num_optimizers) + self.training_epoch_end(model, epoch_output, num_optimizers) # log/aggregate metrics automatically epoch_log_metrics, epoch_progress_bar_metrics = self.__auto_reduce_results_on_epoch_end(epoch_output) - epoch_log_metrics.update(epoch_end_log_result.get_epoch_log_metrics()) - epoch_progress_bar_metrics.update(epoch_end_log_result.get_epoch_pbar_metrics()) # TODO: deprecate 1.0 else: @@ -459,6 +504,14 @@ def log_train_epoch_end_metrics(self, ) epoch_log_metrics, epoch_progress_bar_metrics, epoch_callback_metrics = out + # it will perform reduction over epoch and return log metrics + cached_epoch_log_metrics = self.cached_results.get_epoch_log_metrics() + cached_epoch_pbar_metrics = self.cached_results.get_epoch_pbar_metrics() + + # update + epoch_log_metrics.update(cached_epoch_log_metrics) + epoch_progress_bar_metrics.update(cached_epoch_pbar_metrics) + # -------------------------- # track results # -------------------------- @@ -475,15 +528,16 @@ def log_train_epoch_end_metrics(self, self.add_progress_bar_metrics(epoch_progress_bar_metrics) self.callback_metrics.update(epoch_progress_bar_metrics) + # reset epoch loop result for next epoch + self.cached_results.reset() + def training_epoch_end(self, model, epoch_output, num_optimizers): if not is_overridden('training_epoch_end', model=model): - return Result() + return # run training_epoch_end # refresh the result for custom logging at the epoch level model._current_fx_name = 'training_epoch_end' - model._results = Result() - epoch_output = self.__prepare_epoch_end_inputs(epoch_output) if num_optimizers == 1 or not self.trainer.train_loop.automatic_optimization: @@ -492,15 +546,11 @@ def training_epoch_end(self, model, epoch_output, num_optimizers): # lightningmodule hook epoch_output = model.training_epoch_end(epoch_output) - model._current_fx_name = '' - if epoch_output is not None: raise MisconfigurationException('training_epoch_end expects a return of None. 
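# Editor's note (sketch): after this refactor, epoch-level values are logged from
# inside the hook via `self.log` and harvested by `cache_logged_metrics()` rather
# than returned, e.g. (hypothetical LightningModule method):
#
#     def training_epoch_end(self, outputs):
#         epoch_mean = torch.stack([o['loss'] for o in outputs]).mean()
#         self.log('epoch_mean', epoch_mean)  # returning a value here now raises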
' 'HINT: remove the return statement in training_epoch_end') - - # user can ALSO log at the end of an epoch - new_epoch_end_logs = model._results - return new_epoch_end_logs + # capture logging + self.trainer.logger_connector.cache_logged_metrics() def __run_legacy_training_epoch_end( self, @@ -527,8 +577,12 @@ def __run_legacy_training_epoch_end( # run training_epoch_end # a list with a result per optimizer index + model._current_fx_name = 'training_epoch_end' epoch_output = model.training_epoch_end(epoch_output) + # capture logging + self.trainer.logger_connector.cache_logged_metrics() + if isinstance(epoch_output, Result): epoch_log_metrics = epoch_output.epoch_log_metrics epoch_progress_bar_metrics = epoch_output.epoch_pbar_metrics @@ -563,7 +617,7 @@ def __auto_reduce_results_on_epoch_end(self, epoch_output): # reduce across training steps opt_outputs = time_reduced_outputs[0].__class__.reduce_on_epoch_end(time_reduced_outputs) - # with manual opt need 1+ metrics because meta is always there + # with manual opt need 1 + metrics because meta is always there if opt_outputs.minimize is not None: opt_outputs.minimize = opt_outputs.minimize.mean() epoch_log_metrics.update(opt_outputs.epoch_log_metrics) @@ -623,12 +677,9 @@ def __gather_result_across_time_and_optimizers(self, epoch_output): def log_train_step_metrics(self, batch_output): # when metrics should be logged - should_log_metrics = ( - (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0 or self.trainer.should_stop - ) - if should_log_metrics or self.trainer.fast_dev_run: + if self.should_update_logs or self.trainer.fast_dev_run: # logs user requested information to logger - metrics = batch_output.batch_log_metrics + metrics = self.cached_results.get_latest_batch_log_metrics() grad_norm_dic = batch_output.grad_norm_dic if len(metrics) > 0 or len(grad_norm_dic) > 0: self.log_metrics(metrics, grad_norm_dic) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 89a242dbfd886..6ebab1ade0f1d 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -358,6 +358,9 @@ def __log_result_step_metrics(self, output, batch_idx): step_log_metrics = output.get_batch_log_metrics(include_forked_originals=False) step_pbar_metrics = output.get_batch_pbar_metrics(include_forked_originals=False) + cached_batch_log_metrics = \ + self.trainer.logger_connector.cached_results.get_latest_batch_log_metrics() + if len(step_log_metrics) > 0: # make the metrics appear as a different line in the same graph metrics_by_epoch = {} diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 49cf232f76ac7..2d4e2c0d9e4bd 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -838,7 +838,25 @@ def call_setup_hook(self, model): self.setup(stage_name) model.setup(stage_name) + def _reset_result_and_set_hook_fx_name(self, hook_name): + model_ref = self.get_model() + if model_ref is not None: + # used to track current hook name called + model_ref._results = Result() + model_ref._current_hook_fx_name = hook_name + + def _cache_logged_metrics(self): + model_ref = self.get_model() + if model_ref is not None: + # capture logging for this hook + self.logger_connector.cache_logged_metrics() + def call_hook(self, hook_name, *args, **kwargs): + # temporary. 
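# Editor's summary (condensed, assumed flow of the surrounding hunks): for
# train-stage hooks, `call_hook` now brackets every hook invocation:
#
#     model_ref._results = Result()                     # fresh container for self.log
#     model_ref._current_hook_fx_name = hook_name       # track who is logging
#     output = ...callbacks / model hook / accelerator hook...
#     self.logger_connector.cache_logged_metrics()      # harvest logged values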
Don't modify evaluation behaviour + if self.logger_connector._current_stage == "train": + # set hook_name to model + reset Result obj + self._reset_result_and_set_hook_fx_name(hook_name) + # always profile hooks with self.profiler.profile(hook_name): @@ -860,4 +878,8 @@ def call_hook(self, hook_name, *args, **kwargs): accelerator_hook = getattr(self.accelerator_backend, hook_name) output = accelerator_hook(*args, **kwargs) - return output + # temporary. Don't modify evaluation behaviour + if self.logger_connector._current_stage == "train": + # capture logging + self._cache_logged_metrics() + return output diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 3845b7eb728ac..2f66f5b1a600e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -251,12 +251,15 @@ def on_train_epoch_start(self, epoch): self.trainer.call_hook("on_train_epoch_start") def on_train_batch_end(self, epoch_output, epoch_end_outputs, batch, batch_idx, dataloader_idx): + # hook + self.trainer.call_hook('on_batch_end') + self.trainer.call_hook('on_train_batch_end', epoch_end_outputs, batch, batch_idx, dataloader_idx) + # figure out what to track for epoch end self.track_epoch_end_reduce_metrics(epoch_output, epoch_end_outputs) - # hook - self.trainer.call_hook("on_batch_end") - self.trainer.call_hook("on_train_batch_end", epoch_end_outputs, batch, batch_idx, dataloader_idx) + # reset batch logger internals + self.trainer.logger_connector.on_train_batch_end() def reset_train_val_dataloaders(self, model): if not self.trainer.reload_dataloaders_every_epoch: @@ -305,13 +308,16 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging - model = self.trainer.get_model() - model._results = Result() - model._current_fx_name = "training_step" + model_ref = self.trainer.get_model() with self.trainer.profiler.profile("model_forward"): args = self.build_train_args(split_batch, batch_idx, opt_idx, hiddens) + + # manually capture logged metrics + model_ref._current_fx_name = 'training_step' training_step_output = self.trainer.accelerator_backend.training_step(args) + self.trainer.logger_connector.cache_logged_metrics() + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( @@ -484,35 +490,6 @@ def _track_gradient_norm(self): grad_norm_dict = model.grad_norm(self.trainer.track_grad_norm) return grad_norm_dict - def log_training_step_metrics(self, opt_closure_result, batch_callback_metrics, batch_log_metrics): - # track callback metrics - callback_metrics = opt_closure_result.training_step_output.callback_metrics - - # decide which metrics to log (results vs dict return) - using_results_obj = isinstance(opt_closure_result.training_step_output, Result) - if using_results_obj: - metrics_to_log = opt_closure_result.training_step_output.get_batch_log_metrics( - include_forked_originals=False - ) - step_pbar_metrics = opt_closure_result.training_step_output.get_batch_pbar_metrics( - include_forked_originals=False - ) - forked_metrics = opt_closure_result.training_step_output.get_forked_metrics() - callback_metrics.update(forked_metrics) - else: - metrics_to_log = opt_closure_result.training_step_output.log_metrics - step_pbar_metrics = 
opt_closure_result.training_step_output.pbar_on_batch_end - - # track batch log metrics - batch_log_metrics.append(metrics_to_log) - - # track progress bar metrics - if len(step_pbar_metrics) > 0: - self.trainer.logger_connector.add_progress_bar_metrics(step_pbar_metrics) - self.trainer.logger_connector.callback_metrics.update(step_pbar_metrics) - - batch_callback_metrics.append(callback_metrics) - def process_hiddens(self, opt_closure_result): hiddens = opt_closure_result.hiddens if isinstance(opt_closure_result.training_step_output, Result): @@ -578,6 +555,8 @@ def run_training_epoch(self): should_check_val = self.should_check_val_fx(batch_idx, is_last_batch) if should_check_val: self.trainer.run_evaluation(test_mode=False) + # reset stage to train + self.trainer.logger_connector.set_stage("train") # ----------------------------------------- # SAVE LOGGERS (ie: Tensorboard, etc...) @@ -586,7 +565,6 @@ def run_training_epoch(self): # update LR schedulers monitor_metrics = deepcopy(self.trainer.logger_connector.callback_metrics) - monitor_metrics.update(batch_output.batch_log_metrics) self.update_train_loop_lr_schedulers(monitor_metrics=monitor_metrics) self.trainer.checkpoint_connector.has_trained = True @@ -612,19 +590,19 @@ def run_training_epoch(self): # progress global step according to grads progress self.increment_accumulated_grad_global_step() + # epoch end hook + self.run_on_epoch_end_hook(epoch_output) + # log epoch metrics self.trainer.logger_connector.log_train_epoch_end_metrics( - epoch_output, self.checkpoint_accumulator, self.early_stopping_accumulator, self.num_optimizers + epoch_output, + self.checkpoint_accumulator, + self.early_stopping_accumulator, + self.num_optimizers ) - # hook - self.trainer.logger_connector.on_train_epoch_end(epoch_output) - # when no val loop is present or fast-dev-run still need to call checkpoints - self.check_checkpoint_callback(not (should_check_val or is_overridden("validation_step", model))) - - # epoch end hook - self.run_on_epoch_end_hook(epoch_output) + self.check_checkpoint_callback(not (should_check_val or is_overridden('validation_step', model))) # increment the global step once # progress global step according to grads progress @@ -634,12 +612,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): # track grad norms grad_norm_dic = {} - # track all metrics for callbacks - batch_callback_metrics = [] - - # track metrics to log - batch_log_metrics = [] - # bookkeeping using_results_obj = False self.trainer.hiddens = None @@ -683,8 +655,6 @@ def run_training_batch(self, batch, batch_idx, dataloader_idx): self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens) batch_outputs = self._process_closure_result( - batch_callback_metrics=batch_callback_metrics, - batch_log_metrics=batch_log_metrics, batch_outputs=batch_outputs, opt_idx=opt_idx, ) @@ -711,15 +681,18 @@ def train_step_and_backward_closure(): self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure) else: - self._curr_step_result = self.training_step(split_batch, batch_idx, opt_idx, self.trainer.hiddens) + self._curr_step_result = self.training_step( + split_batch, + batch_idx, + opt_idx, + self.trainer.hiddens + ) if self._curr_step_result is None: # user decided to skip optimization continue batch_outputs = self._process_closure_result( - batch_callback_metrics=batch_callback_metrics, - batch_log_metrics=batch_log_metrics, batch_outputs=batch_outputs, opt_idx=opt_idx, ) @@ -737,19 +710,9 @@ def 
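# Editor's note (sketch): in automatic optimization the forward/backward pass is
# wrapped in a closure so optimizers that re-evaluate the loss (e.g. LBFGS) can
# call it repeatedly; conceptually:
#
#     optimizer.step(closure=train_step_and_backward_closure)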
train_step_and_backward_closure(): # update running loss + reset accumulated loss self.update_running_loss() - # collapse all metrics into one dict - batch_log_metrics = {k: v for d in batch_log_metrics for k, v in d.items()} - - # track all metrics for callbacks - self.trainer.logger_connector.callback_metrics.update(batch_log_metrics) - self.trainer.logger_connector.callback_metrics.update( - {k: v for d in batch_callback_metrics for k, v in d.items() if v is not None} - ) - result = AttributeDict( signal=0, grad_norm_dic=grad_norm_dic, - batch_log_metrics=batch_log_metrics, training_step_output_for_epoch_end=batch_outputs, ) return result @@ -762,14 +725,14 @@ def block_ddp_sync_behaviour(self): yield def _process_closure_result( - self, batch_callback_metrics: list, batch_log_metrics: list, batch_outputs: list, opt_idx: int + self, batch_outputs: list, opt_idx: int ) -> list: opt_closure_result = self._curr_step_result if opt_closure_result is not None: - # log metrics - self.log_training_step_metrics(opt_closure_result, batch_callback_metrics, batch_log_metrics) + # cache metrics + self.trainer.logger_connector.cache_training_step_metrics(opt_closure_result) # track hiddens self.trainer.hiddens = self.process_hiddens(opt_closure_result) @@ -842,8 +805,10 @@ def update_train_loop_lr_schedulers(self, monitor_metrics=None): self.trainer.optimizer_connector.update_learning_rates(interval="step", monitor_metrics=monitor_metrics) def run_on_epoch_end_hook(self, epoch_output): - self.trainer.call_hook("on_epoch_end") - self.trainer.call_hook("on_train_epoch_end", epoch_output) + self.trainer.call_hook('on_epoch_end') + self.trainer.call_hook('on_train_epoch_end', epoch_output) + + self.trainer.logger_connector.on_train_epoch_end() def increment_accumulated_grad_global_step(self): num_accumulated_batches_reached = self._accumulated_batches_reached() @@ -898,10 +863,8 @@ def build_train_args(self, batch, batch_idx, opt_idx, hiddens): def save_loggers_on_train_batch_end(self): # when loggers should save to disk - should_save_log = ( - self.trainer.global_step + 1 - ) % self.trainer.flush_logs_every_n_steps == 0 or self.trainer.should_stop - if should_save_log or self.trainer.fast_dev_run: + should_flush_logs = self.trainer.logger_connector.should_flush_logs + if should_flush_logs or self.trainer.fast_dev_run: if self.trainer.is_global_zero and self.trainer.logger is not None: self.trainer.logger.save() @@ -955,7 +918,7 @@ def run_train_split_start(self, split_idx, split_batch, opt_idx, optimizer): model.toggle_optimizer(optimizer, opt_idx) # use to track metrics internally - self.trainer.logger_connector.on_batch_start(split_idx, opt_idx, split_batch) + self.trainer.logger_connector.on_train_split_start(split_idx, opt_idx, split_batch) def update_running_loss(self): accumulated_loss = self.accumulated_loss.mean() diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index ba0d20c2c8389..9c88ba1b7e4d3 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -32,7 +32,7 @@ def assert_speed_parity_relative(pl_times, pt_times, max_diff: float = 0.1): f"lightning {diffs} was slower than PT (threshold {max_diff})" -def assert_speed_parity_absolute(pl_times, pt_times, nb_epochs, max_diff: float = 0.6): +def assert_speed_parity_absolute(pl_times, pt_times, nb_epochs, max_diff: float = 0.55): # assert speeds diffs = np.asarray(pl_times) - np.asarray(pt_times) # norm by vanila time diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 
886e0db4e7854..bccc5262a5bda 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -307,7 +307,7 @@ def on_test_model_train(self): trainer.fit(model) - assert model.called == [ + expected = [ 'on_fit_start', 'on_pretrain_routine_start', 'on_pretrain_routine_end', @@ -341,10 +341,12 @@ def on_test_model_train(self): 'on_fit_end', ] + assert model.called == expected + model2 = HookedModel() trainer.test(model2) - assert model2.called == [ + expected = [ 'on_fit_start', 'on_pretrain_routine_start', 'on_pretrain_routine_end', @@ -356,3 +358,5 @@ def on_test_model_train(self): 'on_test_model_train', 'on_fit_end', ] + + assert model2.called == expected diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py index 31b60e1d0be8b..6329480e10a11 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py @@ -14,6 +14,7 @@ """ Tests to ensure that the training loop works with a dict """ +import os from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel @@ -125,6 +126,9 @@ def test_validation_step_dict_return(tmpdir): Test that val step can return a dict with all the expected keys and they end up in the correct place """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -166,6 +170,8 @@ def test_val_step_step_end_no_return(tmpdir): """ Test that val step + val step end work (with no return in val step end) """ + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -197,6 +203,9 @@ def test_val_step_step_end(tmpdir): """ Test that val step + val step end work """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -241,6 +250,9 @@ def test_no_val_step_end(tmpdir): """ Test that val step + val epoch end """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return @@ -284,6 +296,9 @@ def test_full_val_loop(tmpdir): """ Test that val step + val step end + val epoch end """ + + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_dict_return model.validation_step = model.validation_step_dict_return diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py index 7e8588ce9f6b2..8d1aaf1b3c548 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_dict_return.py @@ -44,9 +44,10 @@ def test_training_step_dict(tmpdir): break out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) + assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 12.0 - assert out.batch_log_metrics['log_acc2'] == 7.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 12.0 + assert 
trainer.logger_connector.logged_metrics['log_acc2'] == 7.0 train_step_out = out.training_step_output_for_epoch_end assert len(train_step_out) == 1 @@ -92,8 +93,8 @@ def training_step_with_step_end(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 14.0 - assert out.batch_log_metrics['log_acc2'] == 9.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 14.0 + assert trainer.logger_connector.logged_metrics['log_acc2'] == 9.0 train_step_end_out = out.training_step_output_for_epoch_end pbar_metrics = train_step_end_out['progress_bar'] @@ -133,8 +134,8 @@ def test_full_training_loop_dict(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 14.0 - assert out.batch_log_metrics['log_acc2'] == 9.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 14.0 + assert trainer.logger_connector.logged_metrics['log_acc2'] == 9.0 # get the output of the first optimizer train_step_end_out = out.training_step_output_for_epoch_end @@ -220,8 +221,8 @@ def test_train_step_epoch_end(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert out.batch_log_metrics['log_acc1'] == 12.0 - assert out.batch_log_metrics['log_acc2'] == 7.0 + assert trainer.logger_connector.logged_metrics['log_acc1'] == 12.0 + assert trainer.logger_connector.logged_metrics['log_acc2'] == 7.0 # outputs are for 1 optimizer and no tbptt train_step_end_out = out.training_step_output_for_epoch_end diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py index b5eae913ca428..2a66f743a49ef 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py @@ -15,6 +15,7 @@ Tests to ensure that the training loop works with a scalar """ import torch +import os from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel @@ -46,7 +47,6 @@ def test_training_step_scalar(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end @@ -84,7 +84,6 @@ def training_step_scalar_with_step_end(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end @@ -104,6 +103,8 @@ def test_full_training_loop_scalar(tmpdir): Checks train_step + training_step_end + training_epoch_end (all with scalar return from train_step) """ + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_scalar_return model.training_step_end = model.training_step_end_scalar @@ -132,7 +133,6 @@ def test_full_training_loop_scalar(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert 
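# Editor's note: these assertion changes track the relocation of batch-level log
# metrics off the `run_training_batch` output and onto the logger connector:
#
#     out.batch_log_metrics['log_acc1']                    # before this patch
#     trainer.logger_connector.logged_metrics['log_acc1']  # after this patch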
len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end @@ -152,6 +152,8 @@ def test_train_step_epoch_end_scalar(tmpdir): Checks train_step + training_epoch_end (NO training_step_end) (with scalar return) """ + os.environ['PL_DEV_DEBUG'] = '0' + model = DeterministicModel() model.training_step = model.training_step_scalar_return model.training_step_end = None @@ -176,7 +178,6 @@ def test_train_step_epoch_end_scalar(tmpdir): out = trainer.train_loop.run_training_batch(batch, batch_idx, 0) assert out.signal == 0 - assert len(out.batch_log_metrics) == 0 and isinstance(out.batch_log_metrics, dict) assert len(out.grad_norm_dic) == 0 and isinstance(out.grad_norm_dic, dict) train_step_out = out.training_step_output_for_epoch_end diff --git a/tests/trainer/logging/test_logger_connector.py b/tests/trainer/logging/test_logger_connector.py index 0f27f2ca4fef4..08936f89eb9f8 100644 --- a/tests/trainer/logging/test_logger_connector.py +++ b/tests/trainer/logging/test_logger_connector.py @@ -17,15 +17,19 @@ import os import torch import pytest - +from copy import deepcopy from pytorch_lightning.trainer import Trainer from pytorch_lightning.core.step_result import Result from pytorch_lightning.trainer.connectors.logger_connector import LoggerConnector +from pytorch_lightning.trainer.connectors.logger_connector.epoch_result_store import EpochResultStore +from pytorch_lightning.trainer.connectors.logger_connector.callback_hook_validator import CallbackHookNameValidator +from pytorch_lightning.callbacks.base import Callback +from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base.boring_model import BoringModel, RandomDataset class Helper: - def decorator_with_arguments(fx_name='', hook_fx_name=''): + def decorator_with_arguments(fx_name='', hook_fx_name=None): def decorator(func): def wrapper(self, *args, **kwargs): # Set information @@ -65,9 +69,9 @@ def training_step(self, batch, batch_idx): self.log("train_loss", loss, on_step=True, on_epoch=True) return {"loss": loss} - def val_dataloader(self): - return [torch.utils.data.DataLoader(RandomDataset(32, 64)), - torch.utils.data.DataLoader(RandomDataset(32, 64))] + def on_train_epoch_end(self, outputs): + # save objects as it will be reset at the end of epoch. 
+ self.train_results = deepcopy(self.trainer.logger_connector.cached_results) model = TestModel() model.val_dataloader = None @@ -82,21 +86,31 @@ def val_dataloader(self): ) trainer.fit(model) - assert len(trainer.logger_connector.cached_results("train")['training_step']['0']['0']) == 2 - assert trainer.logger_connector.cached_results("train")['training_step']['0']['0']['0'][0]["train_loss"] == model.train_losses[0] - assert trainer.logger_connector.cached_results("train")['training_step']['0']['0']['1'][0]["train_loss"] == model.train_losses[1] + train_results = model.train_results - # assert reduction didn't happen yet - assert trainer.logger_connector.cached_results("train").has_reduced is False + assert len(train_results(fx_name="training_step", dl_idx="0", opt_idx="0")) == 2 + generated = train_results(fx_name="training_step", + dl_idx="0", + opt_idx="0", + batch_idx="0", + split_idx="0")["train_loss"] + assert generated == model.train_losses[0] + generated = train_results(fx_name="training_step", + dl_idx="0", + opt_idx="0", + batch_idx="1", + split_idx="0")["train_loss"] + assert generated == model.train_losses[1] - # Launch reduction - trainer.logger_connector.cached_results("train").has_batch_loop_finished = True + assert train_results.has_reduced is not True - # assert reduction did happen - assert trainer.logger_connector.cached_results("train").has_reduced is True + train_results.has_batch_loop_finished = True - assert trainer.logger_connector.cached_results("train")["training_step"]\ - ._internals_reduced["0"]["0"]['train_loss_epoch'].item() == torch.stack(model.train_losses).mean().item() + assert train_results.has_reduced is True + + generated = train_results(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True)['train_loss_epoch'].item() + excepted = torch.stack(model.train_losses).mean().item() + assert generated == excepted def test__logger_connector__epoch_result_store__train__ttbt(tmpdir): @@ -163,6 +177,10 @@ def train_dataloader(self): sampler=None, ) + def on_train_epoch_end(self, outputs): + # save objects as it will be reset at the end of epoch. 
+ self.train_results = deepcopy(self.trainer.logger_connector.cached_results) + model = TestModel() model.training_epoch_end = None model.example_input_array = torch.randn(5, truncated_bptt_steps) @@ -178,19 +196,22 @@ def train_dataloader(self): ) trainer.fit(model) - assert len(trainer.logger_connector.cached_results("train")['training_step']['0']['0']['0']) == len(model.train_losses) + train_results = model.train_results + + generated = train_results(fx_name="training_step", dl_idx="0", opt_idx="0", batch_idx="0") + assert len(generated) == len(model.train_losses) # assert reduction didn't happen yet - assert trainer.logger_connector.cached_results("train").has_reduced is False + assert train_results.has_reduced is False # Launch reduction - trainer.logger_connector.cached_results("train").has_batch_loop_finished = True + train_results.has_batch_loop_finished = True # assert reduction did happen - assert trainer.logger_connector.cached_results("train").has_reduced is True + assert train_results.has_reduced is True - assert trainer.logger_connector.cached_results("train")['training_step']\ - ._internals_reduced['0']['0']["a_epoch"].item() == torch.stack(model.train_losses).mean().item() + generated = train_results(fx_name="training_step", dl_idx="0", opt_idx="0", reduced=True)['a_epoch'].item() + assert generated == torch.stack(model.train_losses).mean().item() @pytest.mark.parametrize('num_dataloaders', [1, 2]) @@ -206,11 +227,11 @@ class TestModel(BoringModel): test_losses = {} @Helper.decorator_with_arguments(fx_name="test_step") - def test_step(self, batch, batch_idx, dataloader_idx=0): + def test_step(self, batch, batch_idx, dl_idx=0): output = self.layer(batch) loss = self.loss(batch, output) - primary_key = str(dataloader_idx) + primary_key = str(dl_idx) if primary_key not in self.test_losses: self.test_losses[primary_key] = [] @@ -239,11 +260,126 @@ def test_dataloader(self): ) trainer.test(model) - assert len(trainer.logger_connector.cached_results("test")["test_step"]._internals) == num_dataloaders + test_results = trainer.logger_connector._cached_results["test"] + + generated = test_results(fx_name="test_step") + assert len(generated) == num_dataloaders + for dl_idx in range(num_dataloaders): - assert len(trainer.logger_connector.cached_results("test")["test_step"]._internals[str(dl_idx)]) == limit_test_batches - trainer.logger_connector.cached_results("test").has_batch_loop_finished = True + generated = len(test_results(fx_name="test_step", dl_idx=str(dl_idx))) + assert generated == limit_test_batches + + test_results.has_batch_loop_finished = True + for dl_idx in range(num_dataloaders): expected = torch.stack(model.test_losses[str(dl_idx)]).mean() - generated = trainer.logger_connector.cached_results("test")["test_step"]._internals_reduced[str(dl_idx)]["test_loss_epoch"] + generated = test_results(fx_name="test_step", dl_idx=str(dl_idx), reduced=True)["test_loss_epoch"] assert abs(expected.item() - generated.item()) < 1e-6 + + +def test_call_back_validator(tmpdir): + + funcs_name = sorted([f for f in dir(Callback) if not f.startswith('_')]) + + callbacks_func = [ + 'on_after_backward', + 'on_batch_end', + 'on_batch_start', + 'on_before_zero_grad', + 'on_epoch_end', + 'on_epoch_start', + 'on_fit_end', + 'on_fit_start', + 'on_init_end', 'on_init_start', + 'on_keyboard_interrupt', + 'on_load_checkpoint', + 'on_pretrain_routine_end', + 'on_pretrain_routine_start', + 'on_sanity_check_end', + 'on_sanity_check_start', + 'on_save_checkpoint', + 'on_test_batch_end', + 
'on_test_batch_start', + 'on_test_end', + 'on_test_epoch_end', + 'on_test_epoch_start', + 'on_test_start', + 'on_train_batch_end', + 'on_train_batch_start', + 'on_train_end', + 'on_train_epoch_end', + 'on_train_epoch_start', + 'on_train_start', + 'on_validation_batch_end', + 'on_validation_batch_start', + 'on_validation_end', + 'on_validation_epoch_end', + 'on_validation_epoch_start', + 'on_validation_start', + 'setup', + 'teardown', + ] + + not_supported = [ + "on_fit_end", + "on_fit_start", + "on_init_end", + "on_init_start", + "on_keyboard_interrupt", + "on_load_checkpoint", + "on_pretrain_routine_end", + "on_pretrain_routine_start", + "on_sanity_check_end", + "on_sanity_check_start", + "on_save_checkpoint", + "on_test_end", + "on_train_end", + "on_validation_end", + "setup", + "teardown", + ] + + assert funcs_name == callbacks_func, """Detected new callback function. + Need to add its logging permission to CallbackHookNameValidator and update this test""" + + validator = CallbackHookNameValidator() + + for func_name in funcs_name: + # This summarizes where and what is currently possible to log using the `self.log` function. + is_stage = "train" in func_name or "test" in func_name or "validation" in func_name + is_start = "start" in func_name or "batch" in func_name + on_step = is_stage and is_start + on_epoch = True + # creating allowed condition + allowed = ( + is_stage + or "batch" in func_name + or "epoch" in func_name + or "grad" in func_name + or "backward" in func_name + ) + allowed = ( + allowed + and "pretrain" not in func_name + and func_name not in ["on_train_end", "on_test_end", "on_validation_end"] + ) + if allowed: + validator.check_logging_in_callbacks(current_hook_fx_name=func_name, + on_step=on_step, + on_epoch=on_epoch) + if not is_start and is_stage: + with pytest.raises(MisconfigurationException, match="function supports only"): + validator.check_logging_in_callbacks(current_hook_fx_name=func_name, + on_step=True, + on_epoch=on_epoch) + else: + assert func_name in not_supported + with pytest.raises(MisconfigurationException, match="function doesn't support"): + validator.check_logging_in_callbacks(current_hook_fx_name=func_name, + on_step=on_step, + on_epoch=on_epoch) + + result = validator.check_logging_in_callbacks(current_hook_fx_name=None, + on_step=None, + on_epoch=None) + assert result is None diff --git a/tests/trainer/logging_tests/test_distributed_logging.py b/tests/trainer/logging_tests/test_distributed_logging.py index 5fdd021dcc0ae..a600317a024c9 100644 --- a/tests/trainer/logging_tests/test_distributed_logging.py +++ b/tests/trainer/logging_tests/test_distributed_logging.py @@ -26,8 +26,9 @@ def on_pretrain_routine_end(self) -> None: with mock.patch('pytorch_lightning.loggers.base.LightningLoggerBase.agg_and_log_metrics') as m: self.trainer.logger_connector.log_metrics({'a': 2}, {}) logged_times = m.call_count - expected = 1 if self.global_rank == 0 else 0 - assert logged_times == expected, 'actual logger called from non-global zero' + expected = int(self.trainer.is_global_zero) + msg = f'actual logger called from non-global zero, logged_times: {logged_times}, expected: {expected}' + assert logged_times == expected, msg @pytest.mark.skipif(platform.system() == "Windows", diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 414264894e639..60ff33b402e4b 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ 
b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -14,15 +14,22 @@ """ Tests to ensure that the training loop works with a dict (1.0) """ -from pytorch_lightning.core.lightning import LightningModule -from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset + import os -import torch +import collections import pytest +import itertools +import numpy as np +import torch +from torch.utils.data import Dataset + +import pytorch_lightning as pl +from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning import Trainer, callbacks + +from tests.base.boring_model import BoringModel, RandomDictDataset, RandomDictStringDataset from tests.base.deterministic_model import DeterministicModel -from torch.utils.data import Dataset def test__training_step__log(tmpdir): @@ -324,12 +331,12 @@ def training_step(self, batch, batch_idx, hiddens): assert y_tensor.shape[1] == truncated_bptt_steps, "tbptt split list failed" pred = self(x_tensor.view(batch_size, truncated_bptt_steps)) - loss_val = torch.nn.functional.mse_loss( + loss = torch.nn.functional.mse_loss( pred, y_tensor.view(batch_size, truncated_bptt_steps)) - self.log('a', loss_val, on_epoch=True) + self.log('a', loss, on_epoch=True) - return {'loss': loss_val, 'hiddens': self.test_hidden} + return {'loss': loss, 'hiddens': self.test_hidden} def on_train_epoch_start(self) -> None: self.test_hidden = None @@ -398,8 +405,10 @@ def val_dataloader(self): generated = set(trainer.logger_connector.logged_metrics) expected = { + 'a_step', 'a_epoch', - 'n_step/epoch_0', 'n_epoch', + 'n_step/epoch_0', + 'n_epoch', 'epoch' } @@ -489,3 +498,187 @@ def validation_step(self, batch, batch_idx): weights_summary=None, ) trainer.fit(model, train_data, val_data) + + +def test_log_works_in_train_callback(tmpdir): + """ + Tests that log can be called within a callback + """ + + os.environ['PL_DEV_DEBUG'] = '1' + + class TestCallback(callbacks.Callback): + + # helpers + count = 1 + choices = [False, True] + # used to compute expected values + callback_funcs_called = collections.defaultdict(list) + funcs_called_count = collections.defaultdict(int) + funcs_attr = {} + + def make_logging(self, pl_module: pl.LightningModule, func_name, func_idx, + on_steps=[], on_epochs=[], prob_bars=[]): + self.funcs_called_count[func_name] += 1 + for idx, (on_step, on_epoch, prog_bar) in enumerate(list(itertools.product(*[on_steps, on_epochs, prob_bars]))): + # run logging + custom_func_name = f"{func_idx}_{idx}_{func_name}" + pl_module.log(custom_func_name, self.count * func_idx, on_step=on_step, + on_epoch=on_epoch, prog_bar=prog_bar) + + # catch information for verification + + # on_train_start is outside the main loop and won't be called during the second epoch + if func_name == "on_train_start": + self.callback_funcs_called[func_name].append([self.count * func_idx]) + + # Save only values from the second epoch, so we can compute their mean or latest value. 
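+ # note: current_epoch is zero-indexed, so epoch 1 is the second (and, with max_epochs=2 below, the final) epoch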
+ if pl_module.trainer.current_epoch == 1: + self.callback_funcs_called[func_name].append([self.count * func_idx]) + + forked = on_step and on_epoch + + self.funcs_attr[custom_func_name] = { + "on_step": on_step, + "on_epoch": on_epoch, + "prog_bar": prog_bar, + "forked": forked, + "func_name": func_name} + + if on_step and on_epoch: + self.funcs_attr[f"{custom_func_name}_step"] = { + "on_step": True, + "on_epoch": False, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + self.funcs_attr[f"{custom_func_name}_epoch"] = { + "on_step": False, + "on_epoch": True, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + def on_train_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_train_start', 1, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_start', 2, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_train_epoch_start', 3, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_batch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_batch_start', 4, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_train_batch_start', 5, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_batch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_batch_end', 6, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_train_batch_end', 7, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + # used to make sure aggregation works fine. 
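+ # self.count is incremented once per training batch, so successive hook calls log increasing values and the epoch-level mean and max stay distinguishable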
+ # we should obtain func([value * c for c in range(1, max_epochs * limit_train_batches)]) + # with func = np.mean if on_epoch else np.max + self.count += 1 + + def on_epoch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_end', 8, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + def on_train_epoch_end(self, trainer, pl_module, outputs): + self.make_logging(pl_module, 'on_train_epoch_end', 9, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + class TestModel(BoringModel): + + manual_loss = [] + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.manual_loss.append(loss) + self.log('train_loss', loss) + return {"loss": loss} + + max_epochs = 2 + limit_train_batches = 2 + model = TestModel() + test_callback = TestCallback() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=0, + limit_test_batches=0, + val_check_interval=0., + num_sanity_val_steps=0, + max_epochs=max_epochs, + callbacks=[test_callback] + ) + trainer.fit(model) + + assert test_callback.funcs_called_count["on_train_start"] == 1 + assert test_callback.funcs_called_count["on_epoch_start"] == 2 + assert test_callback.funcs_called_count["on_train_epoch_start"] == 2 + assert test_callback.funcs_called_count["on_batch_start"] == 4 + assert test_callback.funcs_called_count["on_train_batch_start"] == 4 + assert test_callback.funcs_called_count["on_batch_end"] == 4 + assert test_callback.funcs_called_count["on_train_batch_end"] == 4 + assert test_callback.funcs_called_count["on_epoch_end"] == 2 + assert test_callback.funcs_called_count["on_train_epoch_end"] == 2 + + # Make sure the func_name exists within callback_metrics. 
If not, we missed some + callback_metrics_keys = [*trainer.callback_metrics.keys()] + for func_name in test_callback.callback_funcs_called.keys(): + is_in = False + for callback_metrics_key in callback_metrics_keys: + if func_name in callback_metrics_key: + is_in = True + assert is_in, (func_name, callback_metrics_keys) + + # function used to describe expected return logic + def get_expected_output(func_attr, original_values): + if func_attr["on_epoch"] and not func_attr["on_step"]: + # Apply mean on values + expected_output = np.mean(original_values) + else: + # Keep the latest value + expected_output = np.max(original_values) + return expected_output + + # Make sure the func_name output equals the average from all logged values when on_epoch true + # pop extra keys + trainer.callback_metrics.pop("debug_epoch") + assert trainer.logged_metrics["train_loss"] == model.manual_loss[-1] + assert trainer.callback_metrics["train_loss"] == model.manual_loss[-1] + trainer.callback_metrics.pop("train_loss") + + for func_name, output_value in trainer.callback_metrics.items(): + if torch.is_tensor(output_value): + output_value = output_value.item() + # get creation attr + func_attr = test_callback.funcs_attr[func_name] + + # retrived orginal logged values + original_values = test_callback.callback_funcs_called[func_attr["func_name"]] + + # compute expected output and compare to actual one + expected_output = get_expected_output(func_attr, original_values) + assert float(output_value) == float(expected_output) + + for func_name, func_attr in test_callback.funcs_attr.items(): + if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]: + assert func_name in trainer.logger_connector.progress_bar_metrics + else: + assert func_name not in trainer.logger_connector.progress_bar_metrics From 5e09fd31e9850902fc0a79e92aaba7b80dae1944 Mon Sep 17 00:00:00 2001 From: cool425589 Date: Fri, 6 Nov 2020 08:36:22 +0800 Subject: [PATCH 61/88] show progressbar only on progress_rank 0 on ddp_slurm (#4437) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: chaton Co-authored-by: Adrian Wälchli --- pytorch_lightning/accelerators/ddp_slurm_accelerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py index 4960445edd27d..1ea4461c3c3cc 100644 --- a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py @@ -123,7 +123,7 @@ def ddp_train(self, process_idx, model): self.set_world_ranks(process_idx) # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: + if self.trainer.global_rank != 0 and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # set warning rank From f3dfb984442c1c4b79f95cfb1d83742aa76cc98b Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 6 Nov 2020 17:18:27 +0630 Subject: [PATCH 62/88] [ci] tag v1.4.1 for pypa/gh-action-pypi-publish (#4548) --- .github/workflows/nightly.yml | 2 +- .github/workflows/pypi-release.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index b14f40a6c4339..1395b7ede4b1d 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -29,7 +29,7 @@ jobs: # We do this, since failures on test.pypi aren't 
that bad - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1 + uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.test_pypi_password }} diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index fb99ae2284f76..354f799df20b1 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -30,7 +30,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@v1 + uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -39,7 +39,7 @@ jobs: - name: Publish distribution 📦 to PyPI if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@v1 + uses: pypa/gh-action-pypi-publish@v1.4.1 with: user: __token__ password: ${{ secrets.pypi_password }} From 6e5f232f5cec2b5e635ae34fa365c6b969d0902e Mon Sep 17 00:00:00 2001 From: Indrayana Rustandi <468296+irustandi@users.noreply.github.com> Date: Fri, 6 Nov 2020 09:53:46 -0500 Subject: [PATCH 63/88] Add Dali MNIST example (#3721) * add MNIST DALI example, update README.md * Fix PEP8 warnings * reformatted using black * add mnist_dali to test_examples.py * Add documentation as docstrings * add nvidia-pyindex and nvidia-dali-cuda100 * replace nvidia-pyindex with --extra-index-url * mark mnist_dali test as Linux and GPU only * adjust CUDA docker and examples.txt, fix import error in test_examples.py * adjust the GPU check * Exit when DALI is not available * remove requirements-examples.txt and DALI pip install * Refactored example, moved to new logging api, added runtime check for test and dali script * Patch to reflect the mnist example module * add req. 
* Apply suggestions from code review * Removed requirement as it breaks CPU install, added note in README to install DALI * add DALI to Drone * test examples * Apply suggestions from code review * imports * ABC * cuda * cuda * pip DALI * Move build into init function Co-authored-by: SeanNaren Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- .drone.yml | 2 + pl_examples/basic_examples/README.md | 10 +- pl_examples/basic_examples/mnist_dali.py | 204 +++++++++++++++++++++++ pl_examples/test_examples.py | 41 ++++- requirements/examples.txt | 2 +- 5 files changed, 249 insertions(+), 10 deletions(-) create mode 100644 pl_examples/basic_examples/mnist_dali.py diff --git a/.drone.yml b/.drone.yml index 5e6c08f7a8256..9774ffaaaecc7 100644 --- a/.drone.yml +++ b/.drone.yml @@ -32,6 +32,8 @@ steps: - pip --version - nvidia-smi - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir + # when Image has defined CUDa version we can switch to this package spec "nvidia-dali-cuda${CUDA_VERSION%%.*}0" + - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda100 --upgrade-strategy only-if-needed - pip list - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --color=yes --durations=25 # --flake8 - python -m pytest benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 diff --git a/pl_examples/basic_examples/README.md b/pl_examples/basic_examples/README.md index 4dcf06a74bf92..18ae204396290 100644 --- a/pl_examples/basic_examples/README.md +++ b/pl_examples/basic_examples/README.md @@ -14,7 +14,15 @@ python mnist.py python mnist.py --gpus 2 --distributed_backend 'dp' ``` ---- +--- +#### MNIST with DALI +The MNIST example above using [NVIDIA DALI](https://developer.nvidia.com/DALI). +Requires NVIDIA DALI to be installed based on your CUDA version, see [here](https://docs.nvidia.com/deeplearning/dali/user-guide/docs/installation.html). +```bash +python mnist_dali.py +``` + +--- #### Image classifier Generic image classifier with an arbitrary backbone (ie: a simple system) ```bash diff --git a/pl_examples/basic_examples/mnist_dali.py b/pl_examples/basic_examples/mnist_dali.py new file mode 100644 index 0000000000000..649198053a01b --- /dev/null +++ b/pl_examples/basic_examples/mnist_dali.py @@ -0,0 +1,204 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
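+# This example feeds torchvision's MNIST dataset through a DALI ExternalSource pipeline into a small LightningModule classifier. +# It needs a GPU and an NVIDIA DALI build matching your CUDA version (see the README above for install notes). +# If the DALI import fails below, a warning is emitted and the DALI symbols are stubbed out so the module still imports.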
+from abc import ABC +from argparse import ArgumentParser +from random import shuffle +from warnings import warn + +import numpy as np +import torch +from torch.nn import functional as F +from torch.utils.data import random_split + +import pytorch_lightning as pl + +try: + from torchvision.datasets.mnist import MNIST + from torchvision import transforms +except Exception: + from tests.base.datasets import MNIST + +try: + import nvidia.dali.ops as ops + import nvidia.dali.types as types + from nvidia.dali.pipeline import Pipeline + from nvidia.dali.plugin.pytorch import DALIClassificationIterator +except (ImportError, ModuleNotFoundError): + warn('NVIDIA DALI is not available') + ops, types, Pipeline, DALIClassificationIterator = ..., ..., ABC, ABC + + +class ExternalMNISTInputIterator(object): + """ + This iterator class wraps torchvision's MNIST dataset and returns the images and labels in batches + """ + + def __init__(self, mnist_ds, batch_size): + self.batch_size = batch_size + self.mnist_ds = mnist_ds + self.indices = list(range(len(self.mnist_ds))) + shuffle(self.indices) + + def __iter__(self): + self.i = 0 + self.n = len(self.mnist_ds) + return self + + def __next__(self): + batch = [] + labels = [] + for _ in range(self.batch_size): + index = self.indices[self.i] + img, label = self.mnist_ds[index] + batch.append(img.numpy()) + labels.append(np.array([label], dtype=np.uint8)) + self.i = (self.i + 1) % self.n + return (batch, labels) + + +class ExternalSourcePipeline(Pipeline): + """ + This DALI pipeline class just contains the MNIST iterator + """ + + def __init__(self, batch_size, eii, num_threads, device_id): + super(ExternalSourcePipeline, self).__init__(batch_size, num_threads, device_id, seed=12) + self.source = ops.ExternalSource(source=eii, num_outputs=2) + self.build() + + def define_graph(self): + images, labels = self.source() + return images, labels + + +class DALIClassificationLoader(DALIClassificationIterator): + """ + This class extends DALI's original DALIClassificationIterator with the __len__() function so that we can call len() on it + """ + + def __init__( + self, + pipelines, + size=-1, + reader_name=None, + auto_reset=False, + fill_last_batch=True, + dynamic_shape=False, + last_batch_padded=False, + ): + super().__init__(pipelines, size, reader_name, auto_reset, fill_last_batch, dynamic_shape, last_batch_padded) + + def __len__(self): + batch_count = self._size // (self._num_gpus * self.batch_size) + last_batch = 1 if self._fill_last_batch else 0 + return batch_count + last_batch + + +class LitClassifier(pl.LightningModule): + def __init__(self, hidden_dim=128, learning_rate=1e-3): + super().__init__() + self.save_hyperparameters() + + self.l1 = torch.nn.Linear(28 * 28, self.hparams.hidden_dim) + self.l2 = torch.nn.Linear(self.hparams.hidden_dim, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + x = torch.relu(self.l1(x)) + x = torch.relu(self.l2(x)) + return x + + def split_batch(self, batch): + return batch[0]["data"], batch[0]["label"].squeeze().long() + + def training_step(self, batch, batch_idx): + x, y = self.split_batch(batch) + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = self.split_batch(batch) + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + self.log('valid_loss', loss) + + def test_step(self, batch, batch_idx): + x, y = self.split_batch(batch) + y_hat = self(x) + loss = F.cross_entropy(y_hat, y) + self.log('test_loss', loss) + + def 
configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) + + @staticmethod + def add_model_specific_args(parent_parser): + parser = ArgumentParser(parents=[parent_parser], add_help=False) + parser.add_argument('--hidden_dim', type=int, default=128) + parser.add_argument('--learning_rate', type=float, default=0.0001) + return parser + + +def cli_main(): + pl.seed_everything(1234) + + # ------------ + # args + # ------------ + parser = ArgumentParser() + parser.add_argument('--batch_size', default=32, type=int) + parser = pl.Trainer.add_argparse_args(parser) + parser = LitClassifier.add_model_specific_args(parser) + args = parser.parse_args() + + # ------------ + # data + # ------------ + dataset = MNIST('', train=True, download=True, transform=transforms.ToTensor()) + mnist_test = MNIST('', train=False, download=True, transform=transforms.ToTensor()) + mnist_train, mnist_val = random_split(dataset, [55000, 5000]) + + eii_train = ExternalMNISTInputIterator(mnist_train, args.batch_size) + eii_val = ExternalMNISTInputIterator(mnist_val, args.batch_size) + eii_test = ExternalMNISTInputIterator(mnist_test, args.batch_size) + + pipe_train = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_train, num_threads=2, device_id=0) + train_loader = DALIClassificationLoader(pipe_train, size=len(mnist_train), auto_reset=True, fill_last_batch=False) + + pipe_val = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_val, num_threads=2, device_id=0) + val_loader = DALIClassificationLoader(pipe_val, size=len(mnist_val), auto_reset=True, fill_last_batch=False) + + pipe_test = ExternalSourcePipeline(batch_size=args.batch_size, eii=eii_test, num_threads=2, device_id=0) + test_loader = DALIClassificationLoader(pipe_test, size=len(mnist_test), auto_reset=True, fill_last_batch=False) + + # ------------ + # model + # ------------ + model = LitClassifier(args.hidden_dim, args.learning_rate) + + # ------------ + # training + # ------------ + trainer = pl.Trainer.from_argparse_args(args) + trainer.fit(model, train_loader, val_loader) + + # ------------ + # testing + # ------------ + trainer.test(test_dataloaders=test_loader) + + +if __name__ == "__main__": + cli_main() diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py index 7fe5d4ed604dc..60f10a637e583 100644 --- a/pl_examples/test_examples.py +++ b/pl_examples/test_examples.py @@ -1,6 +1,15 @@ +import platform from unittest import mock -import torch + import pytest +import torch + +try: + from nvidia.dali import ops, types, pipeline, plugin +except (ImportError, ModuleNotFoundError): + DALI_AVAILABLE = False +else: + DALI_AVAILABLE = True dp_16_args = """ --max_epochs 1 \ @@ -28,7 +37,7 @@ --precision 16 \ """ - +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [dp_16_args]) # def test_examples_dp_mnist(cli_args): @@ -38,6 +47,7 @@ # cli_main() +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [dp_16_args]) # def test_examples_dp_image_classifier(cli_args): @@ -45,8 +55,9 @@ # # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): # cli_main() -# -# + + +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [dp_16_args]) # def test_examples_dp_autoencoder(cli_args): @@ -56,6 +67,7 @@ # 
cli_main() +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [ddp_args]) # def test_examples_ddp_mnist(cli_args): @@ -63,8 +75,9 @@ # # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): # cli_main() -# -# + + +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [ddp_args]) # def test_examples_ddp_image_classifier(cli_args): @@ -72,8 +85,9 @@ # # with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): # cli_main() -# -# + + +# TODO # @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") # @pytest.mark.parametrize('cli_args', [ddp_args]) # def test_examples_ddp_autoencoder(cli_args): @@ -92,3 +106,14 @@ def test_examples_cpu(cli_args): for cli_cmd in [mnist_cli, ic_cli, ae_cli]: with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): cli_cmd() + + +@pytest.mark.skipif(not DALI_AVAILABLE, reason="Nvidia DALI required") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(platform.system() != 'Linux', reason='Only applies to Linux platform.') +@pytest.mark.parametrize('cli_args', [cpu_args]) +def test_examples_mnist_dali(cli_args): + from pl_examples.basic_examples.mnist_dali import cli_main + + with mock.patch("argparse._sys.argv", ["any.py"] + cli_args.strip().split()): + cli_main() diff --git a/requirements/examples.txt b/requirements/examples.txt index e930579b8b369..0afa62f9ffa95 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,2 +1,2 @@ torchvision>=0.4.1,<0.9.0 -gym>=0.17.0 \ No newline at end of file +gym>=0.17.0 From 854c13673ba6646ab3638dbc46625c90151ee96d Mon Sep 17 00:00:00 2001 From: chaton Date: Sat, 7 Nov 2020 12:05:29 +0000 Subject: [PATCH 64/88] add congratulations at the end of our notebooks (#4555) * add congratulations at the end of our notebooks * udpate image --- notebooks/01-mnist-hello-world.ipynb | 842 +-- notebooks/02-datamodules.ipynb | 1122 ++-- notebooks/03-basic-gan.ipynb | 890 +-- .../04-transformers-text-classification.ipynb | 1130 ++-- notebooks/05-trainer-flags-overview.ipynb | 5786 +++++++++-------- 5 files changed, 5005 insertions(+), 4765 deletions(-) diff --git a/notebooks/01-mnist-hello-world.ipynb b/notebooks/01-mnist-hello-world.ipynb index 79bc9ebec9632..b0323458c228b 100644 --- a/notebooks/01-mnist-hello-world.ipynb +++ b/notebooks/01-mnist-hello-world.ipynb @@ -1,400 +1,448 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "01-mnist-hello-world.ipynb", - "provenance": [], - "collapsed_sections": [], - "authorship_tag": "ABX9TyOtAKVa5POQ6Xg3UcTQqXDJ", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i7XbLCXGkll9", - "colab_type": "text" - }, - "source": [ - "# Introduction to Pytorch Lightning ⚡\n", - "\n", - "In this notebook, we'll go over the basics of lightning by preparing models to train on the [MNIST Handwritten Digits 
dataset](https://en.wikipedia.org/wiki/MNIST_database).\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2LODD6w9ixlT", - "colab_type": "text" - }, - "source": [ - "### Setup \n", - "Lightning is easy to install. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zK7-Gg69kMnG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! pip install pytorch-lightning --quiet" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "w4_TYnt_keJi", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import os\n", - "\n", - "import torch\n", - "from torch import nn\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader, random_split\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EHpyMPKFkVbZ", - "colab_type": "text" - }, - "source": [ - "## Simplest example\n", - "\n", - "Here's the simplest most minimal example with just a training loop (no validation, no testing).\n", - "\n", - "**Keep in Mind** - A `LightningModule` *is* a PyTorch `nn.Module` - it just has a few more helpful features." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V7ELesz1kVQo", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class MNISTModel(pl.LightningModule):\n", - "\n", - " def __init__(self):\n", - " super(MNISTModel, self).__init__()\n", - " self.l1 = torch.nn.Linear(28 * 28, 10)\n", - "\n", - " def forward(self, x):\n", - " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", - "\n", - " def training_step(self, batch, batch_nb):\n", - " x, y = batch\n", - " loss = F.cross_entropy(self(x), y)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " return torch.optim.Adam(self.parameters(), lr=0.02)" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hIrtHg-Dv8TJ", - "colab_type": "text" - }, - "source": [ - "By using the `Trainer` you automatically get:\n", - "1. Tensorboard logging\n", - "2. Model checkpointing\n", - "3. Training and validation loop\n", - "4. 
early-stopping" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4Dk6Ykv8lI7X", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Init our model\n", - "mnist_model = MNISTModel()\n", - "\n", - "# Init DataLoader from MNIST Dataset\n", - "train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", - "train_loader = DataLoader(train_ds, batch_size=32)\n", - "\n", - "# Initialize a trainer\n", - "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", - "\n", - "# Train the model ⚡\n", - "trainer.fit(mnist_model, train_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KNpOoBeIjscS", - "colab_type": "text" - }, - "source": [ - "## A more complete MNIST Lightning Module Example\n", - "\n", - "That wasn't so hard was it?\n", - "\n", - "Now that we've got our feet wet, let's dive in a bit deeper and write a more complete `LightningModule` for MNIST...\n", - "\n", - "This time, we'll bake in all the dataset specific pieces directly in the `LightningModule`. This way, we can avoid writing extra code at the beginning of our script every time we want to run it.\n", - "\n", - "---\n", - "\n", - "### Note what the following built-in functions are doing:\n", - "\n", - "1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.lightning.html#pytorch_lightning.core.lightning.LightningModule.prepare_data) 💾\n", - " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", - " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", - "\n", - "2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#setup) ⚙️\n", - " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", - " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", - " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals).\n", - " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", - "\n", - "3. 
[x_dataloader()](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#data-hooks) ♻️\n", - " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4DNItffri95Q", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class LitMNIST(pl.LightningModule):\n", - " \n", - " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # Set our init args as class attributes\n", - " self.data_dir = data_dir\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " # Hardcode some dataset specific attributes\n", - " self.num_classes = 10\n", - " self.dims = (1, 28, 28)\n", - " channels, width, height = self.dims\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # Define PyTorch model\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, self.num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - "\n", - " # Calling self.log will surface up scalars for you in TensorBoard\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def test_step(self, batch, batch_idx):\n", - " # Here we just reuse the validation_step for testing\n", - " return self.validation_step(batch, batch_idx)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer\n", - "\n", - " ####################\n", - " # DATA RELATED HOOKS\n", - " ####################\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - "execution_count": 5, - 
"outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Mb0U5Rk2kLBy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = LitMNIST()\n", - "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nht8AvMptY6I", - "colab_type": "text" - }, - "source": [ - "### Testing\n", - "\n", - "To test a model, call `trainer.test(model)`.\n", - "\n", - "Or, if you've just trained a model, you can just call `trainer.test()` and Lightning will automatically test using the best saved checkpoint (conditioned on val_loss)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PA151FkLtprO", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer.test()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T3-3lbbNtr5T", - "colab_type": "text" - }, - "source": [ - "### Bonus Tip\n", - "\n", - "You can keep calling `trainer.fit(model)` as many times as you'd like to continue training" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IFBwCbLet2r6", - "colab_type": "code", - "colab": {} - }, - "source": [ - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8TRyS5CCt3n9", - "colab_type": "text" - }, - "source": [ - "In Colab, you can use the TensorBoard magic function to view the logs that Lightning has created for you!" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wizS-QiLuAYo", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "i7XbLCXGkll9" + }, + "source": [ + "# Introduction to Pytorch Lightning ⚡\n", + "\n", + "In this notebook, we'll go over the basics of lightning by preparing models to train on the [MNIST Handwritten Digits dataset](https://en.wikipedia.org/wiki/MNIST_database).\n", + "\n", + "---\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2LODD6w9ixlT" + }, + "source": [ + "### Setup \n", + "Lightning is easy to install. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zK7-Gg69kMnG" + }, + "outputs": [], + "source": [ + "! 
pip install pytorch-lightning --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "w4_TYnt_keJi" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "from torch.utils.data import DataLoader, random_split\n", + "from torchvision.datasets import MNIST\n", + "from torchvision import transforms\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EHpyMPKFkVbZ" + }, + "source": [ + "## Simplest example\n", + "\n", + "Here's the simplest most minimal example with just a training loop (no validation, no testing).\n", + "\n", + "**Keep in Mind** - A `LightningModule` *is* a PyTorch `nn.Module` - it just has a few more helpful features." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "V7ELesz1kVQo" + }, + "outputs": [], + "source": [ + "class MNISTModel(pl.LightningModule):\n", + "\n", + " def __init__(self):\n", + " super(MNISTModel, self).__init__()\n", + " self.l1 = torch.nn.Linear(28 * 28, 10)\n", + "\n", + " def forward(self, x):\n", + " return torch.relu(self.l1(x.view(x.size(0), -1)))\n", + "\n", + " def training_step(self, batch, batch_nb):\n", + " x, y = batch\n", + " loss = F.cross_entropy(self(x), y)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return torch.optim.Adam(self.parameters(), lr=0.02)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hIrtHg-Dv8TJ" + }, + "source": [ + "By using the `Trainer` you automatically get:\n", + "1. Tensorboard logging\n", + "2. Model checkpointing\n", + "3. Training and validation loop\n", + "4. early-stopping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4Dk6Ykv8lI7X" + }, + "outputs": [], + "source": [ + "# Init our model\n", + "mnist_model = MNISTModel()\n", + "\n", + "# Init DataLoader from MNIST Dataset\n", + "train_ds = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", + "train_loader = DataLoader(train_ds, batch_size=32)\n", + "\n", + "# Initialize a trainer\n", + "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", + "\n", + "# Train the model ⚡\n", + "trainer.fit(mnist_model, train_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KNpOoBeIjscS" + }, + "source": [ + "## A more complete MNIST Lightning Module Example\n", + "\n", + "That wasn't so hard was it?\n", + "\n", + "Now that we've got our feet wet, let's dive in a bit deeper and write a more complete `LightningModule` for MNIST...\n", + "\n", + "This time, we'll bake in all the dataset specific pieces directly in the `LightningModule`. This way, we can avoid writing extra code at the beginning of our script every time we want to run it.\n", + "\n", + "---\n", + "\n", + "### Note what the following built-in functions are doing:\n", + "\n", + "1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.lightning.html#pytorch_lightning.core.lightning.LightningModule.prepare_data) 💾\n", + " - This is where we can download the dataset. 
We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", + " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", + "\n", + "2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#setup) ⚙️\n", + " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", + " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", + " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals).\n", + " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", + "\n", + "3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/latest/lightning-module.html#data-hooks) ♻️\n", + " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4DNItffri95Q" + }, + "outputs": [], + "source": [ + "class LitMNIST(pl.LightningModule):\n", + " \n", + " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " # Set our init args as class attributes\n", + " self.data_dir = data_dir\n", + " self.hidden_size = hidden_size\n", + " self.learning_rate = learning_rate\n", + "\n", + " # Hardcode some dataset specific attributes\n", + " self.num_classes = 10\n", + " self.dims = (1, 28, 28)\n", + " channels, width, height = self.dims\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # Define PyTorch model\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, self.num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + "\n", + " # Calling self.log will surface up scalars for you in TensorBoard\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def test_step(self, batch, batch_idx):\n", + " # Here we just reuse the validation_step for testing\n", + " return self.validation_step(batch, batch_idx)\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + " return optimizer\n", + "\n", + " ####################\n", + " # DATA RELATED HOOKS\n", + " ####################\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " 
MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Mb0U5Rk2kLBy" + }, + "outputs": [], + "source": [ + "model = LitMNIST()\n", + "trainer = pl.Trainer(gpus=1, max_epochs=3, progress_bar_refresh_rate=20)\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "nht8AvMptY6I" + }, + "source": [ + "### Testing\n", + "\n", + "To test a model, call `trainer.test(model)`.\n", + "\n", + "Or, if you've just trained a model, you can just call `trainer.test()` and Lightning will automatically test using the best saved checkpoint (conditioned on val_loss)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PA151FkLtprO" + }, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "T3-3lbbNtr5T" + }, + "source": [ + "### Bonus Tip\n", + "\n", + "You can keep calling `trainer.fit(model)` as many times as you'd like to continue training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IFBwCbLet2r6" + }, + "outputs": [], + "source": [ + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8TRyS5CCt3n9" + }, + "source": [ + "In Colab, you can use the TensorBoard magic function to view the logs that Lightning has created for you!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wizS-QiLuAYo" + }, + "outputs": [], + "source": [ + "# Start tensorboard.\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir lightning_logs/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", + "\n", + "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions !\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "authorship_tag": "ABX9TyOtAKVa5POQ6Xg3UcTQqXDJ", + "collapsed_sections": [], + "include_colab_link": true, + "name": "01-mnist-hello-world.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/02-datamodules.ipynb b/notebooks/02-datamodules.ipynb index 3e027cd304c77..599cb1d6bd289 100644 --- a/notebooks/02-datamodules.ipynb +++ b/notebooks/02-datamodules.ipynb @@ -1,540 +1,588 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "02-datamodules.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true, - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": 
"markdown", - "metadata": { - "id": "2O5r7QvP8-rt", - "colab_type": "text" - }, - "source": [ - "# PyTorch Lightning DataModules ⚡\n", - "\n", - "With the release of `pytorch-lightning` version 0.9.0, we have included a new class called `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.\n", - "\n", - "This notebook will walk you through how to start using Datamodules.\n", - "\n", - "The most up to date documentation on datamodules can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html).\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6RYMhmfA9ATN", - "colab_type": "text" - }, - "source": [ - "### Setup\n", - "Lightning is easy to install. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lj2zD-wsbvGr", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! pip install pytorch-lightning --quiet" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8g2mbvy-9xDI", - "colab_type": "text" - }, - "source": [ - "# Introduction\n", - "\n", - "First, we'll go over a regular `LightningModule` implementation without the use of a `LightningDataModule`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eg-xDlmDdAwy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy\n", - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import random_split, DataLoader\n", - "\n", - "# Note - you must have torchvision installed for this example\n", - "from torchvision.datasets import MNIST, CIFAR10\n", - "from torchvision import transforms" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DzgY7wi88UuG", - "colab_type": "text" - }, - "source": [ - "## Defining the LitMNISTModel\n", - "\n", - "Below, we reuse a `LightningModule` from our hello world tutorial that classifies MNIST Handwritten Digits.\n", - "\n", - "Unfortunately, we have hardcoded dataset-specific items within the model, forever limiting it to working with MNIST Data. 😢\n", - "\n", - "This is fine if you don't plan on training/evaluating your model on different datasets. However, in many cases, this can become bothersome when you want to try out your architecture with different datasets." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IQkW8_FF5nU2", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class LitMNIST(pl.LightningModule):\n", - " \n", - " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # We hardcode dataset specific stuff here.\n", - " self.data_dir = data_dir\n", - " self.num_classes = 10\n", - " self.dims = (1, 28, 28)\n", - " channels, width, height = self.dims\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " # Build model\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, self.num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer\n", - "\n", - " ####################\n", - " # DATA RELATED HOOKS\n", - " ####################\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K7sg9KQd-QIO", - "colab_type": "text" - }, - "source": [ - "## Training the ListMNIST Model" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QxDNDaus6byD", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = LitMNIST()\n", - "trainer = pl.Trainer(max_epochs=2, gpus=1, progress_bar_refresh_rate=20)\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dY8d6GxmB0YU", - "colab_type": "text" - }, - "source": [ - "# Using 
DataModules\n", - "\n", - "DataModules are a way of decoupling data-related hooks from the `LightningModule` so you can develop dataset agnostic models." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eJeT5bW081wn", - "colab_type": "text" - }, - "source": [ - "## Defining The MNISTDataModule\n", - "\n", - "Let's go over each function in the class below and talk about what they're doing:\n", - "\n", - "1. ```__init__```\n", - " - Takes in a `data_dir` arg that points to where you have downloaded/wish to download the MNIST dataset.\n", - " - Defines a transform that will be applied across train, val, and test dataset splits.\n", - " - Defines default `self.dims`, which is a tuple returned from `datamodule.size()` that can help you initialize models.\n", - "\n", - "\n", - "2. ```prepare_data```\n", - " - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", - " - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", - "\n", - "3. ```setup```\n", - " - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", - " - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", - " - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage`.\n", - " - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", - "\n", - "\n", - "4. ```x_dataloader```\n", - " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DfGKyGwG_X9v", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=32)" - ], - 
"execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H2Yoj-9M9dS7", - "colab_type": "text" - }, - "source": [ - "## Defining the dataset agnostic `LitModel`\n", - "\n", - "Below, we define the same model as the `LitMNIST` model we made earlier. \n", - "\n", - "However, this time our model has the freedom to use any input data that we'd like 🔥." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PM2IISuOBDIu", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class LitModel(pl.LightningModule):\n", - " \n", - " def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", - "\n", - " super().__init__()\n", - "\n", - " # We take in input dimensions as parameters and use those to dynamically build model.\n", - " self.channels = channels\n", - " self.width = width\n", - " self.height = height\n", - " self.num_classes = num_classes\n", - " self.hidden_size = hidden_size\n", - " self.learning_rate = learning_rate\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Flatten(),\n", - " nn.Linear(channels * width * height, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, hidden_size),\n", - " nn.ReLU(),\n", - " nn.Dropout(0.1),\n", - " nn.Linear(hidden_size, num_classes)\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = self.model(x)\n", - " return F.log_softmax(x, dim=1)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - "\n", - " x, y = batch\n", - " logits = self(x)\n", - " loss = F.nll_loss(logits, y)\n", - " preds = torch.argmax(logits, dim=1)\n", - " acc = accuracy(preds, y)\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log('val_acc', acc, prog_bar=True)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", - " return optimizer" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "G4Z5olPe-xEo", - "colab_type": "text" - }, - "source": [ - "## Training the `LitModel` using the `MNISTDataModule`\n", - "\n", - "Now, we initialize and train the `LitModel` using the `MNISTDataModule`'s configuration settings and dataloaders." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kV48vP_9mEli", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Init DataModule\n", - "dm = MNISTDataModule()\n", - "# Init model from datamodule's attributes\n", - "model = LitModel(*dm.size(), dm.num_classes)\n", - "# Init trainer\n", - "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, gpus=1)\n", - "# Pass the datamodule as arg to trainer.fit to override model hooks :)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WNxrugIGRRv5", - "colab_type": "text" - }, - "source": [ - "## Defining the CIFAR10 DataModule\n", - "\n", - "Lets prove the `LitModel` we made earlier is dataset agnostic by defining a new datamodule for the CIFAR10 dataset." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tkaYLU7RT5P", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class CIFAR10DataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './'):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", - " ])\n", - "\n", - " self.dims = (3, 32, 32)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " CIFAR10(self.data_dir, train=True, download=True)\n", - " CIFAR10(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)\n", - " self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.cifar_train, batch_size=32)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.cifar_val, batch_size=32)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.cifar_test, batch_size=32)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BrXxf3oX_gsZ", - "colab_type": "text" - }, - "source": [ - "## Training the `LitModel` using the `CIFAR10DataModule`\n", - "\n", - "Our model isn't very good, so it will perform pretty badly on the CIFAR10 dataset.\n", - "\n", - "The point here is that we can see that our `LitModel` has no problem using a different datamodule as its input data." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "sd-SbWi_krdj", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dm = CIFAR10DataModule()\n", - "model = LitModel(*dm.size(), dm.num_classes, hidden_size=256)\n", - "trainer = pl.Trainer(max_epochs=5, progress_bar_refresh_rate=20, gpus=1)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "2O5r7QvP8-rt" + }, + "source": [ + "# PyTorch Lightning DataModules ⚡\n", + "\n", + "With the release of `pytorch-lightning` version 0.9.0, we have included a new class called `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.\n", + "\n", + "This notebook will walk you through how to start using Datamodules.\n", + "\n", + "The most up to date documentation on datamodules can be found [here](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html).\n", + "\n", + "---\n", + "\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "6RYMhmfA9ATN" + }, + "source": [ + "### Setup\n", + "Lightning is easy to install. 
Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "lj2zD-wsbvGr" + }, + "outputs": [], + "source": [ + "! pip install pytorch-lightning --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8g2mbvy-9xDI" + }, + "source": [ + "# Introduction\n", + "\n", + "First, we'll go over a regular `LightningModule` implementation without the use of a `LightningDataModule`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "eg-xDlmDdAwy" + }, + "outputs": [], + "source": [ + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy\n", + "import torch\n", + "from torch import nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import random_split, DataLoader\n", + "\n", + "# Note - you must have torchvision installed for this example\n", + "from torchvision.datasets import MNIST, CIFAR10\n", + "from torchvision import transforms" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DzgY7wi88UuG" + }, + "source": [ + "## Defining the LitMNISTModel\n", + "\n", + "Below, we reuse a `LightningModule` from our hello world tutorial that classifies MNIST Handwritten Digits.\n", + "\n", + "Unfortunately, we have hardcoded dataset-specific items within the model, forever limiting it to working with MNIST Data. 😢\n", + "\n", + "This is fine if you don't plan on training/evaluating your model on different datasets. However, in many cases, this can become bothersome when you want to try out your architecture with different datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IQkW8_FF5nU2" + }, + "outputs": [], + "source": [ + "class LitMNIST(pl.LightningModule):\n", + " \n", + " def __init__(self, data_dir='./', hidden_size=64, learning_rate=2e-4):\n", + "\n", + " super().__init__()\n", + "\n", + " # We hardcode dataset specific stuff here.\n", + " self.data_dir = data_dir\n", + " self.num_classes = 10\n", + " self.dims = (1, 28, 28)\n", + " channels, width, height = self.dims\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " self.hidden_size = hidden_size\n", + " self.learning_rate = learning_rate\n", + "\n", + " # Build model\n", + " self.model = nn.Sequential(\n", + " nn.Flatten(),\n", + " nn.Linear(channels * width * height, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(0.1),\n", + " nn.Linear(hidden_size, self.num_classes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " x = self.model(x)\n", + " return F.log_softmax(x, dim=1)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " logits = self(x)\n", + " loss = F.nll_loss(logits, y)\n", + " preds = torch.argmax(logits, dim=1)\n", + " acc = accuracy(preds, y)\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log('val_acc', acc, prog_bar=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " optimizer = 
torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + "        return optimizer\n", + "\n", + "    ####################\n", + "    # DATA RELATED HOOKS\n", + "    ####################\n", + "\n", + "    def prepare_data(self):\n", + "        # download\n", + "        MNIST(self.data_dir, train=True, download=True)\n", + "        MNIST(self.data_dir, train=False, download=True)\n", + "\n", + "    def setup(self, stage=None):\n", + "\n", + "        # Assign train/val datasets for use in dataloaders\n", + "        if stage == 'fit' or stage is None:\n", + "            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + "            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + "        # Assign test dataset for use in dataloader(s)\n", + "        if stage == 'test' or stage is None:\n", + "            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + "    def train_dataloader(self):\n", + "        return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + "    def val_dataloader(self):\n", + "        return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + "    def test_dataloader(self):\n", + "        return DataLoader(self.mnist_test, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "K7sg9KQd-QIO" + }, + "source": [ + "## Training the LitMNIST Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "QxDNDaus6byD" + }, + "outputs": [], + "source": [ + "model = LitMNIST()\n", + "trainer = pl.Trainer(max_epochs=2, gpus=1, progress_bar_refresh_rate=20)\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dY8d6GxmB0YU" + }, + "source": [ + "# Using DataModules\n", + "\n", + "DataModules are a way of decoupling data-related hooks from the `LightningModule` so you can develop dataset agnostic models." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eJeT5bW081wn" + }, + "source": [ + "## Defining The MNISTDataModule\n", + "\n", + "Let's go over each function in the class below and talk about what they're doing:\n", + "\n", + "1. ```__init__```\n", + "    - Takes in a `data_dir` arg that points to where you have downloaded/wish to download the MNIST dataset.\n", + "    - Defines a transform that will be applied across train, val, and test dataset splits.\n", + "    - Defines default `self.dims`, which is a tuple returned from `datamodule.size()` that can help you initialize models.\n", + "\n", + "\n", + "2. ```prepare_data```\n", + "    - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.\n", + "    - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)\n", + "\n", + "3. ```setup```\n", + "    - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test). \n", + "    - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.\n", + "    - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage`.\n", + "    - **Note this runs across all GPUs and it *is* safe to make state assignments here**\n", + "\n", + "\n", + "4. 
```x_dataloader```\n", + " - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DfGKyGwG_X9v" + }, + "outputs": [], + "source": [ + "class MNISTDataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './'):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # self.dims is returned when you call dm.size()\n", + " # Setting default dims here because we know them.\n", + " # Could optionally be assigned dynamically in dm.setup()\n", + " self.dims = (1, 28, 28)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "H2Yoj-9M9dS7" + }, + "source": [ + "## Defining the dataset agnostic `LitModel`\n", + "\n", + "Below, we define the same model as the `LitMNIST` model we made earlier. \n", + "\n", + "However, this time our model has the freedom to use any input data that we'd like 🔥." 
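To make the hook lifecycle described above concrete, here is a minimal smoke-test sketch (our addition, not part of the original notebook) that drives the `MNISTDataModule` by hand in the same order the `Trainer` does: `prepare_data()` once, then `setup()` for the stage you need, then the dataloaders. The expected shapes assume the defaults defined above (batch size 32, 1x28x28 images).

```python
# A hand-driven smoke test for the MNISTDataModule defined above (assumed in scope).
# The Trainer normally calls these hooks for you, in this order.
dm = MNISTDataModule(data_dir='./')
dm.prepare_data()      # download only -- by convention, no state is assigned here
dm.setup(stage='fit')  # assigns dm.mnist_train and dm.mnist_val

x, y = next(iter(dm.train_dataloader()))
print(x.shape)  # torch.Size([32, 1, 28, 28]) -- consistent with dm.size() == (1, 28, 28)
print(y.shape)  # torch.Size([32])
```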
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PM2IISuOBDIu" + }, + "outputs": [], + "source": [ + "class LitModel(pl.LightningModule):\n", + "    \n", + "    def __init__(self, channels, width, height, num_classes, hidden_size=64, learning_rate=2e-4):\n", + "\n", + "        super().__init__()\n", + "\n", + "        # We take in input dimensions as parameters and use those to dynamically build model.\n", + "        self.channels = channels\n", + "        self.width = width\n", + "        self.height = height\n", + "        self.num_classes = num_classes\n", + "        self.hidden_size = hidden_size\n", + "        self.learning_rate = learning_rate\n", + "\n", + "        self.model = nn.Sequential(\n", + "            nn.Flatten(),\n", + "            nn.Linear(channels * width * height, hidden_size),\n", + "            nn.ReLU(),\n", + "            nn.Dropout(0.1),\n", + "            nn.Linear(hidden_size, hidden_size),\n", + "            nn.ReLU(),\n", + "            nn.Dropout(0.1),\n", + "            nn.Linear(hidden_size, num_classes)\n", + "        )\n", + "\n", + "    def forward(self, x):\n", + "        x = self.model(x)\n", + "        return F.log_softmax(x, dim=1)\n", + "\n", + "    def training_step(self, batch, batch_idx):\n", + "        x, y = batch\n", + "        logits = self(x)\n", + "        loss = F.nll_loss(logits, y)\n", + "        return loss\n", + "\n", + "    def validation_step(self, batch, batch_idx):\n", + "\n", + "        x, y = batch\n", + "        logits = self(x)\n", + "        loss = F.nll_loss(logits, y)\n", + "        preds = torch.argmax(logits, dim=1)\n", + "        acc = accuracy(preds, y)\n", + "        self.log('val_loss', loss, prog_bar=True)\n", + "        self.log('val_acc', acc, prog_bar=True)\n", + "        return loss\n", + "\n", + "    def configure_optimizers(self):\n", + "        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + "        return optimizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "G4Z5olPe-xEo" + }, + "source": [ + "## Training the `LitModel` using the `MNISTDataModule`\n", + "\n", + "Now, we initialize and train the `LitModel` using the `MNISTDataModule`'s configuration settings and dataloaders." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "kV48vP_9mEli" + }, + "outputs": [], + "source": [ + "# Init DataModule\n", + "dm = MNISTDataModule()\n", + "# Init model from datamodule's attributes\n", + "model = LitModel(*dm.size(), dm.num_classes)\n", + "# Init trainer\n", + "trainer = pl.Trainer(max_epochs=3, progress_bar_refresh_rate=20, gpus=1)\n", + "# Pass the datamodule as arg to trainer.fit to override model hooks :)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WNxrugIGRRv5" + }, + "source": [ + "## Defining the CIFAR10 DataModule\n", + "\n", + "Let's prove the `LitModel` we made earlier is dataset agnostic by defining a new datamodule for the CIFAR10 dataset." 
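The three-step pattern in the training cell above (init the datamodule, init the model from `dm.size()` and `dm.num_classes`, then fit) is about to be repeated for CIFAR10, so it can be captured in a small helper. The `train_on` wrapper below is hypothetical, a sketch of the pattern rather than part of the original notebook, and it reuses the `LitModel` and the `Trainer` settings shown above.

```python
# Hypothetical helper (not in the original notebook): any datamodule exposing
# size() and num_classes can drive the same dataset-agnostic LitModel.
def train_on(datamodule, max_epochs=3, **model_kwargs):
    channels, width, height = datamodule.size()
    model = LitModel(channels, width, height, datamodule.num_classes, **model_kwargs)
    trainer = pl.Trainer(max_epochs=max_epochs, gpus=1, progress_bar_refresh_rate=20)
    trainer.fit(model, datamodule)
    return model

# Usage: train_on(MNISTDataModule()) or train_on(CIFAR10DataModule(), hidden_size=256)
```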
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1tkaYLU7RT5P" + }, + "outputs": [], + "source": [ + "class CIFAR10DataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './'):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", + " ])\n", + "\n", + " self.dims = (3, 32, 32)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " CIFAR10(self.data_dir, train=True, download=True)\n", + " CIFAR10(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " cifar_full = CIFAR10(self.data_dir, train=True, transform=self.transform)\n", + " self.cifar_train, self.cifar_val = random_split(cifar_full, [45000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.cifar_train, batch_size=32)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.cifar_val, batch_size=32)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.cifar_test, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BrXxf3oX_gsZ" + }, + "source": [ + "## Training the `LitModel` using the `CIFAR10DataModule`\n", + "\n", + "Our model isn't very good, so it will perform pretty badly on the CIFAR10 dataset.\n", + "\n", + "The point here is that we can see that our `LitModel` has no problem using a different datamodule as its input data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sd-SbWi_krdj" + }, + "outputs": [], + "source": [ + "dm = CIFAR10DataModule()\n", + "model = LitModel(*dm.size(), dm.num_classes, hidden_size=256)\n", + "trainer = pl.Trainer(max_epochs=5, progress_bar_refresh_rate=20, gpus=1)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", + "\n", + "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions !\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "02-datamodules.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/03-basic-gan.ipynb b/notebooks/03-basic-gan.ipynb index a19153e133a5f..31555265938d8 100644 --- a/notebooks/03-basic-gan.ipynb +++ b/notebooks/03-basic-gan.ipynb @@ -1,424 +1,472 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "03-basic-gan.ipynb", - "provenance": [], - "collapsed_sections": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J37PBnE_x7IW", - "colab_type": 
"text" - }, - "source": [ - "# PyTorch Lightning Basic GAN Tutorial ⚡\n", - "\n", - "How to train a GAN!\n", - "\n", - "Main takeaways:\n", - "1. Generator and discriminator are arbitrary PyTorch modules.\n", - "2. training_step does both the generator and discriminator training.\n", - "\n", - "---\n", - "\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kg2MKpRmybht", - "colab_type": "text" - }, - "source": [ - "### Setup\n", - "Lightning is easy to install. Simply `pip install pytorch-lightning`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LfrJLKPFyhsK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! pip install pytorch-lightning --quiet" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "BjEPuiVLyanw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import os\n", - "from argparse import ArgumentParser\n", - "from collections import OrderedDict\n", - "\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "import torchvision\n", - "import torchvision.transforms as transforms\n", - "from torch.utils.data import DataLoader, random_split\n", - "from torchvision.datasets import MNIST\n", - "\n", - "import pytorch_lightning as pl" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OuXJzr4G2uHV", - "colab_type": "text" - }, - "source": [ - "### MNIST DataModule\n", - "\n", - "Below, we define a DataModule for the MNIST Dataset. To learn more about DataModules, check out our tutorial on them or see the [latest docs](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html)." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DOY_nHu328g7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class MNISTDataModule(pl.LightningDataModule):\n", - "\n", - " def __init__(self, data_dir: str = './', batch_size: int = 64, num_workers: int = 8):\n", - " super().__init__()\n", - " self.data_dir = data_dir\n", - " self.batch_size = batch_size\n", - " self.num_workers = num_workers\n", - "\n", - " self.transform = transforms.Compose([\n", - " transforms.ToTensor(),\n", - " transforms.Normalize((0.1307,), (0.3081,))\n", - " ])\n", - "\n", - " # self.dims is returned when you call dm.size()\n", - " # Setting default dims here because we know them.\n", - " # Could optionally be assigned dynamically in dm.setup()\n", - " self.dims = (1, 28, 28)\n", - " self.num_classes = 10\n", - "\n", - " def prepare_data(self):\n", - " # download\n", - " MNIST(self.data_dir, train=True, download=True)\n", - " MNIST(self.data_dir, train=False, download=True)\n", - "\n", - " def setup(self, stage=None):\n", - "\n", - " # Assign train/val datasets for use in dataloaders\n", - " if stage == 'fit' or stage is None:\n", - " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", - " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", - "\n", - " # Assign test dataset for use in dataloader(s)\n", - " if stage == 'test' or stage is None:\n", - " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", - "\n", - " def train_dataloader(self):\n", - " return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)\n", - "\n", - " def val_dataloader(self):\n", - " return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=self.num_workers)\n", - "\n", - " def test_dataloader(self):\n", - " return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tW3c0QrQyF9P", - "colab_type": "text" - }, - "source": [ - "### A. Generator" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "0E2QDjl5yWtz", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class Generator(nn.Module):\n", - " def __init__(self, latent_dim, img_shape):\n", - " super().__init__()\n", - " self.img_shape = img_shape\n", - "\n", - " def block(in_feat, out_feat, normalize=True):\n", - " layers = [nn.Linear(in_feat, out_feat)]\n", - " if normalize:\n", - " layers.append(nn.BatchNorm1d(out_feat, 0.8))\n", - " layers.append(nn.LeakyReLU(0.2, inplace=True))\n", - " return layers\n", - "\n", - " self.model = nn.Sequential(\n", - " *block(latent_dim, 128, normalize=False),\n", - " *block(128, 256),\n", - " *block(256, 512),\n", - " *block(512, 1024),\n", - " nn.Linear(1024, int(np.prod(img_shape))),\n", - " nn.Tanh()\n", - " )\n", - "\n", - " def forward(self, z):\n", - " img = self.model(z)\n", - " img = img.view(img.size(0), *self.img_shape)\n", - " return img" - ], - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uyrltsGvyaI3", - "colab_type": "text" - }, - "source": [ - "### B. 
Discriminator" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ed3MR3vnyxyW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class Discriminator(nn.Module):\n", - " def __init__(self, img_shape):\n", - " super().__init__()\n", - "\n", - " self.model = nn.Sequential(\n", - " nn.Linear(int(np.prod(img_shape)), 512),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(512, 256),\n", - " nn.LeakyReLU(0.2, inplace=True),\n", - " nn.Linear(256, 1),\n", - " nn.Sigmoid(),\n", - " )\n", - "\n", - " def forward(self, img):\n", - " img_flat = img.view(img.size(0), -1)\n", - " validity = self.model(img_flat)\n", - "\n", - " return validity" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BwUMom3ryySK", - "colab_type": "text" - }, - "source": [ - "### C. GAN\n", - "\n", - "#### A couple of cool features to check out in this example...\n", - "\n", - " - We use `some_tensor.type_as(another_tensor)` to make sure we initialize new tensors on the right device (i.e. GPU, CPU).\n", - " - Lightning will put your dataloader data on the right device automatically\n", - " - In this example, we pull from latent dim on the fly, so we need to dynamically add tensors to the right device.\n", - " - `type_as` is the way we recommend to do this.\n", - " - This example shows how to use multiple dataloaders in your `LightningModule`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3vKszYf6y1Vv", - "colab_type": "code", - "colab": {} - }, - "source": [ - " class GAN(pl.LightningModule):\n", - "\n", - " def __init__(\n", - " self,\n", - " channels,\n", - " width,\n", - " height,\n", - " latent_dim: int = 100,\n", - " lr: float = 0.0002,\n", - " b1: float = 0.5,\n", - " b2: float = 0.999,\n", - " batch_size: int = 64,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - " self.save_hyperparameters()\n", - "\n", - " # networks\n", - " data_shape = (channels, width, height)\n", - " self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=data_shape)\n", - " self.discriminator = Discriminator(img_shape=data_shape)\n", - "\n", - " self.validation_z = torch.randn(8, self.hparams.latent_dim)\n", - "\n", - " self.example_input_array = torch.zeros(2, self.hparams.latent_dim)\n", - "\n", - " def forward(self, z):\n", - " return self.generator(z)\n", - "\n", - " def adversarial_loss(self, y_hat, y):\n", - " return F.binary_cross_entropy(y_hat, y)\n", - "\n", - " def training_step(self, batch, batch_idx, optimizer_idx):\n", - " imgs, _ = batch\n", - "\n", - " # sample noise\n", - " z = torch.randn(imgs.shape[0], self.hparams.latent_dim)\n", - " z = z.type_as(imgs)\n", - "\n", - " # train generator\n", - " if optimizer_idx == 0:\n", - "\n", - " # generate images\n", - " self.generated_imgs = self(z)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self.generated_imgs[:6]\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image('generated_images', grid, 0)\n", - "\n", - " # ground truth result (ie: all fake)\n", - " # put on GPU because we created this tensor inside training_loop\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " valid = valid.type_as(imgs)\n", - "\n", - " # adversarial loss is binary cross-entropy\n", - " g_loss = self.adversarial_loss(self.discriminator(self(z)), valid)\n", - " tqdm_dict = {'g_loss': g_loss}\n", - " output = OrderedDict({\n", - " 'loss': g_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " 
return output\n", - "\n", - " # train discriminator\n", - " if optimizer_idx == 1:\n", - " # Measure discriminator's ability to classify real from generated samples\n", - "\n", - " # how well can it label as real?\n", - " valid = torch.ones(imgs.size(0), 1)\n", - " valid = valid.type_as(imgs)\n", - "\n", - " real_loss = self.adversarial_loss(self.discriminator(imgs), valid)\n", - "\n", - " # how well can it label as fake?\n", - " fake = torch.zeros(imgs.size(0), 1)\n", - " fake = fake.type_as(imgs)\n", - "\n", - " fake_loss = self.adversarial_loss(\n", - " self.discriminator(self(z).detach()), fake)\n", - "\n", - " # discriminator loss is the average of these\n", - " d_loss = (real_loss + fake_loss) / 2\n", - " tqdm_dict = {'d_loss': d_loss}\n", - " output = OrderedDict({\n", - " 'loss': d_loss,\n", - " 'progress_bar': tqdm_dict,\n", - " 'log': tqdm_dict\n", - " })\n", - " return output\n", - "\n", - " def configure_optimizers(self):\n", - " lr = self.hparams.lr\n", - " b1 = self.hparams.b1\n", - " b2 = self.hparams.b2\n", - "\n", - " opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))\n", - " opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))\n", - " return [opt_g, opt_d], []\n", - "\n", - " def on_epoch_end(self):\n", - " z = self.validation_z.type_as(self.generator.model[0].weight)\n", - "\n", - " # log sampled images\n", - " sample_imgs = self(z)\n", - " grid = torchvision.utils.make_grid(sample_imgs)\n", - " self.logger.experiment.add_image('generated_images', grid, self.current_epoch)" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ey5FmJPnzm_E", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dm = MNISTDataModule()\n", - "model = GAN(*dm.size())\n", - "trainer = pl.Trainer(gpus=1, max_epochs=5, progress_bar_refresh_rate=20)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "MlECc7cHzolp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Start tensorboard.\n", - "%load_ext tensorboard\n", - "%tensorboard --logdir lightning_logs/" - ], - "execution_count": null, - "outputs": [] - } - ] + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "J37PBnE_x7IW" + }, + "source": [ + "# PyTorch Lightning Basic GAN Tutorial ⚡\n", + "\n", + "How to train a GAN!\n", + "\n", + "Main takeaways:\n", + "1. Generator and discriminator are arbitrary PyTorch modules.\n", + "2. training_step does both the generator and discriminator training.\n", + "\n", + "---\n", + "\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kg2MKpRmybht" + }, + "source": [ + "### Setup\n", + "Lightning is easy to install. Simply `pip install pytorch-lightning`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LfrJLKPFyhsK" + }, + "outputs": [], + "source": [ + "! 
pip install pytorch-lightning --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "BjEPuiVLyanw" + }, + "outputs": [], + "source": [ + "import os\n", + "from argparse import ArgumentParser\n", + "from collections import OrderedDict\n", + "\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import torchvision\n", + "import torchvision.transforms as transforms\n", + "from torch.utils.data import DataLoader, random_split\n", + "from torchvision.datasets import MNIST\n", + "\n", + "import pytorch_lightning as pl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "OuXJzr4G2uHV" + }, + "source": [ + "### MNIST DataModule\n", + "\n", + "Below, we define a DataModule for the MNIST Dataset. To learn more about DataModules, check out our tutorial on them or see the [latest docs](https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "DOY_nHu328g7" + }, + "outputs": [], + "source": [ + "class MNISTDataModule(pl.LightningDataModule):\n", + "\n", + " def __init__(self, data_dir: str = './', batch_size: int = 64, num_workers: int = 8):\n", + " super().__init__()\n", + " self.data_dir = data_dir\n", + " self.batch_size = batch_size\n", + " self.num_workers = num_workers\n", + "\n", + " self.transform = transforms.Compose([\n", + " transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))\n", + " ])\n", + "\n", + " # self.dims is returned when you call dm.size()\n", + " # Setting default dims here because we know them.\n", + " # Could optionally be assigned dynamically in dm.setup()\n", + " self.dims = (1, 28, 28)\n", + " self.num_classes = 10\n", + "\n", + " def prepare_data(self):\n", + " # download\n", + " MNIST(self.data_dir, train=True, download=True)\n", + " MNIST(self.data_dir, train=False, download=True)\n", + "\n", + " def setup(self, stage=None):\n", + "\n", + " # Assign train/val datasets for use in dataloaders\n", + " if stage == 'fit' or stage is None:\n", + " mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)\n", + " self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])\n", + "\n", + " # Assign test dataset for use in dataloader(s)\n", + " if stage == 'test' or stage is None:\n", + " self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)\n", + "\n", + " def train_dataloader(self):\n", + " return DataLoader(self.mnist_train, batch_size=self.batch_size, num_workers=self.num_workers)\n", + "\n", + " def val_dataloader(self):\n", + " return DataLoader(self.mnist_val, batch_size=self.batch_size, num_workers=self.num_workers)\n", + "\n", + " def test_dataloader(self):\n", + " return DataLoader(self.mnist_test, batch_size=self.batch_size, num_workers=self.num_workers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tW3c0QrQyF9P" + }, + "source": [ + "### A. 
Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "0E2QDjl5yWtz" + }, + "outputs": [], + "source": [ + "class Generator(nn.Module):\n", + " def __init__(self, latent_dim, img_shape):\n", + " super().__init__()\n", + " self.img_shape = img_shape\n", + "\n", + " def block(in_feat, out_feat, normalize=True):\n", + " layers = [nn.Linear(in_feat, out_feat)]\n", + " if normalize:\n", + " layers.append(nn.BatchNorm1d(out_feat, 0.8))\n", + " layers.append(nn.LeakyReLU(0.2, inplace=True))\n", + " return layers\n", + "\n", + " self.model = nn.Sequential(\n", + " *block(latent_dim, 128, normalize=False),\n", + " *block(128, 256),\n", + " *block(256, 512),\n", + " *block(512, 1024),\n", + " nn.Linear(1024, int(np.prod(img_shape))),\n", + " nn.Tanh()\n", + " )\n", + "\n", + " def forward(self, z):\n", + " img = self.model(z)\n", + " img = img.view(img.size(0), *self.img_shape)\n", + " return img" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uyrltsGvyaI3" + }, + "source": [ + "### B. Discriminator" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ed3MR3vnyxyW" + }, + "outputs": [], + "source": [ + "class Discriminator(nn.Module):\n", + " def __init__(self, img_shape):\n", + " super().__init__()\n", + "\n", + " self.model = nn.Sequential(\n", + " nn.Linear(int(np.prod(img_shape)), 512),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " nn.Linear(512, 256),\n", + " nn.LeakyReLU(0.2, inplace=True),\n", + " nn.Linear(256, 1),\n", + " nn.Sigmoid(),\n", + " )\n", + "\n", + " def forward(self, img):\n", + " img_flat = img.view(img.size(0), -1)\n", + " validity = self.model(img_flat)\n", + "\n", + " return validity" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BwUMom3ryySK" + }, + "source": [ + "### C. GAN\n", + "\n", + "#### A couple of cool features to check out in this example...\n", + "\n", + " - We use `some_tensor.type_as(another_tensor)` to make sure we initialize new tensors on the right device (i.e. GPU, CPU).\n", + " - Lightning will put your dataloader data on the right device automatically\n", + " - In this example, we pull from latent dim on the fly, so we need to dynamically add tensors to the right device.\n", + " - `type_as` is the way we recommend to do this.\n", + " - This example shows how to use multiple dataloaders in your `LightningModule`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3vKszYf6y1Vv" + }, + "outputs": [], + "source": [ + " class GAN(pl.LightningModule):\n", + "\n", + " def __init__(\n", + " self,\n", + " channels,\n", + " width,\n", + " height,\n", + " latent_dim: int = 100,\n", + " lr: float = 0.0002,\n", + " b1: float = 0.5,\n", + " b2: float = 0.999,\n", + " batch_size: int = 64,\n", + " **kwargs\n", + " ):\n", + " super().__init__()\n", + " self.save_hyperparameters()\n", + "\n", + " # networks\n", + " data_shape = (channels, width, height)\n", + " self.generator = Generator(latent_dim=self.hparams.latent_dim, img_shape=data_shape)\n", + " self.discriminator = Discriminator(img_shape=data_shape)\n", + "\n", + " self.validation_z = torch.randn(8, self.hparams.latent_dim)\n", + "\n", + " self.example_input_array = torch.zeros(2, self.hparams.latent_dim)\n", + "\n", + " def forward(self, z):\n", + " return self.generator(z)\n", + "\n", + " def adversarial_loss(self, y_hat, y):\n", + " return F.binary_cross_entropy(y_hat, y)\n", + "\n", + " def training_step(self, batch, batch_idx, optimizer_idx):\n", + " imgs, _ = batch\n", + "\n", + " # sample noise\n", + " z = torch.randn(imgs.shape[0], self.hparams.latent_dim)\n", + " z = z.type_as(imgs)\n", + "\n", + " # train generator\n", + " if optimizer_idx == 0:\n", + "\n", + " # generate images\n", + " self.generated_imgs = self(z)\n", + "\n", + " # log sampled images\n", + " sample_imgs = self.generated_imgs[:6]\n", + " grid = torchvision.utils.make_grid(sample_imgs)\n", + " self.logger.experiment.add_image('generated_images', grid, 0)\n", + "\n", + " # ground truth result (ie: all fake)\n", + " # put on GPU because we created this tensor inside training_loop\n", + " valid = torch.ones(imgs.size(0), 1)\n", + " valid = valid.type_as(imgs)\n", + "\n", + " # adversarial loss is binary cross-entropy\n", + " g_loss = self.adversarial_loss(self.discriminator(self(z)), valid)\n", + " tqdm_dict = {'g_loss': g_loss}\n", + " output = OrderedDict({\n", + " 'loss': g_loss,\n", + " 'progress_bar': tqdm_dict,\n", + " 'log': tqdm_dict\n", + " })\n", + " return output\n", + "\n", + " # train discriminator\n", + " if optimizer_idx == 1:\n", + " # Measure discriminator's ability to classify real from generated samples\n", + "\n", + " # how well can it label as real?\n", + " valid = torch.ones(imgs.size(0), 1)\n", + " valid = valid.type_as(imgs)\n", + "\n", + " real_loss = self.adversarial_loss(self.discriminator(imgs), valid)\n", + "\n", + " # how well can it label as fake?\n", + " fake = torch.zeros(imgs.size(0), 1)\n", + " fake = fake.type_as(imgs)\n", + "\n", + " fake_loss = self.adversarial_loss(\n", + " self.discriminator(self(z).detach()), fake)\n", + "\n", + " # discriminator loss is the average of these\n", + " d_loss = (real_loss + fake_loss) / 2\n", + " tqdm_dict = {'d_loss': d_loss}\n", + " output = OrderedDict({\n", + " 'loss': d_loss,\n", + " 'progress_bar': tqdm_dict,\n", + " 'log': tqdm_dict\n", + " })\n", + " return output\n", + "\n", + " def configure_optimizers(self):\n", + " lr = self.hparams.lr\n", + " b1 = self.hparams.b1\n", + " b2 = self.hparams.b2\n", + "\n", + " opt_g = torch.optim.Adam(self.generator.parameters(), lr=lr, betas=(b1, b2))\n", + " opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=lr, betas=(b1, b2))\n", + " return [opt_g, opt_d], []\n", + "\n", + " def on_epoch_end(self):\n", + " z = self.validation_z.type_as(self.generator.model[0].weight)\n", + "\n", + " 
# log sampled images\n", + " sample_imgs = self(z)\n", + " grid = torchvision.utils.make_grid(sample_imgs)\n", + " self.logger.experiment.add_image('generated_images', grid, self.current_epoch)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ey5FmJPnzm_E" + }, + "outputs": [], + "source": [ + "dm = MNISTDataModule()\n", + "model = GAN(*dm.size())\n", + "trainer = pl.Trainer(gpus=1, max_epochs=5, progress_bar_refresh_rate=20)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "MlECc7cHzolp" + }, + "outputs": [], + "source": [ + "# Start tensorboard.\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir lightning_logs/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
Congratulations - Time to Join the Community!\n",
+    "
    \n",
+    "\n",
+    "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n",
+    "\n",
+    "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n",
+    "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n",
+    "\n",
+    "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n",
+    "\n",
+    "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n",
+    "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in the `#general` channel.\n",
+    "\n",
+    "### Interested in SOTA AI models? Check out [Bolts](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n",
+    "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning), that can be easily integrated into your own projects.\n",
+    "\n",
+    "* Please, star [Bolts](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n",
+    "\n",
+    "### Contributions!\n",
+    "The best way to contribute to our community is to become a code contributor! At any time you can go to the [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolts](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\".\n",
+    "\n",
+    "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n",
+    "* [Bolts good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n",
+    "* You can also contribute your own notebooks with useful examples!\n",
+    "\n",
+    "### Many thanks from the entire PyTorch Lightning team for your interest!\n",
+    "\n",
+    ""
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "include_colab_link": true,
+   "name": "03-basic-gan.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
 }
diff --git a/notebooks/04-transformers-text-classification.ipynb b/notebooks/04-transformers-text-classification.ipynb
index ae7424c7d4864..037b24e4ddd9d 100644
--- a/notebooks/04-transformers-text-classification.ipynb
+++ b/notebooks/04-transformers-text-classification.ipynb
@@ -1,543 +1,591 @@
 {
-    "nbformat": 4,
-    "nbformat_minor": 0,
-    "metadata": {
-        "colab": {
-            "name": "04-transformers-text-classification.ipynb",
-            "provenance": [],
-            "collapsed_sections": [],
-            "toc_visible": true
-        },
-        "kernelspec": {
-            "name": "python3",
-            "display_name": "Python 3"
-        },
-        "accelerator": "GPU"
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "8ag5ANQPJ_j9"
+   },
+   "source": [
+    "# Finetune 🤗 Transformers Models with PyTorch Lightning ⚡\n",
+    "\n",
+    "This notebook will use HuggingFace's `datasets` library to get data, which will be wrapped in a 
`LightningDataModule`. Then, we write a class to perform text classification on any dataset from the[ GLUE Benchmark](https://gluebenchmark.com/). (We just show CoLA and MRPC due to constraint on compute/disk)\n", + "\n", + "[HuggingFace's NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola) can help you get a feel for the two datasets we will use and what tasks they are solving for.\n", + "\n", + "---\n", + " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + " - Ask a question on [the forum](https://forums.pytorchlightning.ai/)\n", + " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)\n", + "\n", + " - [HuggingFace datasets](https://github.com/huggingface/datasets)\n", + " - [HuggingFace transformers](https://github.com/huggingface/transformers)" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "8ag5ANQPJ_j9", - "colab_type": "text" - }, - "source": [ - "# Finetune 🤗 Transformers Models with PyTorch Lightning ⚡\n", - "\n", - "This notebook will use HuggingFace's `datasets` library to get data, which will be wrapped in a `LightningDataModule`. Then, we write a class to perform text classification on any dataset from the[ GLUE Benchmark](https://gluebenchmark.com/). (We just show CoLA and MRPC due to constraint on compute/disk)\n", - "\n", - "[HuggingFace's NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola) can help you get a feel for the two datasets we will use and what tasks they are solving for.\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Ask a question on [the forum](https://forums.pytorchlightning.ai/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)\n", - "\n", - " - [HuggingFace datasets](https://github.com/huggingface/datasets)\n", - " - [HuggingFace transformers](https://github.com/huggingface/transformers)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fqlsVTj7McZ3", - "colab_type": "text" - }, - "source": [ - "### Setup" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "OIhHrRL-MnKK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!pip install pytorch-lightning datasets transformers" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "6yuQT_ZQMpCg", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from argparse import ArgumentParser\n", - "from datetime import datetime\n", - "from typing import Optional\n", - "\n", - "import datasets\n", - "import numpy as np\n", - "import pytorch_lightning as pl\n", - "import torch\n", - "from torch.utils.data import DataLoader\n", - "from transformers import (\n", - " AdamW,\n", - " AutoModelForSequenceClassification,\n", - " AutoConfig,\n", - " AutoTokenizer,\n", - " get_linear_schedule_with_warmup,\n", - " glue_compute_metrics\n", - ")" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ORJfiuiNZ_N", - "colab_type": "text" - }, - "source": [ - "## GLUE DataModule" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jW9xQhZxMz1G", - "colab_type": "code", - "colab": {} - 
}, - "source": [ - "class GLUEDataModule(pl.LightningDataModule):\n", - "\n", - " task_text_field_map = {\n", - " 'cola': ['sentence'],\n", - " 'sst2': ['sentence'],\n", - " 'mrpc': ['sentence1', 'sentence2'],\n", - " 'qqp': ['question1', 'question2'],\n", - " 'stsb': ['sentence1', 'sentence2'],\n", - " 'mnli': ['premise', 'hypothesis'],\n", - " 'qnli': ['question', 'sentence'],\n", - " 'rte': ['sentence1', 'sentence2'],\n", - " 'wnli': ['sentence1', 'sentence2'],\n", - " 'ax': ['premise', 'hypothesis']\n", - " }\n", - "\n", - " glue_task_num_labels = {\n", - " 'cola': 2,\n", - " 'sst2': 2,\n", - " 'mrpc': 2,\n", - " 'qqp': 2,\n", - " 'stsb': 1,\n", - " 'mnli': 3,\n", - " 'qnli': 2,\n", - " 'rte': 2,\n", - " 'wnli': 2,\n", - " 'ax': 3\n", - " }\n", - "\n", - " loader_columns = [\n", - " 'datasets_idx',\n", - " 'input_ids',\n", - " 'token_type_ids',\n", - " 'attention_mask',\n", - " 'start_positions',\n", - " 'end_positions',\n", - " 'labels'\n", - " ]\n", - "\n", - " def __init__(\n", - " self,\n", - " model_name_or_path: str,\n", - " task_name: str ='mrpc',\n", - " max_seq_length: int = 128,\n", - " train_batch_size: int = 32,\n", - " eval_batch_size: int = 32,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - " self.model_name_or_path = model_name_or_path\n", - " self.task_name = task_name\n", - " self.max_seq_length = max_seq_length\n", - " self.train_batch_size = train_batch_size\n", - " self.eval_batch_size = eval_batch_size\n", - "\n", - " self.text_fields = self.task_text_field_map[task_name]\n", - " self.num_labels = self.glue_task_num_labels[task_name]\n", - " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", - "\n", - " def setup(self, stage):\n", - " self.dataset = datasets.load_dataset('glue', self.task_name)\n", - "\n", - " for split in self.dataset.keys():\n", - " self.dataset[split] = self.dataset[split].map(\n", - " self.convert_to_features,\n", - " batched=True,\n", - " remove_columns=['label'],\n", - " )\n", - " self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]\n", - " self.dataset[split].set_format(type=\"torch\", columns=self.columns)\n", - "\n", - " self.eval_splits = [x for x in self.dataset.keys() if 'validation' in x]\n", - "\n", - " def prepare_data(self):\n", - " datasets.load_dataset('glue', self.task_name)\n", - " AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", - " \n", - " def train_dataloader(self):\n", - " return DataLoader(self.dataset['train'], batch_size=self.train_batch_size)\n", - " \n", - " def val_dataloader(self):\n", - " if len(self.eval_splits) == 1:\n", - " return DataLoader(self.dataset['validation'], batch_size=self.eval_batch_size)\n", - " elif len(self.eval_splits) > 1:\n", - " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", - "\n", - " def test_dataloader(self):\n", - " if len(self.eval_splits) == 1:\n", - " return DataLoader(self.dataset['test'], batch_size=self.eval_batch_size)\n", - " elif len(self.eval_splits) > 1:\n", - " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", - "\n", - " def convert_to_features(self, example_batch, indices=None):\n", - "\n", - " # Either encode single sentence or sentence pairs\n", - " if len(self.text_fields) > 1:\n", - " texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))\n", - " else:\n", - " texts_or_text_pairs = 
example_batch[self.text_fields[0]]\n", - "\n", - " # Tokenize the text/text pairs\n", - " features = self.tokenizer.batch_encode_plus(\n", - " texts_or_text_pairs,\n", - " max_length=self.max_seq_length,\n", - " pad_to_max_length=True,\n", - " truncation=True\n", - " )\n", - "\n", - " # Rename label to labels to make it easier to pass to model forward\n", - " features['labels'] = example_batch['label']\n", - "\n", - " return features" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jQC3a6KuOpX3", - "colab_type": "text" - }, - "source": [ - "#### You could use this datamodule with standalone PyTorch if you wanted..." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "JCMH3IAsNffF", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dm = GLUEDataModule('distilbert-base-uncased')\n", - "dm.prepare_data()\n", - "dm.setup('fit')\n", - "next(iter(dm.train_dataloader()))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l9fQ_67BO2Lj", - "colab_type": "text" - }, - "source": [ - "## GLUE Model" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gtn5YGKYO65B", - "colab_type": "code", - "colab": {} - }, - "source": [ - "class GLUETransformer(pl.LightningModule):\n", - " def __init__(\n", - " self,\n", - " model_name_or_path: str,\n", - " num_labels: int,\n", - " learning_rate: float = 2e-5,\n", - " adam_epsilon: float = 1e-8,\n", - " warmup_steps: int = 0,\n", - " weight_decay: float = 0.0,\n", - " train_batch_size: int = 32,\n", - " eval_batch_size: int = 32,\n", - " eval_splits: Optional[list] = None,\n", - " **kwargs\n", - " ):\n", - " super().__init__()\n", - "\n", - " self.save_hyperparameters()\n", - "\n", - " self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)\n", - " self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)\n", - " self.metric = datasets.load_metric(\n", - " 'glue',\n", - " self.hparams.task_name,\n", - " experiment_id=datetime.now().strftime(\"%d-%m-%Y_%H-%M-%S\")\n", - " )\n", - "\n", - " def forward(self, **inputs):\n", - " return self.model(**inputs)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " outputs = self(**batch)\n", - " loss = outputs[0]\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx, dataloader_idx=0):\n", - " outputs = self(**batch)\n", - " val_loss, logits = outputs[:2]\n", - "\n", - " if self.hparams.num_labels >= 1:\n", - " preds = torch.argmax(logits, axis=1)\n", - " elif self.hparams.num_labels == 1:\n", - " preds = logits.squeeze()\n", - "\n", - " labels = batch[\"labels\"]\n", - "\n", - " return {'loss': val_loss, \"preds\": preds, \"labels\": labels}\n", - "\n", - " def validation_epoch_end(self, outputs):\n", - " if self.hparams.task_name == 'mnli':\n", - " for i, output in enumerate(outputs):\n", - " # matched or mismatched\n", - " split = self.hparams.eval_splits[i].split('_')[-1]\n", - " preds = torch.cat([x['preds'] for x in output]).detach().cpu().numpy()\n", - " labels = torch.cat([x['labels'] for x in output]).detach().cpu().numpy()\n", - " loss = torch.stack([x['loss'] for x in output]).mean()\n", - " self.log(f'val_loss_{split}', loss, prog_bar=True)\n", - " split_metrics = {f\"{k}_{split}\": v for k, v in self.metric.compute(predictions=preds, references=labels).items()}\n", - " self.log_dict(split_metrics, prog_bar=True)\n", - " return loss\n", - "\n", - " preds = 
torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()\n", - " labels = torch.cat([x['labels'] for x in outputs]).detach().cpu().numpy()\n", - " loss = torch.stack([x['loss'] for x in outputs]).mean()\n", - " self.log('val_loss', loss, prog_bar=True)\n", - " self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)\n", - " return loss\n", - "\n", - " def setup(self, stage):\n", - " if stage == 'fit':\n", - " # Get dataloader by calling it - train_dataloader() is called after setup() by default\n", - " train_loader = self.train_dataloader()\n", - "\n", - " # Calculate total steps\n", - " self.total_steps = (\n", - " (len(train_loader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.gpus)))\n", - " // self.hparams.accumulate_grad_batches\n", - " * float(self.hparams.max_epochs)\n", - " )\n", - "\n", - " def configure_optimizers(self):\n", - " \"Prepare optimizer and schedule (linear warmup and decay)\"\n", - " model = self.model\n", - " no_decay = [\"bias\", \"LayerNorm.weight\"]\n", - " optimizer_grouped_parameters = [\n", - " {\n", - " \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n", - " \"weight_decay\": self.hparams.weight_decay,\n", - " },\n", - " {\n", - " \"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n", - " \"weight_decay\": 0.0,\n", - " },\n", - " ]\n", - " optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)\n", - "\n", - " scheduler = get_linear_schedule_with_warmup(\n", - " optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps\n", - " )\n", - " scheduler = {\n", - " 'scheduler': scheduler,\n", - " 'interval': 'step',\n", - " 'frequency': 1\n", - " }\n", - " return [optimizer], [scheduler]\n", - "\n", - " @staticmethod\n", - " def add_model_specific_args(parent_parser):\n", - " parser = ArgumentParser(parents=[parent_parser], add_help=False)\n", - " parser.add_argument(\"--learning_rate\", default=2e-5, type=float)\n", - " parser.add_argument(\"--adam_epsilon\", default=1e-8, type=float)\n", - " parser.add_argument(\"--warmup_steps\", default=0, type=int)\n", - " parser.add_argument(\"--weight_decay\", default=0.0, type=float)\n", - " return parser" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ha-NdIP_xbd3", - "colab_type": "text" - }, - "source": [ - "### ⚡ Quick Tip \n", - " - Combine arguments from your DataModule, Model, and Trainer into one for easy and robust configuration" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3dEHnl3RPlAR", - "colab_type": "code", - "colab": {} - }, - "source": [ - "def parse_args(args=None):\n", - " parser = ArgumentParser()\n", - " parser = pl.Trainer.add_argparse_args(parser)\n", - " parser = GLUEDataModule.add_argparse_args(parser)\n", - " parser = GLUETransformer.add_model_specific_args(parser)\n", - " parser.add_argument('--seed', type=int, default=42)\n", - " return parser.parse_args(args)\n", - "\n", - "\n", - "def main(args):\n", - " pl.seed_everything(args.seed)\n", - " dm = GLUEDataModule.from_argparse_args(args)\n", - " dm.prepare_data()\n", - " dm.setup('fit')\n", - " model = GLUETransformer(num_labels=dm.num_labels, eval_splits=dm.eval_splits, **vars(args))\n", - " trainer = pl.Trainer.from_argparse_args(args)\n", - " return dm, model, trainer" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "markdown", 
- "metadata": { - "id": "PkuLaeec3sJ-", - "colab_type": "text" - }, - "source": [ - "# Training" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QSpueK5UPsN7", - "colab_type": "text" - }, - "source": [ - "## CoLA\n", - "\n", - "See an interactive view of the CoLA dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NJnFmtpnPu0Y", - "colab_type": "code", - "colab": {} - }, - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path albert-base-v2\n", - " --task_name cola\n", - " --max_epochs 3\n", - " --gpus 1\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_MrNsTnqdz4z", - "colab_type": "text" - }, - "source": [ - "## MRPC\n", - "\n", - "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mrpc)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LBwRxg9Cb3d-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path distilbert-base-cased\n", - " --task_name mrpc\n", - " --max_epochs 3\n", - " --gpus 1\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iZhbn0HzfdCu", - "colab_type": "text" - }, - "source": [ - "## MNLI\n", - "\n", - " - The MNLI dataset is huge, so we aren't going to bother trying to train it here.\n", - "\n", - " - Let's just make sure our multi-dataloader logic is right by skipping over training and going straight to validation.\n", - "\n", - "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mnli)" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AvsZMOggfcWW", - "colab_type": "code", - "colab": {} - }, - "source": [ - "mocked_args = \"\"\"\n", - " --model_name_or_path distilbert-base-uncased\n", - " --task_name mnli\n", - " --max_epochs 1\n", - " --gpus 1\n", - " --limit_train_batches 10\n", - " --progress_bar_refresh_rate 20\"\"\".split()\n", - "\n", - "args = parse_args(mocked_args)\n", - "dm, model, trainer = main(args)\n", - "trainer.fit(model, dm)" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "fqlsVTj7McZ3" + }, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OIhHrRL-MnKK" + }, + "outputs": [], + "source": [ + "!pip install pytorch-lightning datasets transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "6yuQT_ZQMpCg" + }, + "outputs": [], + "source": [ + "from argparse import ArgumentParser\n", + "from datetime import datetime\n", + "from typing import Optional\n", + "\n", + "import datasets\n", + "import numpy as np\n", + "import pytorch_lightning as pl\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "from transformers import (\n", + " AdamW,\n", + " AutoModelForSequenceClassification,\n", + " AutoConfig,\n", + " AutoTokenizer,\n", + " 
get_linear_schedule_with_warmup,\n", + " glue_compute_metrics\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9ORJfiuiNZ_N" + }, + "source": [ + "## GLUE DataModule" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jW9xQhZxMz1G" + }, + "outputs": [], + "source": [ + "class GLUEDataModule(pl.LightningDataModule):\n", + "\n", + " task_text_field_map = {\n", + " 'cola': ['sentence'],\n", + " 'sst2': ['sentence'],\n", + " 'mrpc': ['sentence1', 'sentence2'],\n", + " 'qqp': ['question1', 'question2'],\n", + " 'stsb': ['sentence1', 'sentence2'],\n", + " 'mnli': ['premise', 'hypothesis'],\n", + " 'qnli': ['question', 'sentence'],\n", + " 'rte': ['sentence1', 'sentence2'],\n", + " 'wnli': ['sentence1', 'sentence2'],\n", + " 'ax': ['premise', 'hypothesis']\n", + " }\n", + "\n", + " glue_task_num_labels = {\n", + " 'cola': 2,\n", + " 'sst2': 2,\n", + " 'mrpc': 2,\n", + " 'qqp': 2,\n", + " 'stsb': 1,\n", + " 'mnli': 3,\n", + " 'qnli': 2,\n", + " 'rte': 2,\n", + " 'wnli': 2,\n", + " 'ax': 3\n", + " }\n", + "\n", + " loader_columns = [\n", + " 'datasets_idx',\n", + " 'input_ids',\n", + " 'token_type_ids',\n", + " 'attention_mask',\n", + " 'start_positions',\n", + " 'end_positions',\n", + " 'labels'\n", + " ]\n", + "\n", + " def __init__(\n", + " self,\n", + " model_name_or_path: str,\n", + " task_name: str ='mrpc',\n", + " max_seq_length: int = 128,\n", + " train_batch_size: int = 32,\n", + " eval_batch_size: int = 32,\n", + " **kwargs\n", + " ):\n", + " super().__init__()\n", + " self.model_name_or_path = model_name_or_path\n", + " self.task_name = task_name\n", + " self.max_seq_length = max_seq_length\n", + " self.train_batch_size = train_batch_size\n", + " self.eval_batch_size = eval_batch_size\n", + "\n", + " self.text_fields = self.task_text_field_map[task_name]\n", + " self.num_labels = self.glue_task_num_labels[task_name]\n", + " self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", + "\n", + " def setup(self, stage):\n", + " self.dataset = datasets.load_dataset('glue', self.task_name)\n", + "\n", + " for split in self.dataset.keys():\n", + " self.dataset[split] = self.dataset[split].map(\n", + " self.convert_to_features,\n", + " batched=True,\n", + " remove_columns=['label'],\n", + " )\n", + " self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]\n", + " self.dataset[split].set_format(type=\"torch\", columns=self.columns)\n", + "\n", + " self.eval_splits = [x for x in self.dataset.keys() if 'validation' in x]\n", + "\n", + " def prepare_data(self):\n", + " datasets.load_dataset('glue', self.task_name)\n", + " AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)\n", + " \n", + " def train_dataloader(self):\n", + " return DataLoader(self.dataset['train'], batch_size=self.train_batch_size)\n", + " \n", + " def val_dataloader(self):\n", + " if len(self.eval_splits) == 1:\n", + " return DataLoader(self.dataset['validation'], batch_size=self.eval_batch_size)\n", + " elif len(self.eval_splits) > 1:\n", + " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]\n", + "\n", + " def test_dataloader(self):\n", + " if len(self.eval_splits) == 1:\n", + " return DataLoader(self.dataset['test'], batch_size=self.eval_batch_size)\n", + " elif len(self.eval_splits) > 1:\n", + " return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in 
self.eval_splits]\n", + "\n", + " def convert_to_features(self, example_batch, indices=None):\n", + "\n", + " # Either encode single sentence or sentence pairs\n", + " if len(self.text_fields) > 1:\n", + " texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))\n", + " else:\n", + " texts_or_text_pairs = example_batch[self.text_fields[0]]\n", + "\n", + " # Tokenize the text/text pairs\n", + " features = self.tokenizer.batch_encode_plus(\n", + " texts_or_text_pairs,\n", + " max_length=self.max_seq_length,\n", + " pad_to_max_length=True,\n", + " truncation=True\n", + " )\n", + "\n", + " # Rename label to labels to make it easier to pass to model forward\n", + " features['labels'] = example_batch['label']\n", + "\n", + " return features" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "jQC3a6KuOpX3" + }, + "source": [ + "#### You could use this datamodule with standalone PyTorch if you wanted..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "JCMH3IAsNffF" + }, + "outputs": [], + "source": [ + "dm = GLUEDataModule('distilbert-base-uncased')\n", + "dm.prepare_data()\n", + "dm.setup('fit')\n", + "next(iter(dm.train_dataloader()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "l9fQ_67BO2Lj" + }, + "source": [ + "## GLUE Model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gtn5YGKYO65B" + }, + "outputs": [], + "source": [ + "class GLUETransformer(pl.LightningModule):\n", + " def __init__(\n", + " self,\n", + " model_name_or_path: str,\n", + " num_labels: int,\n", + " learning_rate: float = 2e-5,\n", + " adam_epsilon: float = 1e-8,\n", + " warmup_steps: int = 0,\n", + " weight_decay: float = 0.0,\n", + " train_batch_size: int = 32,\n", + " eval_batch_size: int = 32,\n", + " eval_splits: Optional[list] = None,\n", + " **kwargs\n", + " ):\n", + " super().__init__()\n", + "\n", + " self.save_hyperparameters()\n", + "\n", + " self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)\n", + " self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)\n", + " self.metric = datasets.load_metric(\n", + " 'glue',\n", + " self.hparams.task_name,\n", + " experiment_id=datetime.now().strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + " )\n", + "\n", + " def forward(self, **inputs):\n", + " return self.model(**inputs)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " outputs = self(**batch)\n", + " loss = outputs[0]\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx, dataloader_idx=0):\n", + " outputs = self(**batch)\n", + " val_loss, logits = outputs[:2]\n", + "\n", + " if self.hparams.num_labels >= 1:\n", + " preds = torch.argmax(logits, axis=1)\n", + " elif self.hparams.num_labels == 1:\n", + " preds = logits.squeeze()\n", + "\n", + " labels = batch[\"labels\"]\n", + "\n", + " return {'loss': val_loss, \"preds\": preds, \"labels\": labels}\n", + "\n", + " def validation_epoch_end(self, outputs):\n", + " if self.hparams.task_name == 'mnli':\n", + " for i, output in enumerate(outputs):\n", + " # matched or mismatched\n", + " split = self.hparams.eval_splits[i].split('_')[-1]\n", + " preds = torch.cat([x['preds'] for x in output]).detach().cpu().numpy()\n", + " labels = torch.cat([x['labels'] for x in 
output]).detach().cpu().numpy()\n", + " loss = torch.stack([x['loss'] for x in output]).mean()\n", + " self.log(f'val_loss_{split}', loss, prog_bar=True)\n", + " split_metrics = {f\"{k}_{split}\": v for k, v in self.metric.compute(predictions=preds, references=labels).items()}\n", + " self.log_dict(split_metrics, prog_bar=True)\n", + " return loss\n", + "\n", + " preds = torch.cat([x['preds'] for x in outputs]).detach().cpu().numpy()\n", + " labels = torch.cat([x['labels'] for x in outputs]).detach().cpu().numpy()\n", + " loss = torch.stack([x['loss'] for x in outputs]).mean()\n", + " self.log('val_loss', loss, prog_bar=True)\n", + " self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)\n", + " return loss\n", + "\n", + " def setup(self, stage):\n", + " if stage == 'fit':\n", + " # Get dataloader by calling it - train_dataloader() is called after setup() by default\n", + " train_loader = self.train_dataloader()\n", + "\n", + " # Calculate total steps\n", + " self.total_steps = (\n", + " (len(train_loader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.gpus)))\n", + " // self.hparams.accumulate_grad_batches\n", + " * float(self.hparams.max_epochs)\n", + " )\n", + "\n", + " def configure_optimizers(self):\n", + " \"Prepare optimizer and schedule (linear warmup and decay)\"\n", + " model = self.model\n", + " no_decay = [\"bias\", \"LayerNorm.weight\"]\n", + " optimizer_grouped_parameters = [\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n", + " \"weight_decay\": self.hparams.weight_decay,\n", + " },\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],\n", + " \"weight_decay\": 0.0,\n", + " },\n", + " ]\n", + " optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)\n", + "\n", + " scheduler = get_linear_schedule_with_warmup(\n", + " optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps\n", + " )\n", + " scheduler = {\n", + " 'scheduler': scheduler,\n", + " 'interval': 'step',\n", + " 'frequency': 1\n", + " }\n", + " return [optimizer], [scheduler]\n", + "\n", + " @staticmethod\n", + " def add_model_specific_args(parent_parser):\n", + " parser = ArgumentParser(parents=[parent_parser], add_help=False)\n", + " parser.add_argument(\"--learning_rate\", default=2e-5, type=float)\n", + " parser.add_argument(\"--adam_epsilon\", default=1e-8, type=float)\n", + " parser.add_argument(\"--warmup_steps\", default=0, type=int)\n", + " parser.add_argument(\"--weight_decay\", default=0.0, type=float)\n", + " return parser" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ha-NdIP_xbd3" + }, + "source": [ + "### ⚡ Quick Tip \n", + " - Combine arguments from your DataModule, Model, and Trainer into one for easy and robust configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3dEHnl3RPlAR" + }, + "outputs": [], + "source": [ + "def parse_args(args=None):\n", + " parser = ArgumentParser()\n", + " parser = pl.Trainer.add_argparse_args(parser)\n", + " parser = GLUEDataModule.add_argparse_args(parser)\n", + " parser = GLUETransformer.add_model_specific_args(parser)\n", + " parser.add_argument('--seed', type=int, default=42)\n", + " return parser.parse_args(args)\n", + "\n", + "\n", + "def main(args):\n", + " 
pl.seed_everything(args.seed)\n", + " dm = GLUEDataModule.from_argparse_args(args)\n", + " dm.prepare_data()\n", + " dm.setup('fit')\n", + " model = GLUETransformer(num_labels=dm.num_labels, eval_splits=dm.eval_splits, **vars(args))\n", + " trainer = pl.Trainer.from_argparse_args(args)\n", + " return dm, model, trainer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PkuLaeec3sJ-" + }, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "QSpueK5UPsN7" + }, + "source": [ + "## CoLA\n", + "\n", + "See an interactive view of the CoLA dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=cola)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NJnFmtpnPu0Y" + }, + "outputs": [], + "source": [ + "mocked_args = \"\"\"\n", + " --model_name_or_path albert-base-v2\n", + " --task_name cola\n", + " --max_epochs 3\n", + " --gpus 1\"\"\".split()\n", + "\n", + "args = parse_args(mocked_args)\n", + "dm, model, trainer = main(args)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "_MrNsTnqdz4z" + }, + "source": [ + "## MRPC\n", + "\n", + "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mrpc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LBwRxg9Cb3d-" + }, + "outputs": [], + "source": [ + "mocked_args = \"\"\"\n", + " --model_name_or_path distilbert-base-cased\n", + " --task_name mrpc\n", + " --max_epochs 3\n", + " --gpus 1\"\"\".split()\n", + "\n", + "args = parse_args(mocked_args)\n", + "dm, model, trainer = main(args)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "iZhbn0HzfdCu" + }, + "source": [ + "## MNLI\n", + "\n", + " - The MNLI dataset is huge, so we aren't going to bother trying to train it here.\n", + "\n", + " - Let's just make sure our multi-dataloader logic is right by skipping over training and going straight to validation.\n", + "\n", + "See an interactive view of the MRPC dataset in [NLP Viewer](https://huggingface.co/nlp/viewer/?dataset=glue&config=mnli)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "AvsZMOggfcWW" + }, + "outputs": [], + "source": [ + "mocked_args = \"\"\"\n", + " --model_name_or_path distilbert-base-uncased\n", + " --task_name mnli\n", + " --max_epochs 1\n", + " --gpus 1\n", + " --limit_train_batches 10\n", + " --progress_bar_refresh_rate 20\"\"\".split()\n", + "\n", + "args = parse_args(mocked_args)\n", + "dm, model, trainer = main(args)\n", + "trainer.fit(model, dm)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
Congratulations - Time to Join the Community!\n",
+    "
    \n",
+    "\n",
+    "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n",
+    "\n",
+    "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n",
+    "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n",
+    "\n",
+    "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n",
+    "\n",
+    "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n",
+    "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in the `#general` channel.\n",
+    "\n",
+    "### Interested in SOTA AI models? Check out [Bolts](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n",
+    "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning), that can be easily integrated into your own projects.\n",
+    "\n",
+    "* Please, star [Bolts](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n",
+    "\n",
+    "### Contributions!\n",
+    "The best way to contribute to our community is to become a code contributor! At any time you can go to the [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolts](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\".\n",
+    "\n",
+    "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n",
+    "* [Bolts good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n",
+    "* You can also contribute your own notebooks with useful examples!\n",
+    "\n",
+    "### Many thanks from the entire PyTorch Lightning team for your interest!\n",
+    "\n",
+    ""
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "name": "04-transformers-text-classification.ipynb",
+   "provenance": [],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/05-trainer-flags-overview.ipynb b/notebooks/05-trainer-flags-overview.ipynb
index 4589887ecb986..f1f93104f4552 100644
--- a/notebooks/05-trainer-flags-overview.ipynb
+++ b/notebooks/05-trainer-flags-overview.ipynb
@@ -1,2871 +1,2919 @@
 {
-    "nbformat": 4,
-    "nbformat_minor": 0,
-    "metadata": {
-        "colab": {
-            "name": "05-trainer-flags-overview.ipynb",
-            "provenance": [],
-            "collapsed_sections": []
-        },
-        "kernelspec": {
-            "name": "python3",
-            "display_name": "Python 3"
-        },
-        "accelerator": "GPU"
-    },
-    "cells": [
-        {
-            "cell_type": "markdown",
-            "metadata": {
-                "id": "goRmGIRI5cfC"
-            },
-            "source": [
-                "# Introduction to Lightning Flags ⚡🚩\n",
-                "\n",
-                "In this notebook, we'll go over the flags available in the `Trainer` object. Note that not everything will work in the Colab environment (multi-gpu, etc). 
This notebook accompanies the Trainer videos we'll be putting out.\n", - "\n", - "---\n", - " - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", - " - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", - " - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jKj5lgdr5j48" - }, - "source": [ - "--- \n", - "### Setup \n", - "First thing first, we need to install Lightning. Simply ```pip install pytorch-lightning```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "UGjilEHk4vb7" - }, - "source": [ - "! pip install pytorch-lightning" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "zaVUShmQ5n8Y" - }, - "source": [ - "import os\n", - "\n", - "from argparse import ArgumentParser\n", - "import torch\n", - "from torch import nn\n", - "from torch.nn import functional as F\n", - "from torch.utils.data import DataLoader\n", - "from torch.utils.data import random_split\n", - "from torchvision.datasets import MNIST\n", - "from torchvision import transforms\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning.metrics.functional import accuracy\n", - "\n", - "from torchvision.datasets.mnist import MNIST\n", - "from torchvision import transforms" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "6tgkS8IYZwY_" - }, - "source": [ - "# ------------\n", - "# data\n", - "# ------------\n", - "pl.seed_everything(1234)\n", - "batch_size = 32\n", - "\n", - "# Init DataLoader from MNIST Dataset\n", - "\n", - "dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", - "mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())\n", - "mnist_train, mnist_val = random_split(dataset, [55000, 5000])\n", - "\n", - "train_loader = DataLoader(mnist_train, batch_size=batch_size)\n", - "val_loader = DataLoader(mnist_val, batch_size=batch_size)\n", - "test_loader = DataLoader(mnist_test, batch_size=batch_size)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gEulmrbxwaYL" - }, - "source": [ - "### Simple AutoEncoder Model\n", - "\n", - "Were gonna define a simple Lightning model so we can play with all the settings of the Lightning Trainer.\n", - "\n", - "LightningModule is simply pure Pytorch reorganized into hooks, that represents all the steps in the training process.\n", - "\n", - "You can use LightningModule hooks to control every part of your model, but for the purpose of this video we will use a very simple MNIST classifier, a model that takes 28*28 grayscale images of hand written images, and can predict the digit between 0-9.\n", - "\n", - "The LightningModule can encompass a single model, like an image classifier, or a deep learning system composed of multiple models, like this auto encoder that contains an encoder and a decoder.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "x-34xKCI40yW" - }, - "source": [ - "class LitAutoEncoder(pl.LightningModule):\n", - "\n", - " def __init__(self, batch_size=32, lr=1e-3):\n", - " super().__init__()\n", - " self.encoder = nn.Sequential(\n", - " nn.Linear(28 * 28, 64),\n", - " nn.ReLU(),\n", - " nn.Linear(64, 3)\n", - " )\n", - " self.decoder = nn.Sequential(\n", - " nn.Linear(3, 
64),\n", - " nn.ReLU(),\n", - " nn.Linear(64, 28 * 28)\n", - " )\n", - " self.batch_size=batch_size\n", - " self.learning_rate=lr\n", - "\n", - " def forward(self, x):\n", - " # in lightning, forward defines the prediction/inference actions\n", - " embedding = self.encoder(x)\n", - " return embedding\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('train_loss', loss)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('val_loss', loss)\n", - " \n", - " def test_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " x = x.view(x.size(0), -1)\n", - " z = self.encoder(x)\n", - " x_hat = self.decoder(z)\n", - " loss = F.mse_loss(x_hat, x)\n", - " self.log('test_loss', loss)\n", - "\n", - " def configure_optimizers(self):\n", - " optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)\n", - " return optimizer" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VbxcRCrxiYly" - }, - "source": [ - "You'll notice the LightningModule doesn't have epoch and batch loops, we're not calling model.train() and model.eval(), and no mentions of CUDA or hardware. That's because it is all automated by the Lightning Trainer. All the engineering boilerplate is automated by the trainer: \n", - "\n", - "* Training loops\n", - "* Evaluation and test loops\n", - "* Calling model.train(), model.eval(), no_grad at the right time\n", - "* CUDA or to_device calls\n", - "\n", - "It also allows you to train your models on different hardware like GPUs and TPUs without changing your code!\n", - "\n", - "\n", - "### To use the lightning trainer simply:\n", - "\n", - "1. init your LightningModule and datasets\n", - "\n", - "2. init lightning trainer\n", - "\n", - "3. call trainer.fit\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HOk9c4_35FKg" - }, - "source": [ - "#####################\n", - "# 1. Init Model\n", - "#####################\n", - "\n", - "model = LitAutoEncoder()\n", - "\n", - "#####################\n", - "# 2. Init Trainer\n", - "#####################\n", - "\n", - "# these 2 flags are explained in the later sections...but for short explanation:\n", - "# - progress_bar_refresh_rate: limits refresh rate of tqdm progress bar so Colab doesn't freak out\n", - "# - max_epochs: only run 2 epochs instead of default of 1000\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=20, max_epochs=2)\n", - "\n", - "#####################\n", - "# 3. Train\n", - "#####################\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3meDako-Qa_6" - }, - "source": [ - "Our model is training just like that, using the Lightning defaults. 
The beauty of Lightning is that everything is easily configurable.\n", - "In our next videos were going to show you all the ways you can control your Trainer to do things like controlling your training, validation and test loops, running on GPUs and TPUs, checkpointing, early stopping, and a lot more.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z_Wry2MckQkI" - }, - "source": [ - "# Training loop and eval loop Flags" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0MkI1xB2vsLj" - }, - "source": [ - "\n", - "To really scale up your networks, you can use accelerators like GPUs. GPUs or Graphical Processing Units, parallelize matrix multiplications which enable speed ups of at least 100x over training on CPUs.\n", - "\n", - "Let's say you have a machine with 8 GPUs on it. You can set this flag to 1, 4, or 8 GPUs and lightning will automatically distribute your training for you.\n", - "\n", - "```\n", - "trainer = pl.Trainer(gpus=1)\n", - "```\n", - "\n", - "---------\n", - "\n", - "Lightning makes your code hardware agnostic... This means, you can switch between CPUs, GPUs without code changes.\n", - "\n", - "However, it requires forming good PyTorch habits:\n", - "\n", - "1. First, remove the .cuda() or .to() calls in your code.\n", - "2. Second, when you initialize a new tensor, set the device=self.device in the call since every lightningModule knows what gpu index or TPU core it is on.\n", - "\n", - "You can also use type_as and or you can register the tensor as a buffer in your module’s __init__ method with register_buffer().\n", - "\n", - "```\n", - "# before lightning\n", - "def forward(self, x):\n", - " z = torch.Tensor(2, 3)\n", - " z = z.cuda(0)\n", - "\n", - "# with lightning\n", - "def forward(self, x):\n", - " z = torch.Tensor(2, 3)\n", - " z = z.type_as(x, device=self.device)\n", - "```\n", - "\n", - "\n", - "```\n", - "class LitModel(LightningModule):\n", - "\n", - " def __init__(self):\n", - " ...\n", - " self.register_buffer(\"sigma\", torch.eye(3))\n", - " # you can now access self.sigma anywhere in your module\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hw6jJhhjvlSL" - }, - "source": [ - "Lightning Trainer automates all the engineering boilerplate like iterating over epochs and batches, training eval and test loops, CUDA and to(device) calls, calling model.train and model.eval.\n", - "\n", - "You still have full control over the loops, by using the following trainer flags:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pT5-ETH9eUg6" - }, - "source": [ - "## Calling validation steps\n", - "Sometimes, training an epoch may be pretty fast, like minutes per epoch. In this case, you might not need to validate on every epoch. 
Instead, you can actually validate after a few epochs.\n", - "\n", - "Use `check_val_every_n_epoch` flag to control the frequency of validation step:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Z-EMVvKheu3D" - }, - "source": [ - "# run val loop every 10 training epochs\n", - "trainer = pl.Trainer(check_val_every_n_epoch=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UOzZr9S2UcSO" - }, - "source": [ - "## val_check_interval\n", - "\n", - "In some cases where your epoch is very long, you might want to check validation within an epoch.\n", - "\n", - "You can also run validation step within your training epochs, by setting `val_check_interval` flag.\n", - "\n", - "Set `val_check_interval` to a float between [0.0 to 1.0] to check your validation set within a training epoch. For example, setting it to 0.25 will check your validation set 4 times during a training epoch.\n", - "\n", - "Default is set to 1.0" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9kbUbvrUVLrT" - }, - "source": [ - "# check validation set 4 times during a training epoch\n", - "trainer = pl.Trainer(val_check_interval=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Onm1gBsKVaw4" - }, - "source": [ - "When you have iterable data sets, or when streaming data for production use cases, it is useful to check the validation set every number of steps. \n", - "Set val_check_interval to an int:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "psn6DVb5Vi85" - }, - "source": [ - "# check validation set every 1000 training batches\n", - "# use this when using iterableDataset and your dataset has no length\n", - "# (ie: production cases with streaming data)\n", - "trainer = pl.Trainer(val_check_interval=1000)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QkoYonrWkb7-" - }, - "source": [ - "## num_sanity_val_steps \n", - "\n", - "You may have run into an issue, where you have a bug in your validation loop, but won't catch it until your training loop ends.\n", - "\n", - "and if your training loop takes hours or days, you will waste valuable compute.\n", - "\n", - "Instead, lightning automatically runs through 2 steps of validation in the beginning to catch these kinds of bugs up front.\n", - "\n", - "\n", - "The `num_sanity_val_steps` flag can help you run n batches of validation before starting the training routine.\n", - "\n", - "You can set it to 0 to turn it off" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "zOcT-ugSkiKW" - }, - "source": [ - "# turn it off\n", - "trainer = pl.Trainer(num_sanity_val_steps=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zS0ob1ZmTw56" - }, - "source": [ - "Set it to -1 to check all validation data before training" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "rzqvjA4UT263" - }, - "source": [ - "# check all validation data\n", - "trainer = pl.Trainer(num_sanity_val_steps=-1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - 
"id": "uMB41wq4T3Z2" - }, - "source": [ - "Or use any arbitrary number of validation steps" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lGP78aQzT7VS" - }, - "source": [ - "trainer = pl.Trainer(num_sanity_val_steps=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H-xaYRtd1rb-" - }, - "source": [ - "## Limit train, validation, and test batches\n", - "\n", - "You can set limits on how much of training, validation and test dataset you want your model to check. This is useful if you have really large validation or tests sets, for debugging or testing something that happens at the end of an epoch.\n", - "\n", - "Set the flag to int to specify the number of batches to run\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XiK5cFKL1rcA" - }, - "source": [ - "# run for only 10 batches\n", - "trainer = pl.Trainer(limit_test_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y4LK0g65RrBm" - }, - "source": [ - "For example, some metrics need to be computed on the entire validation results, such as AUC ROC. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8MmeRs2DR3dD" - }, - "source": [ - "trainer = pl.Trainer(limit_val_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xmigcNa1A2Vy" - }, - "source": [ - "You can use a float to limit the batches be percentage of the set on every epoch" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "W7uGJt8nA4tv" - }, - "source": [ - "# run through only 25% of the test set each epoch\n", - "trainer = pl.Trainer(limit_test_batches=0.25)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YRI8THtUN7_e" - }, - "source": [ - "# Training on GPUs\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R8FFkX_FwlfE" - }, - "source": [ - "To run on 1 GPU set the flag to 1" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nnzkf3KaOE27" - }, - "source": [ - "trainer = pl.Trainer(gpus=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cxBg47s5PB1P" - }, - "source": [ - "to run on 2 or 4 GPUs, set the flag to 2 or 4." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "cSEM4ihLrohT" - }, - "source": [ - "trainer = pl.Trainer(gpus=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZE6ZgwtNudro" - }, - "source": [ - "You can also select which GPU devices to run on, using a list of indices like [1, 4] \n", - "\n", - "or a string containing a comma separated list of GPU ids like '1,2'\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "gQkJtq0urrjq" - }, - "source": [ - "# list: train on GPUs 1, 4 (by bus ordering)\n", - "# trainer = Trainer(gpus='1, 4') # equivalent\n", - "trainer = pl.Trainer(gpus=[1, 4])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XghDPad4us74" - }, - "source": [ - "trainer = pl.Trainer(gpus=list(range(4)))\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6FVkKHpSPMTW" - }, - "source": [ - "You can use all the GPUs you have available by setting `gpus=-1`" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "r6cKQijYrtPe" - }, - "source": [ - "# trainer = Trainer(gpus='-1') - equivalent\n", - "trainer = pl.Trainer(gpus=-1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2C-fNLm3UGCV" - }, - "source": [ - "Lightning uses the PCI bus_id as the index for ordering GPUs." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_V75s7EhOFhE" - }, - "source": [ - "### `auto_select_gpus`\n", - "\n", - "You can save on GPUs by running in “exclusive mode”, meaning only one process at a time can access them. If your not sure which GPUs you should use when running exclusive mode, Lightning can automatically find unoccupied GPUs for you. \n", - "\n", - "Simply specify the number of gpus as an integer `gpus=k`, and set the trainer flag `auto_select_gpus=True`. Lightning will automatically help you find k gpus that are not occupied by other processes." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_Sd3XFsAOIwd" - }, - "source": [ - "# enable auto selection (will find two available gpus on system)\n", - "trainer = pl.Trainer(gpus=2, auto_select_gpus=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a5JGSBMQhJNp" - }, - "source": [ - "## analyzing GPU usage\n", - "\n", - "### log_gpu_memory\n", - "\n", - "This is useful to analyze the memory usage of your GPUs.\n", - "\n", - "To get the GPU memory usage for every GPU on the master node, set the flag to log_gpu_memory=all.\n", - "\n", - "Under the hood, lightning uses the nvidia-smi command which may slow your training down.\n", - "\n", - "Your logs can become overwhelmed if you log the usage from many GPUs at once. In this case, you can also set the flag to min_max which will log only the min and max usage across all the GPUs of the master node.\n", - "\n", - "Note that lightning is not logging the usage across all nodes for performance reasons." 
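+    "\n",
+    "If you want to see the raw numbers yourself, the query Lightning issues under the hood is roughly the following (a sketch assuming `nvidia-smi` is on the PATH, as it is on CUDA machines):\n",
+    "\n",
+    "```\n",
+    "# approximately the query Lightning runs for log_gpu_memory\n",
+    "!nvidia-smi --query-gpu=memory.used --format=csv,nounits,noheader\n",
+    "```\n",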
- { - "cell_type": "code", - "metadata": { - "id": "idus3ZGahOki" - }, - "source": [ - "# log all the GPUs (on master node only)\n", - "trainer = pl.Trainer(log_gpu_memory='all')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "-mevgiy_hkip" - }, - "source": [ - "To avoid the performance decrease, you can also set `log_gpu_memory='min_max'` to only log the min and max memory on the master node.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "SlvLJnWyhs7J" - }, - "source": [ - "# log only the min and max memory on the master node\n", - "trainer = pl.Trainer(log_gpu_memory='min_max')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "K82FLLIJVQG3" - }, - "source": [ - "\n", - "But what if you want to train on multiple machines and not just one?" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "YViQ6PXesAue" - }, - "source": [ - "# Training on multiple GPUs" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "WacbBQUivxQq" - }, - "source": [ - "Lightning makes your models hardware agnostic: you can run on GPUs with the flip of a flag. Lightning also supports training on multiple GPUs across many machines.\n", - "\n", - "You can do this by setting the num_nodes flag.\n", - "\n", - "The world size, or the total number of GPUs you are using, will be gpus * num_nodes.\n", - "\n", - "If I set gpus=8 and num_nodes=32, then I will be training on 256 GPUs." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "5iKckmDvr8zZ" - }, - "source": [ - "trainer = pl.Trainer(gpus=8, num_nodes=32)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "GgcSbDjjlSTh" - }, - "source": [ - "## distributed backends\n", - "\n", - "Under the hood, Lightning uses distributed data parallel (DDP) by default to distribute training across GPUs.\n", - "\n", - "Lightning's DDP implementation calls your script multiple times with the correct environment variables. In effect:\n", - "\n", - "1. Each GPU across each node gets its own process.\n", - "2. Each GPU gets visibility into a subset of the overall dataset. It will only ever see that subset.\n", - "3. Each process inits the model. (Make sure to set the random seed so that each model initializes with the same weights; see the sketch below.)\n", - "4. Each process performs a full forward and backward pass in parallel.\n", - "5. The gradients are synced and averaged across all processes.\n", - "6. Each process updates its optimizer.\n", - "\n", - "If you request multiple GPUs or nodes without setting a mode, DDP will be used automatically.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "n_Brr7F5wdtj" - }, - "source": [ - "# ddp = DistributedDataParallel\n", - "# trainer = pl.Trainer(gpus=2, num_nodes=2) is equivalent\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
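- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As a minimal sketch of the seeding advice in step 3: `pl.seed_everything` seeds Python, NumPy, and PyTorch in every process, so each DDP worker builds a model with identical initial weights. (The seed value 42 is arbitrary, and `model`, `train_loader`, and `val_loader` are assumed to be defined as in the earlier cells.)" - ] - },
- { - "cell_type": "code", - "metadata": {}, - "source": [ - "import pytorch_lightning as pl\n", - "\n", - "# seed all RNGs (python, numpy, torch) before building the model,\n", - "# so every DDP process starts from the same weights\n", - "pl.seed_everything(42)\n", - "\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },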
- { - "cell_type": "markdown", - "metadata": { - "id": "edxHyttC5J3e" - }, - "source": [ - "DDP is the fastest and recommended way to distribute your training, but you can pass other backends to the `distributed_backend` trainer flag when DDP is not supported.\n", - "\n", - "DDP isn't available when:\n", - "* running in a Jupyter Notebook, Google Colab, Kaggle, etc.,\n", - "* you have a nested script without a root package,\n", - "* or your script needs to invoke .fit or .test multiple times." - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "ZDh96mavxHxf" - }, - "source": [ - "### DDP_SPAWN\n", - "\n", - "In these cases, you can use `ddp_spawn` instead. `ddp_spawn` is exactly like DDP except that it uses `.spawn()` to start the training processes." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "JM5TKtgLxo37" - }, - "source": [ - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp_spawn')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "sebhVE3qrhKK" - }, - "source": [ - "We STRONGLY discourage using it, because it has limitations (due to Python and PyTorch):\n", - "\n", - "* Since .spawn() trains the model in subprocesses, the model on the main process does not get updated.\n", - "\n", - "* DataLoader(num_workers=N), where N is large, bottlenecks training, i.e. it will be VERY slow or won't work at all. This is a PyTorch limitation.\n", - "\n", - "* It forces everything to be picklable.\n", - "\n", - "DDP is MUCH faster than ddp_spawn. To be able to use DDP, we recommend you:\n", - "\n", - "1. Install a top-level module for your project using setup.py\n", - "\n", - "```\n", - "# setup.py\n", - "#!/usr/bin/env python\n", - "\n", - "from setuptools import setup, find_packages\n", - "\n", - "setup(name='src',\n", - "      version='0.0.1',\n", - "      description='Describe Your Cool Project',\n", - "      author='',\n", - "      author_email='',\n", - "      url='https://github.com/YourSeed',  # REPLACE WITH YOUR OWN GITHUB PROJECT LINK\n", - "      install_requires=[\n", - "          'pytorch-lightning'\n", - "      ],\n", - "      packages=find_packages()\n", - "      )\n", - "\n", - "```\n", - "\n", - "2. Set up your project like so:\n", - "\n", - "```\n", - "/project\n", - "    /src\n", - "        some_file.py\n", - "        /or_a_folder\n", - "    setup.py\n", - "```\n", - "3. Install as a root-level package\n", - "```\n", - "cd /project\n", - "pip install -e .\n", - "```\n", - "4. You can then call your scripts anywhere\n", - "```\n", - "cd /project/src\n", - "\n", - "python some_file.py --distributed_backend 'ddp' --gpus 8\n", - "```" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "cmB3I_oyw7a8" - }, - "source": [ - "### DP\n", - "\n", - "If you're using Windows, DDP is not supported. You can use `dp` for DataParallel instead: DataParallel uses multithreading instead of multiprocessing, and splits a batch across k GPUs. That is, if you have a batch of 32 and use DP with 2 GPUs, each GPU will process 16 samples, after which the root node will aggregate the results.\n", - "\n", - "DP use is discouraged by PyTorch and Lightning. Use DDP, which is more stable and at least 3x faster.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "OO-J0ISvlVCg" - }, - "source": [ - "# dp = DataParallel\n", - "trainer = pl.Trainer(gpus=2, distributed_backend='dp')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "Y7E2eHZKwUn9" - }, - "source": [ - "### DDP2\n", - "\n", - "In certain cases, it's advantageous to use ***all*** batches on the same machine, instead of a subset. For instance, in self-supervised learning, a common performance boost comes from increasing the number of negative samples.\n", - "\n", - "In this case, we can use DDP2, which behaves like DP within a machine and like DDP across nodes. DDP2 does the following:\n", - "\n", - "* Copies a subset of the data to each node.\n", - "* Inits a model on each node.\n", - "* Runs a forward and backward pass using DP.\n", - "* Syncs gradients across nodes.\n", - "* Applies the optimizer updates.\n", - "\n", - "\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "Y4xweqL3xHER" - }, - "source": [ - "# ddp2 = DistributedDataParallel + dp\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "HUf9ANyQkFFO" - }, - "source": [ - "\n", - "### mocking ddp\n", - "\n", - "Testing or debugging DDP can be hard, so we have a distributed backend that simulates DDP on CPUs to make it easier. Set `num_processes` to a number greater than 1 when using distributed_backend=\"ddp_cpu\" to mimic distributed training on a machine without GPUs. Note that while this is useful for debugging, it will not provide any speedup, since single-process PyTorch already makes efficient use of multiple CPUs." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "ZSal5Da9kHOf" - }, - "source": [ - "# Simulate DDP for debugging on your GPU-less laptop\n", - "trainer = pl.Trainer(distributed_backend=\"ddp_cpu\", num_processes=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "Br_btCy5lgES" - }, - "source": [ - "# Training on TPUs\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "DXkBNITdv44d" - }, - "source": [ - "Another option for accelerating your training is using TPUs.\n", - "A TPU is a Tensor Processing Unit, designed specifically for deep learning. Each TPU has 8 cores, where each core is optimized for 128x128 matrix multiplies. Google estimates that 8 TPU cores are about as fast as 4 V100 GPUs!\n", - "\n", - "A TPU pod hosts many TPUs. Currently, TPU pod v2 has 2048 cores! You can request a full pod from Google Cloud, or a “slice”, which gives you some subset of those 2048 cores.\n", - "\n", - "At this moment, TPUs are available on Google Cloud (GCP), Google Colab, and Kaggle environments.\n", - "\n", - "Lightning supports training on TPUs without any code adjustments to your model. Just like when using GPUs, Lightning automatically inserts the correct samplers - no need to do this yourself!\n", - "\n", - "Under the hood, Lightning uses the XLA framework, developed jointly by the Facebook and Google XLA teams, and we want to recognize their efforts in advancing TPU adoption of PyTorch.\n", - "\n", - "## tpu_cores\n", - "To train on TPUs, set the tpu_cores flag.\n", - "\n", - "When using Colab or Kaggle, the allowed values are 1 or 8 cores. When using Google Cloud, values above 8 are allowed.\n", - "\n", - "Your effective batch size is the batch size passed into the dataloader times the total number of TPU cores (see the arithmetic sketch below)." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "itP9y70gmD9M" - }, - "source": [ - "# int: train on a single core\n", - "trainer = pl.Trainer(tpu_cores=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "code", - "metadata": { - "id": "NJKnzPb3mKEg" - }, - "source": [ - "# int: train on all 8 cores\n", - "trainer = pl.Trainer(tpu_cores=8)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "8a4exfWUmOHq" - }, - "source": [ - "You can also choose which single TPU core to train on, by passing a one-element list with an index in [1-8]. This is not an officially supported use case, but we are working with the XLA team to improve this user experience.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "S6OrjE_bmT-_" - }, - "source": [ - "# list: train on a single selected core\n", - "trainer = pl.Trainer(tpu_cores=[2])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "Afqx3sFUmfWD" - }, - "source": [ - "To train on more than 8 cores (i.e. a pod), submit your script using the xla_dist script:\n", - "\n", - "```\n", - "python -m torch_xla.distributed.xla_dist\n", - "--tpu=$TPU_POD_NAME\n", - "--conda-env=torch-xla-nightly\n", - "--env=XLA_USE_BF16=1\n", - "-- python your_trainer_file.py\n", - "```\n", - "\n" - ] - },
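- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To make the effective batch size concrete, here is a tiny arithmetic sketch (plain Python, no Lightning API involved; the numbers are illustrative):" - ] - },
- { - "cell_type": "code", - "metadata": {}, - "source": [ - "# each of the 8 TPU cores gets its own copy of the dataloader,\n", - "# so every optimization step processes batch_size samples per core\n", - "batch_size = 32   # what you pass to DataLoader(...)\n", - "tpu_cores = 8\n", - "\n", - "effective_batch_size = batch_size * tpu_cores\n", - "print(effective_batch_size)  # 256\n" - ], - "execution_count": null, - "outputs": [] - },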
- { - "cell_type": "markdown", - "metadata": { - "id": "ncPvbUVQqKOh" - }, - "source": [ - "# Advanced distributed training\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "4MP7bEgnv7qK" - }, - "source": [ - "\n", - "Lightning supports distributed training across multiple GPUs and TPUs out of the box by setting trainer flags, but it also allows you to control the way sampling is done if you need to." - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "wdHiTfAMepKH" - }, - "source": [ - "## replace_sampler_ddp\n", - "In PyTorch, you must use torch.utils.data.distributed.DistributedSampler for multi-node or GPU training. The sampler makes sure each GPU sees the appropriate part of your data.\n", - "\n", - "```\n", - "# without lightning\n", - "def train_dataloader(self):\n", - "    dataset = MNIST(...)\n", - "    sampler = None\n", - "\n", - "    if self.on_tpu:\n", - "        sampler = DistributedSampler(dataset)\n", - "\n", - "    return DataLoader(dataset, sampler=sampler)\n", - "```\n", - "Lightning adds the correct samplers when needed, so there is no need to add them explicitly. By default it will add `shuffle=True` for the train sampler and `shuffle=False` for the val/test samplers.\n", - "\n", - "If you want to customize this behaviour, you can set `replace_sampler_ddp=False` and add your own distributed sampler.\n", - "\n", - "(Note: for iterable datasets, we don't do this automatically.)\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "ZfmcB_e_7HbE" - }, - "source": [ - "# build your own distributed sampler and tell Lightning not to replace it\n", - "sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)\n", - "dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)\n", - "\n", - "trainer = pl.Trainer(gpus=2, num_nodes=2, replace_sampler_ddp=False)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "-IOhk1n0lL3_" - }, - "source": [ - "## prepare_data_per_node\n", - "\n", - "When doing multi-node training, if your nodes share the same file system, you don't want to download data more than once, to avoid possible collisions. In that case, set this flag to False and Lightning will call the prepare_data hook only on the root GPU of the master node (i.e. only a single GPU).\n", - "\n", - "If your nodes don't share the same file system, the data needs to be downloaded on each node. This is the default behaviour: the flag defaults to True, and Lightning downloads the data on the root GPU of each node." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "WFBMUR48lM04" - }, - "source": [ - "trainer = pl.Trainer(gpus=2, num_nodes=2, prepare_data_per_node=False)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
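- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For reference, a minimal sketch of the hook this flag controls. prepare_data is meant for one-time, non-state-setting work such as downloads (MNIST from torchvision is just an example dataset here):" - ] - },
- { - "cell_type": "code", - "metadata": {}, - "source": [ - "from torchvision.datasets import MNIST\n", - "\n", - "class LitModel(pl.LightningModule):\n", - "\n", - "    def prepare_data(self):\n", - "        # called once per node (or once globally if prepare_data_per_node=False);\n", - "        # download only - do not assign state here (e.g. self.x = ...)\n", - "        MNIST('./data', train=True, download=True)\n", - "        MNIST('./data', train=False, download=True)\n" - ], - "execution_count": null, - "outputs": [] - },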
- { - "cell_type": "markdown", - "metadata": { - "id": "FKBwXqo4q-Vp" - }, - "source": [ - "## sync_batchnorm\n", - "\n", - "Batch norm is computed per GPU/TPU. This flag enables synchronization between batchnorm layers across all GPUs.\n", - "It is recommended if you have small batch sizes.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "GhaCLTEZrAQi" - }, - "source": [ - "trainer = pl.Trainer(gpus=4, sync_batchnorm=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "XuFA7VTFMY9-" - }, - "source": [ - "# Debugging flags\n", - "\n", - "Lightning offers a couple of flags to make debugging your models easier:\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "AKoS3fdml4Jx" - }, - "source": [ - "## Fast Dev Run\n", - "\n", - "To help you save time debugging, your first run should use the fast_dev_run flag.\n", - "\n", - "This won't generate logs or save checkpoints, but will touch every line of your code to make sure that it is working as intended.\n", - "\n", - "Think about this flag like a compiler: you make changes to your code, and run the Trainer with this flag to verify that your changes are bug-free.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "L5vuG7GSmhzK" - }, - "source": [ - "trainer = pl.Trainer(fast_dev_run=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "HRP1qQR5nT4p" - }, - "source": [ - "## overfit_batches\n", - "\n", - "Uses this fraction (if a float) or this number (if an int) of batches of the training set. If nonzero, the same training set is also used for validation and testing, and if the training dataloader has shuffle=True, Lightning will automatically disable it.\n", - "\n", - "Useful for quickly debugging, or for trying to overfit on purpose." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "NTM-dqGMnXms" - }, - "source": [ - "# use only 1% of the train set (and use the train set for val and test)\n", - "trainer = pl.Trainer(overfit_batches=0.01)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "code", - "metadata": { - "id": "c0LV0gC3nl1X" - }, - "source": [ - "# overfit on the same 10 batches\n", - "trainer = pl.Trainer(overfit_batches=10)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "ODN66NeVg_2o" - }, - "source": [ - "When limiting batches with `limit_test_batches` and multiple test dataloaders, the limit applies to each dataloader individually.\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "8aQx5SLeMz1R" - }, - "source": [ - "# accumulate_grad_batches\n", - "\n", - "\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "g8GczZXFwKC7" - }, - "source": [ - "The batch size controls the accuracy of the gradient estimate. Small batch sizes use less memory, but decrease that accuracy. When training large models, such as NLP transformers, it is useful to accumulate gradients over several batches before calling optimizer.step(). This allows for bigger effective batch sizes than what can actually fit on a GPU/TPU in a single step.\n", - "\n", - "Use accumulate_grad_batches to accumulate gradients every k batches, or per epoch as set up in a dict. The Trainer also calls optimizer.step() for the final batches even when their number is not divisible by k.\n", - "\n", - "For example, set accumulate_grad_batches to 4 to accumulate every 4 batches. In this case the effective batch size is batch_size * 4, so if your batch size is 32, effectively it will be 128." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "2jB6-Z_yPhhf" - }, - "source": [ - "# accumulate every 4 batches (effective batch size is batch*4)\n", - "trainer = pl.Trainer(accumulate_grad_batches=4)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
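- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conceptually, accumulate_grad_batches=4 corresponds to the following plain-PyTorch loop - a sketch for intuition, not Lightning's actual implementation. Here compute_loss, model, optimizer, and train_loader are assumed placeholders; gradients simply add up in param.grad across backward() calls until step() and zero_grad() run:" - ] - },
- { - "cell_type": "code", - "metadata": {}, - "source": [ - "accumulate = 4\n", - "\n", - "for i, batch in enumerate(train_loader):\n", - "    loss = compute_loss(model, batch) / accumulate  # scale so the accumulated gradient is an average\n", - "    loss.backward()                                 # grads accumulate in param.grad\n", - "\n", - "    if (i + 1) % accumulate == 0:\n", - "        optimizer.step()       # one update per 4 batches -> effective batch size x4\n", - "        optimizer.zero_grad()\n" - ], - "execution_count": null, - "outputs": [] - },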
- { - "cell_type": "markdown", - "metadata": { - "id": "_Yi-bdTOgINC" - }, - "source": [ - "You can also pass a dictionary to specify a different accumulation per epoch. We can set it to `{5: 3, 10: 20}` to have no accumulation for epochs 1 to 4, accumulate 3 batches for epochs 5 to 10, and 20 batches after that." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "X3xsoZ3YPgBv" - }, - "source": [ - "# no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that\n", - "trainer = pl.Trainer(accumulate_grad_batches={5: 3, 10: 20})\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "myzH8mV4M1_9" - }, - "source": [ - "# 16 bit precision\n", - "\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "v9EaFAonwOk6" - }, - "source": [ - "Most deep learning frameworks, like PyTorch, train with 32-bit floating-point arithmetic, but many models can still achieve full accuracy using half the precision.\n", - "\n", - "In 2017, NVIDIA researchers successfully used a combination of 32- and 16-bit precision (also known as mixed precision) and achieved the same accuracy as 32-bit precision training.\n", - "\n", - "The two main advantages are:\n", - "\n", - "- a reduction in memory requirements, which enables larger batch sizes and models;\n", - "- a speed-up in compute: on Ampere, Turing, and Volta architectures, 16-bit precision models can train at least 3 times faster.\n", - "\n", - "As of PyTorch 1.6, NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp.\n", - "\n", - "This package supersedes the apex package developed by NVIDIA." - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "TjNypZPHnxvJ" - }, - "source": [ - "## precision\n", - "\n", - "Use the precision flag to switch from full precision (32) to half precision (16). It can be used on CPU, GPU, or TPUs.\n", - "\n", - "When using PyTorch 1.6+, Lightning uses the native AMP implementation to support 16-bit precision.\n", - "\n", - "On TPUs, 16-bit will use torch.bfloat16, but tensor printing will still show torch.float32." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "kBZKMVx1nw-D" - }, - "source": [ - "# 16-bit precision\n", - "trainer = pl.Trainer(gpus=1, precision=16)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "VJGj3Jh7oQXU" - }, - "source": [ - "In earlier versions of Lightning, we used NVIDIA Apex for 16-bit precision. Apex was the first library to attempt 16-bit training, and the automatic mixed precision library (amp) has since been merged into core PyTorch as of 1.6.\n", - "\n", - "If you insist on using Apex, you can set the amp_backend flag to 'apex' and install Apex on your own." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "BDV1trAUPc9h" - }, - "source": [ - "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "HK5c_aVfNV4e" - }, - "source": [ - "## amp_level\n", - "Apex includes 4 optimization levels:\n", - "* O0 (FP32 training)\n", - "* O1 (Conservative Mixed Precision): only some whitelisted ops are done in FP16.\n", - "* O2 (Fast Mixed Precision): this is the standard mixed precision training. It maintains FP32 master weights, and optimizer.step acts directly on the FP32 master weights.\n", - "* O3 (FP16 training): full FP16. Passing keep_batchnorm_fp32=True can speed things up, as cudnn batchnorm is faster anyway.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "FshMFPowNbWt" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex', amp_level='O2')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "y8KEr1YvNgkC" - }, - "source": [ - "# `auto_scale_batch_size`\n", - "\n", - "\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "7F1pKFIuwSFl" - }, - "source": [ - "Lightning can help you tune your model with the auto_scale_batch_size flag, which tries to find the largest batch size that fits into memory before you start your training.\n", - "A larger batch size often yields a better estimate of the gradients, but may also result in a longer training time.\n", - "\n", - "Set it to True to initially run a batch size finder that looks for the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "9_jE-iyyheIv" - }, - "source": [ - "trainer = pl.Trainer(auto_scale_batch_size=True)\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "yaHsJvwFhNJt" - }, - "source": [ - "You can set the value to `power`. `power` scaling starts from a batch size of 1 and keeps doubling it until an out-of-memory (OOM) error is encountered.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "Qx0FbQrphgw1" - }, - "source": [ - "trainer = pl.Trainer(auto_scale_batch_size='power')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "8bwgVF9zhZ75" - }, - "source": [ - "You can also set it to `binsearch`, which continues to fine-tune the batch size by performing a binary search.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "QObXNs3yNrg9" - }, - "source": [ - "# run batch size scaling, result overrides hparams.batch_size\n", - "trainer = pl.Trainer(auto_scale_batch_size='binsearch')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "5OWdhSsZjqW7" - }, - "source": [ - "This feature expects a batch_size field in the hparams of your model, i.e. model.hparams.batch_size must exist; it will be overridden by the result of this algorithm.\n", - "\n", - "Additionally, your train_dataloader() method should depend on this field for the feature to work (see the sketch below).\n", - "\n", - "In short, the algorithm works by:\n", - "1. Dumping the current state of the model and trainer.\n", - "\n", - "2. Iterating until convergence or until the maximum number of tries, max_trials (default 25), has been reached:\n", - "* Call the fit() method of the trainer. This evaluates steps_per_trial (default 3) training steps. Each training step can trigger an OOM error if the tensors (training batch, weights, gradients, etc.) allocated during the steps have too large a memory footprint.\n", - "  * If an OOM error is encountered, decrease the batch size;\n", - "  * else, increase it.\n", - "* How much the batch size is increased or decreased is determined by the chosen strategy.\n", - "\n", - "3. Saving the found batch size to model.hparams.batch_size.\n", - "\n", - "4. Restoring the initial state of the model and trainer.\n", - "\n" - ] - },
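- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A minimal sketch of a module the batch size finder can work with: batch_size is saved into hparams, and train_dataloader() reads it back, so each trial actually changes the batch size (train_dataset is assumed to be defined):" - ] - },
- { - "cell_type": "code", - "metadata": {}, - "source": [ - "class LitModel(pl.LightningModule):\n", - "\n", - "    def __init__(self, batch_size=32):\n", - "        super().__init__()\n", - "        self.save_hyperparameters()  # exposes self.hparams.batch_size\n", - "\n", - "    def train_dataloader(self):\n", - "        # the finder overrides self.hparams.batch_size between trials\n", - "        return DataLoader(train_dataset, batch_size=self.hparams.batch_size)\n" - ], - "execution_count": null, - "outputs": [] - },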
- { - "cell_type": "markdown", - "metadata": { - "id": "q4CvxfZmOWBd" - }, - "source": [ - "# `auto_lr_find`\n", - "\n", - "\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "j85e8usNwdBV" - }, - "source": [ - "Selecting a good learning rate for your deep learning training is essential for both better performance and faster convergence.\n", - "\n", - "Even optimizers such as Adam, which self-adjust the learning rate, can benefit from a better initial choice.\n", - "\n", - "To reduce the amount of guesswork in choosing a good initial learning rate, you can use Lightning's automatic learning rate finder.\n", - "\n", - "The learning rate finder does a small run where the learning rate is increased after each processed batch and the corresponding loss is logged. The result is an lr vs. loss plot that can be used as guidance for choosing an optimal initial lr.\n", - "\n", - "\n", - "Warning: for the moment, this feature only works with models having a single optimizer. LR finder support for DDP is not implemented yet; it is coming soon.\n", - "\n", - "\n", - "***auto_lr_find=***\n", - "\n", - "In the most basic use case, this feature can be enabled during trainer construction with Trainer(auto_lr_find=True).\n", - "When .fit(model) is called, the LR finder will automatically run before any training is done. The lr that is found and used will be written to the console and logged together with all other hyperparameters of the model." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "iuhve9RBOfFh" - }, - "source": [ - "# default used by the Trainer (no learning rate finder)\n", - "trainer = pl.Trainer(auto_lr_find=False)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "BL-gjXNCPDXk" - }, - "source": [ - "When enabled, the finder writes the found learning rate to an attribute of your model, which it looks up as self.lr or self.learning_rate.\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "wEb-vIMmPJQf" - }, - "source": [ - "class LitModel(LightningModule):\n", - "\n", - "    def __init__(self, learning_rate):\n", - "        super().__init__()\n", - "        self.learning_rate = learning_rate\n", - "\n", - "    def configure_optimizers(self):\n", - "        return Adam(self.parameters(), lr=self.learning_rate)\n", - "\n", - "# finds the learning rate automatically\n", - "# and sets self.lr or self.learning_rate to it\n", - "trainer = pl.Trainer(auto_lr_find=True)\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)\n" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "RweqvpnVPPSh" - }, - "source": [ - "To use an attribute with an arbitrary name, pass that name as the value of auto_lr_find:\n" - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "4LKI39IfPLJv" - }, - "source": [ - "# looks up and sets model.my_value\n", - "trainer = pl.Trainer(auto_lr_find='my_value')\n", - "\n", - "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "9VAhPRKbPX-m" - }, - "source": [ - "Under the hood, when you call tune, it runs the learning rate finder.\n", - "\n", - "If you want to inspect the results of the learning rate finder before doing any actual training, or just play around with the parameters of the algorithm, you can invoke the lr_find method of the trainer. A typical example would look like:\n", - "\n", - "\n", - "```\n", - "trainer = pl.Trainer(auto_lr_find=True)\n", - "\n", - "# Run learning rate finder\n", - "lr_finder = trainer.lr_find(model)\n", - "\n", - "# Results can be found in\n", - "lr_finder.results\n", - "\n", - "# Plot with\n", - "fig = lr_finder.plot(suggest=True)\n", - "fig.show()\n", - "\n", - "# Pick point based on plot, or get suggestion\n", - "new_lr = lr_finder.suggestion()\n", - "\n", - "# update hparams of the model\n", - "model.hparams.lr = new_lr\n", - "\n", - "# Fit model\n", - "trainer.fit(model)\n", - "```\n", - "\n", - "The figure produced by lr_finder.plot() should look something like the figure below. It is recommended not to pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (the red point). This is the point returned by lr_finder.suggestion().\n", - "\n", - "*(Figure: lr_finder.plot() output - loss vs. learning rate, with the suggested point marked in red.)*" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "tn1RV-jfOjt1" - }, - "source": [ - "# `benchmark`\n", - "\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "rsmTl5zfwjM3" - }, - "source": [ - "You can try to speed up your system by setting `benchmark=True`, which enables cudnn.benchmark. This flag is likely to increase the speed of your system if your input sizes don't change. It makes the cudnn auto-tuner look for the optimal set of algorithms for the given hardware configuration, which usually leads to faster runtime.\n", - "But if your input sizes change at each iteration, cudnn will benchmark every time a new size appears, possibly leading to worse runtime performance." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "dWr-OCBgQCeb" - }, - "source": [ - "trainer = pl.Trainer(gpus=1, benchmark=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "qwAvSKYGa24K" - }, - "source": [ - "# `deterministic`\n", - "\n" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "tl5mfmafwmat" - }, - "source": [ - "PyTorch does not guarantee reproducible results, even when using identical seeds. To guarantee reproducible results, you can remove most of the randomness from your process by setting the `deterministic` flag to True.\n", - "\n", - "Note that it might make your system slower." - ] - },
- { - "cell_type": "code", - "metadata": { - "id": "Mhv5LZ3HbNCK" - }, - "source": [ - "trainer = pl.Trainer(gpus=1, deterministic=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "u_5eJSvTf60f" - }, - "source": [ - "# Exploding and vanishing gradients" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "id": "B6drjh4pq6Jv" - }, - "source": [ - "## track_grad_norm\n", - "\n", - "You can debug your gradient norm to identify exploding or vanishing gradients, using the `track_grad_norm` flag.\n", - "\n", - "Set the value to 2 to track the 2-norm, or to p to track any p-norm." - ] - },
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2taHUir8rflR" - }, - "source": [ - "# track the 2-norm\n", - "trainer = pl.Trainer(track_grad_norm=2)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3vHKxmruk62f" - }, - "source": [ - "It may also be set to ‘inf’ to track the infinity-norm." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "g7TbD6SxlAjP" - }, - "source": [ - "trainer = pl.Trainer(track_grad_norm='inf')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TcMlRe7ywpe6" - }, - "source": [ - "## Gradient clipping\n", - "\n", - "\n", - "Exploding gradients refer to the problem of gradients growing too large and overflowing during training, making the model unstable. Gradient clipping will ‘clip’ the gradients, capping them at a threshold value to prevent them from getting too large. To enable it, set `gradient_clip_val` (the default is 0.0, i.e. no clipping).\n", - "\n", - "Gradient clipping is most useful when training is unstable, e.g. when the loss spikes or overflows to NaN; clip values between 0.5 and 1.0 are a common starting point." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jF9JwmbOgOWF" - }, - "source": [ - "trainer = pl.Trainer(gradient_clip_val=0.1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ggb4MkkQrr1h" - }, - "source": [ - "# truncated_bptt_steps\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s1Iu6PyAw9_r" - }, - "source": [ - "If you have a large recurrent model, you can use the truncated_bptt_steps flag to split up the backprop over portions of the sequence. 
This flag will automatically truncate your batches and the trainer will apply Truncated Backprop to it.\n", - "\n", - "Make sure your batches have a sequence dimension.\n", - "\n", - "Lightning takes care of splitting your batch along the time-dimension:\n", - "```\n", - "# we use the second dimension as the time dimension\n", - "# (batch, time, ...)\n", - "sub_batch = batch[0, 0:t, ...]\n", - "```\n", - "Using this feature requires updating your LightningModule’s pytorch_lightning.core.LightningModule.training_step() to include a hiddens argument carrying the hidden state:\n", - "\n", - "```\n", - "# Truncated back-propagation through time\n", - "def training_step(self, batch, batch_idx, hiddens):\n", - "    # hiddens are the hidden states from the previous truncated backprop step\n", - "    out, hiddens = self.lstm(data, hiddens)\n", - "\n", - "    return {\n", - "        \"loss\": ...,\n", - "        \"hiddens\": hiddens  # remember to detach() this\n", - "    }\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "WiTF1VMtruMU" - }, - "source": [ - "# backprop every 5 steps in a batch\n", - "trainer = pl.Trainer(truncated_bptt_steps=5)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8XI_kEWkS-nT" - }, - "source": [ - "To modify how the batch is split, override pytorch_lightning.core.LightningModule.tbptt_split_batch():\n", - "\n", - "```\n", - "class LitMNIST(LightningModule):\n", - "    def tbptt_split_batch(self, batch, split_size):\n", - "        # do your own splitting on the batch\n", - "        return splits\n", - "```\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oLbEmbmupwQ8" - }, - "source": [ - "# reload_dataloaders_every_epoch\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CLdNGVv9xD_L" - }, - "source": [ - "Set to True to reload the dataloaders every epoch (instead of loading them just once at the beginning of training):\n", - "\n", - "```\n", - "# if False (default)\n", - "train_loader = model.train_dataloader()\n", - "for epoch in epochs:\n", - "    for batch in train_loader:\n", - "        ...\n", - "\n", - "# if True\n", - "for epoch in epochs:\n", - "    train_loader = model.train_dataloader()\n", - "    for batch in train_loader:\n", - "        ...\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "10AXthXxp311" - }, - "source": [ - "trainer = pl.Trainer(reload_dataloaders_every_epoch=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f513EYl0bmmL" - }, - "source": [ - "# Callbacks\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pt7iGh4xNs5" - }, - "source": [ - "\n", - "Lightning Callbacks are self-contained programs that can be reused across projects.\n", - "Callbacks should capture NON-ESSENTIAL logic that is NOT required for your LightningModule to run. Lightning includes a few built-in callbacks that can be used with flags like early stopping and model checkpointing, but you can also create your own callbacks to add any functionality to your models.\n", - "\n", - "The callback API includes hooks that allow you to add logic at every point of your training:\n", - "setup, teardown, on_epoch_start, on_epoch_end, on_batch_start, on_batch_end, on_init_start, on_keyboard_interrupt, etc. 
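For illustration, here is a minimal sketch of a custom callback built on two of these hooks (the hook signatures are Lightning's standard `Callback` hooks; the timer class itself is made up for this example):

```
import time

from pytorch_lightning.callbacks import Callback

class EpochTimer(Callback):
    # illustrative sketch: measure how long each epoch takes

    def on_epoch_start(self, trainer, pl_module):
        # remember when the epoch began
        self._epoch_start = time.time()

    def on_epoch_end(self, trainer, pl_module):
        # report the elapsed time for this epoch
        print(f"Epoch took {time.time() - self._epoch_start:.1f}s")
```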
\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1t84gvDNsUuh" - }, - "source": [ - "## callbacks\n", - "\n", - "Use **callbacks=** to pass a list of user defined callbacks. These callbacks DO NOT replace the built-in callbacks (loggers or EarlyStopping). \n", - "\n", - "In this example, we create a dummy callback that prints a message when training starts and ends, using on_train_start and on_train_end hooks." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oIXZYabub3f0" - }, - "source": [ - "from pytorch_lightning.callbacks import Callback\n", - "\n", - "class PrintCallback(Callback):\n", - " def on_train_start(self, trainer, pl_module):\n", - " print(\"Training is started!\")\n", - " def on_train_end(self, trainer, pl_module):\n", - " print(\"Training is done.\")\n", - "\n", - "# a list of callbacks\n", - "callbacks = [PrintCallback()]\n", - "trainer = pl.Trainer(callbacks=callbacks)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cNF74CLYfJJu" - }, - "source": [ - "# Model checkpointing\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2blgquBrxLtS" - }, - "source": [ - "Checkpoints capture the exact value of all parameters used by a model.\n", - "\n", - "Checkpointing your training allows you to resume a training process in case it was interrupted, fine-tune a model or use a pre-trained model for inference without having to retrain the model.\n", - "\n", - "Lightning automates saving and loading checkpoints so you restore a training session, saving all the required parameters including: \n", - "* 16-bit scaling factor (apex)\n", - "* Current epoch\n", - "* Global step\n", - "* Model state_dict\n", - "* State of all optimizers\n", - "* State of all learningRate schedulers\n", - "* State of all callbacks\n", - "* The hyperparameters used for that model if passed in as hparams (Argparse.Namespace)\n", - "\n", - "By default Lightning will save a checkpoint in the working directory, which will be updated every epoch.\n", - "\n", - "### Automatic saving\n", - "By default Lightning will save a checkpoint in the end of the first epoch in the working directory, which will be updated every epoch." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XGu0JULrg9l7" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(default_root_path=os.getcwd())\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3s9OjkGuhq1W" - }, - "source": [ - "To change the checkpoint path pass in **default_root_dir=**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "DgdxkrIQhvfw" - }, - "source": [ - "trainer = pl.Trainer(default_root_dir='/your/path/to/save/checkpoints')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qyvj_bkWrJiE" - }, - "source": [ - "\n", - "You can also have Lightning update your checkpoint based on a specific metric that you are logging (using self.log), by passing the key to `monitor=`. 
For example, to save checkpoints based on the validation loss, logged as `val_loss`, you can pass:\n", - "\n", - "\n", - "```\n", - "checkpoint_callback = ModelCheckpoint(\n", - "    filepath=os.getcwd(),\n", - "    save_top_k=1,\n", - "    verbose=True,\n", - "    monitor='val_loss',\n", - "    mode='min',\n", - "    prefix=''\n", - ")\n", - "```\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "YzYMivw1rO1O" - }, - "source": [ - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "\n", - "trainer = pl.Trainer(callbacks=[ModelCheckpoint(monitor='val_loss')])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5hYs_FV8iDMn" - }, - "source": [ - "You can modify the behavior of checkpointing by creating your own callback and passing it to the trainer. \n", - "You can control:\n", - "* filepath - where checkpoints are saved\n", - "* save_top_k - save the top k models\n", - "* verbose\n", - "* monitor - the metric to monitor\n", - "* mode\n", - "* prefix\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Tb1K2VYDiNTu" - }, - "source": [ - "from pytorch_lightning.callbacks import ModelCheckpoint\n", - "\n", - "# DEFAULTS used by the Trainer\n", - "checkpoint_callback = ModelCheckpoint(\n", - "    filepath=os.getcwd(),\n", - "    save_top_k=3,\n", - "    verbose=True,\n", - "    monitor='val_loss',\n", - "    mode='min',\n", - "    prefix='',\n", - ")\n", - "\n", - "trainer = pl.Trainer(callbacks=[checkpoint_callback])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YKhZ6xRojJcl" - }, - "source": [ - "You can disable checkpointing by passing:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Yt8zd2ZFjOXX" - }, - "source": [ - "trainer = pl.Trainer(checkpoint_callback=False)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HcLy8asCjrj9" - }, - "source": [ - "### Manual saving\n", - "\n", - "You can manually save checkpoints and restore your model from the checkpointed state.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kZSkMJf0jR4x" - }, - "source": [ - "trainer.fit(model)\n", - "trainer.save_checkpoint(\"example.ckpt\")\n", - "new_model = LitAutoEncoder.load_from_checkpoint(checkpoint_path=\"example.ckpt\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X2d9cjVPj7CP" - }, - "source": [ - "### Checkpoint Loading\n", - "To load a model along with its weights, biases and module_arguments, use the following method:\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BpAFfg5zkFmH" - }, - "source": [ - "model = LitAutoEncoder.load_from_checkpoint(PATH)\n", - "\n", - "print(model.learning_rate)\n", - "# prints the learning_rate you used in this checkpoint\n", - "\n", - "model.eval()\n", - "y_hat = model(x)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jTQ3mxSJkhFN" - }, - "source": [ - "But if you don’t want to use the values saved in the checkpoint, you can pass in your own here:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IoMcOh9-kfUP" - }, - "source": [ - "class LitAutoEncoder(LightningModule):\n", - "\n", - "    def __init__(self, in_dim, out_dim):\n", - "        super().__init__()\n", - "        self.save_hyperparameters()\n", - 
" self.l1 = nn.Linear(self.hparams.in_dim, self.hparams.out_dim)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ITPVY8mNknut" - }, - "source": [ - "you can restore the model like this\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "H7XeRJzVkuY8" - }, - "source": [ - "# if you train and save the model like this it will use these values when loading\n", - "# the weights. But you can overwrite this\n", - "LitAutoEncoder(in_dim=32, out_dim=10)\n", - "\n", - "# uses in_dim=32, out_dim=10\n", - "model = LitAutoEncoder.load_from_checkpoint(PATH)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "14WwGpnVk0a4" - }, - "source": [ - "# uses in_dim=128, out_dim=10\n", - "model = LitAutoEncoder.load_from_checkpoint(PATH, in_dim=128, out_dim=10)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bY5s6wP_k1CU" - }, - "source": [ - "\n", - "\n", - "## Restoring Training State (resume_from_checkpoint)\n", - "If your training was cut short for some reason, you can resume exactly from where you left off using the `resume_from_checkpoint` flag, which will automatically restore model, epoch, step, LR schedulers, apex, etc..." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9zfhHtyrk3rO" - }, - "source": [ - "model = LitAutoEncoder()\n", - "trainer = pl.Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')\n", - "\n", - "# automatically restores model, epoch, step, LR schedulers, apex, etc...\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xkKdvALFsmT2" - }, - "source": [ - "## weights_save_path\n", - "You can specify a directory for saving weights file using `weights_save_path`.\n", - "\n", - "(If you are using a custom checkpoint callback, the checkpoint callback will override this flag)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "9OwHHFcCsrgT" - }, - "source": [ - "# save to your custom path\n", - "trainer = pl.Trainer(weights_save_path='my/path')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "PbNtlJ9Wsscf" - }, - "source": [ - "# if checkpoint callback used, then overrides the weights path\n", - "# **NOTE: this saves weights to some/path NOT my/path\n", - "checkpoint = ModelCheckpoint(filepath='some/path')\n", - "trainer = pl.Trainer(\n", - " callbacks=[checkpoint],\n", - " weights_save_path='my/path'\n", - ")\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uDdxCuyHdWQt" - }, - "source": [ - "# Early stopping\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fqAy3ihRxTfR" - }, - "source": [ - "The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed, to help you avoid overfitting.\n", - "\n", - "To enable Early Stopping you can init the EarlyStopping callback, and pass it to `callbacks=` trainer flag. 
The callback will look for a logged metric to early stop on.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lFx976CheH93" - }, - "source": [ - "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", - "\n", - "trainer = pl.Trainer(callbacks=[EarlyStopping('val_loss')])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MwpJfTvjeOwF" - }, - "source": [ - "You can customize the callback using the following params:\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V6I9h6HteK2U" - }, - "source": [ - "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", - "\n", - "early_stop_callback = EarlyStopping(\n", - " monitor='val_accuracy',\n", - " min_delta=0.00,\n", - " patience=3,\n", - " verbose=False,\n", - " mode='max'\n", - ")\n", - "trainer = pl.Trainer(callbacks=[early_stop_callback])\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7TAIerPYe_Q1" - }, - "source": [ - "The EarlyStopping callback runs at the end of every validation epoch, which, under the default configuration, happens after every training epoch. However, the frequency of validation can be modified by setting various parameters on the Trainer, for example check_val_every_n_epoch and val_check_interval. It must be noted that the patience parameter counts the number of validation epochs with no improvement, and not the number of training epochs. Therefore, with parameters check_val_every_n_epoch=10 and patience=3, the trainer will perform at least 40 training epochs before being stopped." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VoKrX2ENh9Fg" - }, - "source": [ - "# Logging" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-CQTPKd7iKLm" - }, - "source": [ - "Lightning has built in integration with various loggers such as TensorBoard, wandb, commet, etc.\n", - "\n", - "\n", - "You can pass any metrics you want to log during training to `self.log`, such as loss or accuracy. Similarly, pass in to self.log any metric you want to log during validation step.\n", - "\n", - "These values will be passed in to the logger of your choise. 
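As a quick sketch of what this looks like inside a step (assuming a classifier-style LightningModule; `accuracy` is the functional metric imported at the top of this notebook, and the metric names are illustrative):

```
def validation_step(self, batch, batch_idx):
    x, y = batch
    logits = self(x)
    loss = F.cross_entropy(logits, y)
    # both values are routed to whichever logger(s) the Trainer was given
    self.log('val_loss', loss)
    self.log('val_acc', accuracy(torch.argmax(logits, dim=1), y))
```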
Simply pass any supported logger to the `logger=` trainer flag — either a single Logger or an iterable collection of Loggers — for experiment tracking.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ty5VPS3AiS8L" - }, - "source": [ - "from pytorch_lightning.loggers import TensorBoardLogger\n", - "\n", - "# default logger used by trainer\n", - "logger = TensorBoardLogger(\n", - "    save_dir=os.getcwd(),\n", - "    version=1,\n", - "    name='lightning_logs'\n", - ")\n", - "trainer = pl.Trainer(logger=logger)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jc5oWNpoiuuc" - }, - "source": [ - "Lightning supports the use of multiple loggers, just pass a list to the Trainer.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BlYwMRRyivp_" - }, - "source": [ - "from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger\n", - "logger1 = TensorBoardLogger('tb_logs', name='my_model')\n", - "logger2 = TestTubeLogger('tb_logs', name='my_model')\n", - "trainer = pl.Trainer(logger=[logger1, logger2])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7EyspQPh7iQ" - }, - "source": [ - "## flush_logs_every_n_steps\n", - "\n", - "Use this flag to determine how often logs are written to disk." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Em_XvsmyiBbk" - }, - "source": [ - "trainer = pl.Trainer(flush_logs_every_n_steps=100)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_vDeKE98qsl1" - }, - "source": [ - "## log_every_n_steps\n", - "How often to add logging rows (this does not write to disk).\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HkqD7D_0w1Tt" - }, - "source": [ - "trainer = pl.Trainer(log_every_n_steps=1000)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9uw0gfe422CT" - }, - "source": [ - "# info logging" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dQXpt0aatDGo" - }, - "source": [ - "### default_root_dir\n", - "\n", - "The default path for logs and weights when no logger or pytorch_lightning.callbacks.ModelCheckpoint callback is passed. On certain clusters you might want to separate where logs and checkpoints are stored; if you don’t need that, use this argument for convenience. Paths can be local paths or remote paths such as s3://bucket/path or ‘hdfs://path/’. Credentials will need to be set up to use remote filepaths." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CMmID2Bts5W3" - }, - "source": [ - "## weights_summary\n", - "Prints a summary of the weights when training begins. The default is `top`, which prints a summary of the top-level modules.\n", - "\n", - "Options: ‘full’, ‘top’, None."
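For completeness, the default behaviour can also be requested explicitly (a trivial sketch):

```
# 'top' is the default; shown here explicitly
trainer = pl.Trainer(weights_summary='top')

trainer.fit(model, train_loader, val_loader)
```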
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "KTl6EdwDs6j2" - }, - "source": [ - "\n", - "# print full summary of all modules and submodules\n", - "trainer = pl.Trainer(weights_summary='full')\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "R57cSLl9w9ma" - }, - "source": [ - "# don't print a summary\n", - "trainer = pl.Trainer(weights_summary=None)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bSc2hU5AotAP" - }, - "source": [ - "# progress bar" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GgvbyDsBxcH6" - }, - "source": [ - "## process_position\n", - "\n", - "Orders the progress bar. Useful when running multiple trainers on the same node.\n", - "\n", - "(This argument is ignored if a custom progress bar callback is passed to callbacks.)\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6ekz8Es8owDn" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(process_position=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "itivQFgEphBU" - }, - "source": [ - "## progress_bar_refresh_rate\n", - "\n", - "How often to refresh the progress bar (in steps). In notebooks, fast refresh rates (low values) are known to crash the display because of the screen refresh rate, so raise the value to 50 or more." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "GKe6eVxmplL5" - }, - "source": [ - "# default used by the Trainer\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=1)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "8rDHJOJbxNtf" - }, - "source": [ - "# disable progress bar\n", - "trainer = pl.Trainer(progress_bar_refresh_rate=0)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NCNvYLwjpWne" - }, - "source": [ - "# profiler" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "pRknrG_zpY6M" - }, - "source": [ - "# to profile standard training events\n", - "trainer = pl.Trainer(profiler=True)\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ji6aWpU73kMM" - }, - "source": [ - "You can also use Lightning's AdvancedProfiler if you want more detailed information about the time spent in each function call recorded during a given action. The output is quite verbose, so only use this if you want very detailed reports.\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "layG55pt316C" - }, - "source": [ - "from pytorch_lightning.profiler import AdvancedProfiler\n", - "\n", - "trainer = pl.Trainer(profiler=AdvancedProfiler())\n", - "\n", - "trainer.fit(model, train_loader, val_loader)" - ], - "execution_count": null, - "outputs": [] - } - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "goRmGIRI5cfC" + }, + "source": [ + "# Introduction to Lightning Flags ⚡🚩\n", + "\n", + "In this notebook, we'll go over the flags available in the `Trainer` object. 
Note that not everything will work in the Colab environment (multi-GPU, etc.). This notebook accompanies the Trainer videos we'll be putting out.\n", + "\n", + "---\n", + "  - Give us a ⭐ [on Github](https://www.github.com/PytorchLightning/pytorch-lightning/)\n", + "  - Check out [the documentation](https://pytorch-lightning.readthedocs.io/en/latest/)\n", + "  - Join us [on Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jKj5lgdr5j48" + }, + "source": [ + "--- \n", + "### Setup  \n", + "First things first, we need to install Lightning. Simply ```pip install pytorch-lightning```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UGjilEHk4vb7" + }, + "outputs": [], + "source": [ + "! pip install pytorch-lightning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zaVUShmQ5n8Y" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from argparse import ArgumentParser\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "from torch.utils.data import DataLoader\n", + "from torch.utils.data import random_split\n", + "from torchvision.datasets import MNIST\n", + "from torchvision import transforms\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning.metrics.functional import accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6tgkS8IYZwY_" + }, + "outputs": [], + "source": [ + "# ------------\n", + "# data\n", + "# ------------\n", + "pl.seed_everything(1234)\n", + "batch_size = 32\n", + "\n", + "# Init DataLoader from MNIST Dataset\n", + "\n", + "dataset = MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())\n", + "mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())\n", + "mnist_train, mnist_val = random_split(dataset, [55000, 5000])\n", + "\n", + "train_loader = DataLoader(mnist_train, batch_size=batch_size)\n", + "val_loader = DataLoader(mnist_val, batch_size=batch_size)\n", + "test_loader = DataLoader(mnist_test, batch_size=batch_size)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gEulmrbxwaYL" + }, + "source": [ + "### Simple AutoEncoder Model\n", + "\n", + "We're going to define a simple Lightning model so we can play with all the settings of the Lightning Trainer.\n", + "\n", + "A LightningModule is simply pure PyTorch reorganized into hooks that represent the steps of the training process.\n", + "\n", + "You can use LightningModule hooks to control every part of your model, but for the purpose of this video we will use a very simple MNIST classifier, a model that takes 28*28 grayscale images of handwritten digits and predicts the digit between 0 and 9.\n", + "\n", + "The LightningModule can encompass a single model, like an image classifier, or a deep learning system composed of multiple models, like this autoencoder that contains an encoder and a decoder.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x-34xKCI40yW" + }, + "outputs": [], + "source": [ + "class LitAutoEncoder(pl.LightningModule):\n", + "\n", + "    def __init__(self, batch_size=32, lr=1e-3):\n", + "        super().__init__()\n", + "        self.encoder = nn.Sequential(\n", + "            nn.Linear(28 * 28, 64),\n", + "            nn.ReLU(),\n", + "            nn.Linear(64, 3)\n", + "        )\n", + "        self.decoder = nn.Sequential(\n", + "            nn.Linear(3, 64),\n", + "            nn.ReLU(),\n", + "            nn.Linear(64, 28 * 28)\n", + "        )\n", + "        self.batch_size = batch_size\n", + "        self.learning_rate = lr\n", + "\n", + "    def forward(self, x):\n", + "        # in lightning, forward defines the prediction/inference actions\n", + "        embedding = self.encoder(x)\n", + "        return embedding\n", + "\n", + "    def training_step(self, batch, batch_idx):\n", + "        x, y = batch\n", + "        x = x.view(x.size(0), -1)\n", + "        z = self.encoder(x)\n", + "        x_hat = self.decoder(z)\n", + "        loss = F.mse_loss(x_hat, x)\n", + "        self.log('train_loss', loss)\n", + "        return loss\n", + "\n", + "    def validation_step(self, batch, batch_idx):\n", + "        x, y = batch\n", + "        x = x.view(x.size(0), -1)\n", + "        z = self.encoder(x)\n", + "        x_hat = self.decoder(z)\n", + "        loss = F.mse_loss(x_hat, x)\n", + "        self.log('val_loss', loss)\n", + "\n", + "    def test_step(self, batch, batch_idx):\n", + "        x, y = batch\n", + "        x = x.view(x.size(0), -1)\n", + "        z = self.encoder(x)\n", + "        x_hat = self.decoder(z)\n", + "        loss = F.mse_loss(x_hat, x)\n", + "        self.log('test_loss', loss)\n", + "\n", + "    def configure_optimizers(self):\n", + "        # use the stored hyperparameter instead of a hard-coded 1e-3\n", + "        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)\n", + "        return optimizer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VbxcRCrxiYly" + }, + "source": [ + "You'll notice the LightningModule doesn't have epoch and batch loops, that we're not calling model.train() or model.eval(), and that there is no mention of CUDA or hardware. That's because it is all automated by the Lightning Trainer. All the engineering boilerplate is automated by the trainer: \n", + "\n", + "* Training loops\n", + "* Evaluation and test loops\n", + "* Calling model.train(), model.eval(), no_grad at the right time\n", + "* CUDA or to_device calls\n", + "\n", + "It also allows you to train your models on different hardware like GPUs and TPUs without changing your code!\n", + "\n", + "\n", + "### To use the Lightning Trainer, simply:\n", + "\n", + "1. init your LightningModule and datasets\n", + "\n", + "2. init the Lightning Trainer\n", + "\n", + "3. call trainer.fit\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HOk9c4_35FKg" + }, + "outputs": [], + "source": [ + "#####################\n", + "# 1. Init Model\n", + "#####################\n", + "\n", + "model = LitAutoEncoder()\n", + "\n", + "#####################\n", + "# 2. Init Trainer\n", + "#####################\n", + "\n", + "# these 2 flags are explained in the later sections...but for a short explanation:\n", + "# - progress_bar_refresh_rate: limits the refresh rate of the tqdm progress bar so Colab doesn't freak out\n", + "# - max_epochs: only run 2 epochs instead of the default of 1000\n", + "trainer = pl.Trainer(progress_bar_refresh_rate=20, max_epochs=2)\n", + "\n", + "#####################\n", + "# 3. Train\n", + "#####################\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3meDako-Qa_6" + }, + "source": [ + "Our model is training just like that, using the Lightning defaults. 
The beauty of Lightning is that everything is easily configurable.\n", + "In our next videos we're going to show you all the ways you can control your Trainer to do things like controlling your training, validation and test loops, running on GPUs and TPUs, checkpointing, early stopping, and a lot more.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z_Wry2MckQkI" + }, + "source": [ + "# Training loop and eval loop Flags" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0MkI1xB2vsLj" + }, + "source": [ + "\n", + "To really scale up your networks, you can use accelerators like GPUs. GPUs, or graphics processing units, parallelize matrix multiplications, which can enable very large speedups over training on CPUs.\n", + "\n", + "Let's say you have a machine with 8 GPUs on it. You can set this flag to 1, 4, or 8 GPUs and Lightning will automatically distribute your training for you.\n", + "\n", + "```\n", + "trainer = pl.Trainer(gpus=1)\n", + "```\n", + "\n", + "---------\n", + "\n", + "Lightning makes your code hardware agnostic: you can switch between CPUs and GPUs without code changes.\n", + "\n", + "However, it requires forming good PyTorch habits:\n", + "\n", + "1. First, remove the .cuda() or .to() calls in your code.\n", + "2. Second, when you initialize a new tensor, set device=self.device in the call, since every LightningModule knows which GPU index or TPU core it is on.\n", + "\n", + "You can also use type_as, or you can register the tensor as a buffer in your module’s __init__ method with register_buffer().\n", + "\n", + "```\n", + "# before lightning\n", + "def forward(self, x):\n", + "    z = torch.Tensor(2, 3)\n", + "    z = z.cuda(0)\n", + "\n", + "# with lightning (type_as takes only the reference tensor)\n", + "def forward(self, x):\n", + "    z = torch.Tensor(2, 3)\n", + "    z = z.type_as(x)\n", + "```\n", + "\n", + "\n", + "```\n", + "class LitModel(LightningModule):\n", + "\n", + "    def __init__(self):\n", + "        ...\n", + "        self.register_buffer(\"sigma\", torch.eye(3))\n", + "        # you can now access self.sigma anywhere in your module\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hw6jJhhjvlSL" + }, + "source": [ + "The Lightning Trainer automates all the engineering boilerplate: iterating over epochs and batches; training, eval and test loops; CUDA and to(device) calls; and calling model.train() and model.eval().\n", + "\n", + "You still have full control over the loops, by using the following trainer flags:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pT5-ETH9eUg6" + }, + "source": [ + "## Calling validation steps\n", + "Sometimes, training an epoch may be pretty fast, like minutes per epoch. In this case, you might not need to validate on every epoch. 
Instead, you can validate every few epochs.\n", + "\n", + "Use the `check_val_every_n_epoch` flag to control how often the validation loop runs:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z-EMVvKheu3D" + }, + "outputs": [], + "source": [ + "# run val loop every 10 training epochs\n", + "trainer = pl.Trainer(check_val_every_n_epoch=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UOzZr9S2UcSO" + }, + "source": [ + "## val_check_interval\n", + "\n", + "In some cases where your epoch is very long, you might want to check validation within an epoch.\n", + "\n", + "You can run the validation step within your training epochs by setting the `val_check_interval` flag.\n", + "\n", + "Set `val_check_interval` to a float in [0.0, 1.0] to check your validation set within a training epoch. For example, setting it to 0.25 will check your validation set 4 times during a training epoch.\n", + "\n", + "The default is 1.0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9kbUbvrUVLrT" + }, + "outputs": [], + "source": [ + "# check validation set 4 times during a training epoch\n", + "trainer = pl.Trainer(val_check_interval=0.25)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Onm1gBsKVaw4" + }, + "source": [ + "When you have iterable datasets, or when streaming data for production use cases, it is useful to check the validation set every N training batches. \n", + "Set val_check_interval to an int:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "psn6DVb5Vi85" + }, + "outputs": [], + "source": [ + "# check validation set every 1000 training batches\n", + "# use this when using iterableDataset and your dataset has no length\n", + "# (ie: production cases with streaming data)\n", + "trainer = pl.Trainer(val_check_interval=1000)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QkoYonrWkb7-" + }, + "source": [ + "## num_sanity_val_steps  \n", + "\n", + "You may have run into an issue where you have a bug in your validation loop but won't catch it until the training loop ends — and if a training epoch takes hours or days, you will waste valuable compute.\n", + "\n", + "Instead, Lightning automatically runs through 2 steps of validation at the beginning of training to catch these kinds of bugs up front.\n", + "\n", + "\n", + "The `num_sanity_val_steps` flag lets you run n batches of validation before starting the training routine.\n", + "\n", + "You can set it to 0 to turn it off:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zOcT-ugSkiKW" + }, + "outputs": [], + "source": [ + "# turn it off\n", + "trainer = pl.Trainer(num_sanity_val_steps=0)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zS0ob1ZmTw56" + }, + "source": [ + "Set it to -1 to check all validation data before training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rzqvjA4UT263" + }, + "outputs": [], + "source": [ + "# check all validation data\n", + "trainer = pl.Trainer(num_sanity_val_steps=-1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMB41wq4T3Z2" + }, + "source": [ + "Or use any arbitrary number of validation steps:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lGP78aQzT7VS" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(num_sanity_val_steps=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H-xaYRtd1rb-" + }, + "source": [ + "## Limit train, validation, and test batches\n", + "\n", + "You can set limits on how much of the training, validation and test datasets you want your model to check. This is useful if you have really large validation or test sets, or for debugging something that happens at the end of an epoch.\n", + "\n", + "Set the flag to an int to specify the number of batches to run:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XiK5cFKL1rcA" + }, + "outputs": [], + "source": [ + "# use only 10 batches of the test set\n", + "trainer = pl.Trainer(limit_test_batches=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y4LK0g65RrBm" + }, + "source": [ + "Keep in mind that some metrics, such as AUC ROC, need to be computed on the entire validation set. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8MmeRs2DR3dD" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(limit_val_batches=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xmigcNa1A2Vy" + }, + "source": [ + "You can also use a float to limit the batches to a percentage of the set on every epoch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W7uGJt8nA4tv" + }, + "outputs": [], + "source": [ + "# run through only 25% of the test set each epoch\n", + "trainer = pl.Trainer(limit_test_batches=0.25)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YRI8THtUN7_e" + }, + "source": [ + "# Training on GPUs\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R8FFkX_FwlfE" + }, + "source": [ + "To run on 1 GPU, set the flag to 1." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nnzkf3KaOE27" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cxBg47s5PB1P" + }, + "source": [ + "To run on 2 or 4 GPUs, set the flag to 2 or 4."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cSEM4ihLrohT" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=2)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZE6ZgwtNudro" + }, + "source": [ + "You can also select which GPU devices to run on, using a list of indices like [1, 4], \n", + "\n", + "or a string containing a comma-separated list of GPU ids like '1,2'.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gQkJtq0urrjq" + }, + "outputs": [], + "source": [ + "# list: train on GPUs 1, 4 (by bus ordering)\n", + "# trainer = Trainer(gpus='1, 4') # equivalent\n", + "trainer = pl.Trainer(gpus=[1, 4])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XghDPad4us74" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=list(range(4)))\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6FVkKHpSPMTW" + }, + "source": [ + "You can use all the GPUs you have available by setting `gpus=-1`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "r6cKQijYrtPe" + }, + "outputs": [], + "source": [ + "# trainer = Trainer(gpus='-1') - equivalent\n", + "trainer = pl.Trainer(gpus=-1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2C-fNLm3UGCV" + }, + "source": [ + "Lightning uses the PCI bus_id as the index for ordering GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_V75s7EhOFhE" + }, + "source": [ + "### `auto_select_gpus`\n", + "\n", + "You can run GPUs in “exclusive mode”, meaning only one process at a time can access each of them. If you're not sure which GPUs you should use when running in exclusive mode, Lightning can automatically find unoccupied GPUs for you. \n", + "\n", + "Simply specify the number of GPUs as an integer `gpus=k`, and set the trainer flag `auto_select_gpus=True`. Lightning will automatically find k GPUs that are not occupied by other processes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_Sd3XFsAOIwd" + }, + "outputs": [], + "source": [ + "# enable auto selection (will find two available gpus on system)\n", + "trainer = pl.Trainer(gpus=2, auto_select_gpus=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a5JGSBMQhJNp" + }, + "source": [ + "## analyzing GPU usage\n", + "\n", + "### log_gpu_memory\n", + "\n", + "This is useful to analyze the memory usage of your GPUs.\n", + "\n", + "To get the GPU memory usage for every GPU on the master node, set the flag to log_gpu_memory='all'.\n", + "\n", + "Under the hood, Lightning uses the nvidia-smi command, which may slow your training down.\n", + "\n", + "Your logs can become overwhelmed if you log the usage from many GPUs at once. In this case, you can also set the flag to 'min_max', which will log only the min and max usage across all the GPUs of the master node.\n", + "\n", + "Note that Lightning does not log the usage across all nodes, for performance reasons."
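If you just want a quick manual check instead, PyTorch itself exposes per-device memory counters (a small sketch using standard torch.cuda calls; device 0 is an arbitrary choice):

```
import torch

if torch.cuda.is_available():
    # bytes currently allocated by tensors on GPU 0
    print(torch.cuda.memory_allocated(0))
    # peak allocation since the start of the process
    print(torch.cuda.max_memory_allocated(0))
```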
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "idus3ZGahOki" + }, + "outputs": [], + "source": [ + "# log all the GPUs (on master node only)\n", + "trainer = pl.Trainer(log_gpu_memory='all')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-mevgiy_hkip" + }, + "source": [ + "To avoid the performance decrease you can also set `log_gpu_memory='min_max'` to only log the min and max memory on the master node.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SlvLJnWyhs7J" + }, + "outputs": [], + "source": [ + "# log only the min and max memory on the master node\n", + "trainer = pl.Trainer(log_gpu_memory='min_max')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K82FLLIJVQG3" + }, + "source": [ + "\n", + "But what if you want to train on multiple machines and not just one?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YViQ6PXesAue" + }, + "source": [ + "# Training on multiple GPUs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WacbBQUivxQq" + }, + "source": [ + "Lightning makes your models hardware agnostic, and you can run on GPUs with the flip of a flag. Lightning also supports training on multiple GPUs across many machines.\n", + "\n", + "You can do this by setting the num_nodes flag.\n", + "\n", + "The world size, or the total number of GPUs you are using, will be gpus*num_nodes.\n", + "\n", + "If I set gpus=8 and num_nodes=32, then I will be training on 256 GPUs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5iKckmDvr8zZ" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=8, num_nodes=32)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GgcSbDjjlSTh" + }, + "source": [ + "## distributed backends\n", + "\n", + "Under the hood, Lightning uses distributed data parallel (or DDP) by default to distribute training across GPUs.\n", + "\n", + "Lightning's DDP implementation calls your script multiple times with the correct environment variables. In effect, the following happens:\n", + "\n", + "1. Each GPU across each node gets its own process.\n", + "2. Each GPU gets visibility into a subset of the overall dataset. It will only ever see that subset.\n", + "3. Each process inits the model. (Make sure to set the random seed so that each model initializes with the same weights.)\n", + "4. Each process performs a full forward and backward pass in parallel.\n", + "5. The gradients are synced and averaged across all processes.\n", + "6. 
Each process updates its optimizer.\n", + "\n", + "If you request multiple GPUs or nodes without setting a mode, DDP will be used automatically.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "n_Brr7F5wdtj" + }, + "outputs": [], + "source": [ + "# ddp = DistributedDataParallel\n", + "# trainer = pl.Trainer(gpus=2, num_nodes=2) equivalent\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "edxHyttC5J3e" + }, + "source": [ + "DDP is the fastest and recommended way to distribute your training, but you can pass other backends to the `distributed_backend` trainer flag when DDP is not supported.\n", + "\n", + "DDP isn't available in:\n", + "* Jupyter Notebook, Google Colab, Kaggle, etc.\n", + "* if you have a nested script without a root package\n", + "* or if your script needs to invoke .fit or .test multiple times" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZDh96mavxHxf" + }, + "source": [ + "### DDP_SPAWN\n", + "\n", + "In these cases, you can use `ddp_spawn` instead. `ddp_spawn` is exactly like DDP except that it uses `.spawn()` to start the training processes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JM5TKtgLxo37" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp_spawn')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sebhVE3qrhKK" + }, + "source": [ + "We STRONGLY discourage this use because it has limitations (due to Python and PyTorch):\n", + "\n", + "* Since .spawn() trains the model in subprocesses, the model on the main process does not get updated.\n", + "\n", + "* Dataloader(num_workers=N), where N is large, bottlenecks training with DDP, i.e. it will be VERY slow or won't work at all. This is a PyTorch limitation.\n", + "\n", + "* It forces everything to be picklable.\n", + "\n", + "DDP is MUCH faster than ddp_spawn. To be able to use DDP we recommend you: \n", + "\n", + "1. Install a top-level module for your project using setup.py\n", + "\n", + "```\n", + "# setup.py\n", + "#!/usr/bin/env python\n", + "\n", + "from setuptools import setup, find_packages\n", + "\n", + "setup(name='src',\n", + "      version='0.0.1',\n", + "      description='Describe Your Cool Project',\n", + "      author='',\n", + "      author_email='',\n", + "      url='https://github.com/YourSeed',  # REPLACE WITH YOUR OWN GITHUB PROJECT LINK\n", + "      install_requires=[\n", + "          'pytorch-lightning'\n", + "      ],\n", + "      packages=find_packages()\n", + "      )\n", + "\n", + "```\n", + "\n", + "2. Set up your project like so:\n", + "\n", + "```\n", + "/project\n", + "    /src\n", + "        some_file.py\n", + "        /or_a_folder\n", + "    setup.py\n", + "```\n", + "3. Install as a root-level package\n", + "```\n", + "cd /project\n", + "pip install -e .\n", + "```\n", + "4. You can then call your scripts anywhere\n", + "```\n", + "cd /project/src\n", + "\n", + "python some_file.py --distributed_backend 'ddp' --gpus 8\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cmB3I_oyw7a8" + }, + "source": [ + "### DP\n", + "\n", + "If you're using Windows, DDP is not supported. You can use `dp` for DataParallel instead: DataParallel uses multithreading instead of multiprocessing. It splits a batch across k GPUs. 
That is, if you have a batch of 32 and use DP with 2 GPUs, each GPU will process 16 samples, after which the root node will aggregate the results.\n", + "\n", + "DP use is discouraged by PyTorch and Lightning. Use DDP, which is more stable and at least 3x faster.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OO-J0ISvlVCg" + }, + "outputs": [], + "source": [ + "# dp = DataParallel\n", + "trainer = pl.Trainer(gpus=2, distributed_backend='dp')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y7E2eHZKwUn9" + }, + "source": [ + "### DDP2\n", + "\n", + "In certain cases, it’s advantageous to use ***all*** batches on the same machine, instead of a subset. For instance, in self-supervised learning, a common performance boost comes from increasing the number of negative samples.\n", + "\n", + "In this case, we can use DDP2, which behaves like DP within a machine and like DDP across nodes. DDP2 does the following:\n", + "\n", + "* Copies a subset of the data to each node.\n", + "* Inits a model on each node.\n", + "* Runs a forward and backward pass using DP.\n", + "* Syncs gradients across nodes.\n", + "* Applies the optimizer updates.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Y4xweqL3xHER" + }, + "outputs": [], + "source": [ + "# ddp2 = DistributedDataParallel + dp\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, distributed_backend='ddp2')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lhKNCnveeeq5" + }, + "source": [ + "As noted above, ddp_spawn works like ddp but uses multiprocessing spawn to start one subprocess per GPU. Be careful about mixing it with num_workers > 0 in your dataloaders, because it will bottleneck your training; this is a known PyTorch limitation, and another reason we recommend the ddp implementation instead.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HUf9ANyQkFFO" + }, + "source": [ + "\n", + "### mocking ddp\n", + "\n", + "Testing or debugging DDP can be hard, so we have a distributed backend that simulates DDP on CPUs to make it easier. Set `num_processes` to a number greater than 1 when using distributed_backend=\"ddp_cpu\" to mimic distributed training on a machine without GPUs. Note that while this is useful for debugging, it will not provide any speedup, since single-process PyTorch already makes efficient use of multiple CPUs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZSal5Da9kHOf" + }, + "outputs": [], + "source": [ + "# Simulate DDP for debugging on your GPU-less laptop\n", + "trainer = pl.Trainer(distributed_backend=\"ddp_cpu\", num_processes=2)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Br_btCy5lgES" + }, + "source": [ + "# Training on TPUS\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DXkBNITdv44d" + }, + "source": [ + "Another option for accelerating your training is using TPUs.\n", + "A TPU is a Tensor Processing Unit, designed specifically for deep learning. Each TPU has 8 cores, and each core is optimized for 128x128 matrix multiplies. 
Google estimates that 8 TPU cores are about as fast as 4 V100 GPUs!\n", + "\n", + "A TPU pod hosts many TPUs on it. Currently, TPU pod v2 has 2048 cores! You can request a full pod from Google Cloud, or a “slice”, which gives you some subset of those 2048 cores.\n", + "\n", + "At this moment, TPUs are available on Google Cloud (GCP), Google Colab and Kaggle environments.\n", + "\n", + "Lightning supports training on TPUs without any code adjustments to your model. Just like when using GPUs, Lightning automatically inserts the correct samplers - no need to do this yourself!\n", + "\n", + "Under the hood, Lightning uses the XLA framework, developed jointly by the Facebook and Google XLA teams, and we want to recognize their efforts in advancing TPU adoption of PyTorch.\n", + "\n", + "## tpu_cores\n", + "To train on TPUs, set the tpu_cores flag.\n", + "\n", + "When using Colab or Kaggle, the allowed values are 1 or 8 cores. When using Google Cloud, any value above 8 is allowed.\n", + "\n", + "Your effective batch size is the batch size passed into a dataloader times the total number of TPU cores." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "itP9y70gmD9M" + }, + "outputs": [], + "source": [ + "# int: train on a single core\n", + "trainer = pl.Trainer(tpu_cores=1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NJKnzPb3mKEg" + }, + "outputs": [], + "source": [ + "# int: train on all 8 cores\n", + "trainer = pl.Trainer(tpu_cores=8)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8a4exfWUmOHq" + }, + "source": [ + "You can also choose which TPU core to train on, by passing a list containing a single index in [1-8]. This is not an officially supported use case, but we are working with the XLA team to improve this user experience.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "S6OrjE_bmT-_" + }, + "outputs": [], + "source": [ + "# list: train on a single selected core\n", + "trainer = pl.Trainer(tpu_cores=[2])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Afqx3sFUmfWD" + }, + "source": [ + "To train on more than 8 cores (i.e. a pod), submit this script using the xla_dist script.\n", + "\n", + "\n", + "\n", + "```\n", + "python -m torch_xla.distributed.xla_dist\n", + "--tpu=$TPU_POD_NAME\n", + "--conda-env=torch-xla-nightly\n", + "--env=XLA_USE_BF16=1\n", + "-- python your_trainer_file.py\n", + "```\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ncPvbUVQqKOh" + }, + "source": [ + "# Advanced distributed training\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4MP7bEgnv7qK" + }, + "source": [ + "\n", + "Lightning supports distributed training across multiple GPUs and TPUs out of the box by setting trainer flags, but it also allows you to control the way sampling is done if you need to." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wdHiTfAMepKH" + }, + "source": [ + "## replace_sampler_ddp\n", + "In PyTorch, you must use torch.utils.data.distributed.DistributedSampler for multi-node or multi-GPU training. 
The sampler makes sure each GPU sees the appropriate part of your data.\n", + "\n", + "```\n", + "# without lightning\n", + "def train_dataloader(self):\n", + " dataset = MNIST(...)\n", + " sampler = None\n", + "\n", + " if self.on_tpu:\n", + " sampler = DistributedSampler(dataset)\n", + "\n", + " return DataLoader(dataset, sampler=sampler)\n", + "```\n", + "Lightning adds the correct samplers when needed, so there is no need to explicitly add samplers yourself. By default it will add `shuffle=True` for the train sampler and `shuffle=False` for the val/test samplers.\n", + "\n", + "If you want to customize this behaviour, you can set `replace_sampler_ddp=False` and add your own distributed sampler.\n", + "\n", + "(Note: for iterable datasets, we don’t do this automatically.)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZfmcB_e_7HbE" + }, + "outputs": [], + "source": [ + "sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)\n", + "dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)\n", + "\n", + "trainer = pl.Trainer(gpus=2, num_nodes=2, replace_sampler_ddp=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-IOhk1n0lL3_" + }, + "source": [ + "## prepare_data_per_node\n", + "\n", + "When doing multi-node training, if your nodes share the same file system, then you don't want to download data more than once to avoid possible collisions. \n", + "\n", + "Lightning automatically calls the prepare_data hook on the root GPU of the master node (i.e., only a single GPU).\n", + "\n", + "In some cases where your nodes don't share the same file system, you need to download the data on each node. In this case you can set this flag to True and Lightning will download the data on the root GPU of each node.\n", + "\n", + "This flag defaults to True." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WFBMUR48lM04" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=2, num_nodes=2, prepare_data_per_node=False)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FKBwXqo4q-Vp" + }, + "source": [ + "## sync_batchnorm\n", + "\n", + "Batch norm is computed per GPU/TPU. This flag enables synchronization between batchnorm layers across all GPUs.\n", + "It is recommended if you have small batch sizes.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GhaCLTEZrAQi" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=4, sync_batchnorm=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XuFA7VTFMY9-" + }, + "source": [ + "# Debugging flags\n", + "\n", + "Lightning offers a couple of flags to make debugging your models easier:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AKoS3fdml4Jx" + }, + "source": [ + "## Fast Dev Run\n", + "\n", + "To help you save time debugging, your first run should use the fast_dev_run flag.\n", + "\n", + "This won't generate logs or save checkpoints, but it will touch every line of your code to make sure that it is working as intended.\n", + "\n", + "Think about this flag like a compiler.
You make changes to your code, and run the Trainer with this flag to verify that your changes are bug free.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L5vuG7GSmhzK" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(fast_dev_run=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HRP1qQR5nT4p" + }, + "source": [ + "## overfit_batches\n", + "\n", + "This flag uses only the given amount of training data. If nonzero, it will use the same training set for validation and testing. If the training dataloaders have shuffle=True, Lightning will automatically disable it.\n", + "\n", + "Useful for quickly debugging or trying to overfit on purpose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NTM-dqGMnXms" + }, + "outputs": [], + "source": [ + "# use only 1% of the train set (and use the train set for val and test)\n", + "trainer = pl.Trainer(overfit_batches=0.01)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c0LV0gC3nl1X" + }, + "outputs": [], + "source": [ + "# overfit on 10 of the same batches\n", + "trainer = pl.Trainer(overfit_batches=10)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lt3UHU6WgtS_" + }, + "source": [ + "You can also limit the amount of evaluation data: the limit_test_batches flag takes a float to run through only a percentage of the test set each epoch (limit_train_batches and limit_val_batches work the same way)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K3yUqADhgnkf" + }, + "outputs": [], + "source": [ + "# run through only 25% of the test set each epoch\n", + "trainer = pl.Trainer(limit_test_batches=0.25)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ODN66NeVg_2o" + }, + "source": [ + "In the case of multiple test dataloaders, the limit applies to each dataloader individually.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8aQx5SLeMz1R" + }, + "source": [ + "# accumulate_grad_batches\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "g8GczZXFwKC7" + }, + "source": [ + "The batch size controls the accuracy of the estimate of the gradients. Small batch sizes use less memory, but decrease the accuracy of the gradient estimate. When training large models, such as NLP transformers, it is useful to accumulate gradients before calling backward(). Accumulation allows for bigger effective batch sizes than what can actually fit on a GPU/TPU in a single step.\n", + "\n", + "Use accumulate_grad_batches to accumulate gradients every k batches, or according to a per-epoch schedule passed as a dict. The Trainer also calls optimizer.step() for the last, incomplete accumulation window.\n", + "\n", + "For example, set accumulate_grad_batches to 4 to accumulate every 4 batches. In this case the effective batch size is batch_size*4, so if your batch size is 32, effectively it will be 128." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2jB6-Z_yPhhf" + }, + "outputs": [], + "source": [ + "# accumulate every 4 batches (effective batch size is batch*4)\n", + "trainer = pl.Trainer(accumulate_grad_batches=4)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_Yi-bdTOgINC" + }, + "source": [ + "You can also pass a dictionary to specify different accumulation per epoch.
We can set it to `{5: 3, 10: 20}` to have no accumulation for epochs 1 to 4, accumulate 3 batches for epochs 5 to 10, and 20 batches after that." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X3xsoZ3YPgBv" + }, + "outputs": [], + "source": [ + "# no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that\n", + "trainer = pl.Trainer(accumulate_grad_batches={5: 3, 10: 20})\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "myzH8mV4M1_9" + }, + "source": [ + "# 16 bit precision\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v9EaFAonwOk6" + }, + "source": [ + "Most deep learning frameworks, like PyTorch, train with 32-bit floating point arithmetic. \n", + "\n", + "But many models can still achieve full accuracy using half the precision.\n", + "\n", + "In 2017, NVIDIA researchers successfully used a combination of 32 and 16 bit precision (also known as mixed precision) and achieved the same accuracy as 32 bit precision training.\n", + "\n", + "The two main advantages are:\n", + "\n", + "- a reduction in memory requirements, which enables larger batch sizes and models.\n", + "- a speed up in compute. On Ampere, Turing and Volta architectures, 16 bit precision models can train at least 3 times faster.\n", + "\n", + "As of PyTorch 1.6, NVIDIA and Facebook moved mixed precision functionality into PyTorch core as the AMP package, torch.cuda.amp. \n", + "\n", + "This package supersedes the Apex package developed by NVIDIA." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TjNypZPHnxvJ" + }, + "source": [ + "## precision\n", + "\n", + "Use the precision flag to switch from full precision (32) to half precision (16). It can be used on CPUs, GPUs or TPUs.\n", + "\n", + "When using PyTorch 1.6+, Lightning uses the native AMP implementation to support 16-bit.\n", + "\n", + "On TPUs it will use torch.bfloat16, although tensor printing will still show torch.float32" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kBZKMVx1nw-D" + }, + "outputs": [], + "source": [ + "# 16-bit precision\n", + "trainer = pl.Trainer(gpus=1, precision=16)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VJGj3Jh7oQXU" + }, + "source": [ + "Earlier versions of Lightning used NVIDIA Apex for 16-bit precision. Apex was the first library to attempt 16-bit training, and its automatic mixed precision (amp) functionality has since been merged into core PyTorch as of 1.6.\n", + "\n", + "If you insist on using Apex, you can set the amp_backend flag to 'apex' and install Apex on your own." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BDV1trAUPc9h" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HK5c_aVfNV4e" + }, + "source": [ + "## amp_level\n", + "Apex includes 4 optimization levels:\n", + "* O0 (FP32 training)\n", + "* O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16.\n", + "* O2 (Fast Mixed Precision): this is the standard mixed precision training. It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights.\n", + "* O3 (FP16 training): full FP16.
Passing keep_batchnorm_fp32=True can speed things up, as cudnn batchnorm is faster anyway.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FshMFPowNbWt" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(gpus=1, precision=16, amp_backend='apex', amp_level='O2')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y8KEr1YvNgkC" + }, + "source": [ + "# `auto_scale_batch_size`\n", + "\n", + " \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7F1pKFIuwSFl" + }, + "source": [ + "Lightning can help you improve your model with the auto_scale_batch_size flag, which tries to find the largest batch size that fits into memory before you start your training.\n", + "A larger batch size often yields better estimates of the gradients, but may also result in longer training time. \n", + "\n", + "Set it to True to initially run a batch size finder trying to find the largest batch size that fits into memory. The result will be stored in self.batch_size in the LightningModule.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9_jE-iyyheIv" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(auto_scale_batch_size=True)\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yaHsJvwFhNJt" + }, + "source": [ + "You can set the value to `power`. `power` scaling starts from a batch size of 1 and keeps doubling the batch size until an out-of-memory (OOM) error is encountered.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qx0FbQrphgw1" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(auto_scale_batch_size='power')\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8bwgVF9zhZ75" + }, + "source": [ + "You can also set it to `binsearch`, which continues to fine-tune the batch size by performing a binary search.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QObXNs3yNrg9" + }, + "outputs": [], + "source": [ + "# run batch size scaling, result overrides hparams.batch_size\n", + "trainer = pl.Trainer(auto_scale_batch_size='binsearch')\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5OWdhSsZjqW7" + }, + "source": [ + "This feature expects a batch_size field in the hparams of your model, i.e., model.hparams.batch_size must exist; it will be overridden by the result of this algorithm. \n", + "\n", + "Additionally, your train_dataloader() method should depend on this field for this feature to work.\n", + "\n", + "In short, the algorithm works by:\n", + "1. Dumping the current state of the model and trainer\n", + "\n", + "2. Iterating, until convergence or until the maximum number of tries max_trials (default 25) has been reached:\n", + "* Call the fit() method of the trainer. This evaluates steps_per_trial (default 3) training steps. Each training step can trigger an OOM error if the tensors (training batch, weights, gradients, etc.)
allocated during the step have too large a memory footprint.\n", + " * If an OOM error is encountered, decrease the batch size\n", + " * Else increase it.\n", + "* How much the batch size is increased/decreased is determined by the chosen strategy.\n", + "\n", + "3. The found batch size is saved to model.hparams.batch_size\n", + "\n", + "4. Restoring the initial state of the model and trainer\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q4CvxfZmOWBd" + }, + "source": [ + "# `auto_lr_find`\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j85e8usNwdBV" + }, + "source": [ + "Selecting a good learning rate for your training is essential for both better performance and faster convergence.\n", + "\n", + "Even optimizers such as Adam, which adapt the learning rate during training, can benefit from a better initial choice.\n", + "\n", + "To reduce the amount of guesswork in choosing a good initial learning rate, you can use Lightning's auto learning rate finder.\n", + "\n", + "The learning rate finder does a small run where the learning rate is increased after each processed batch and the corresponding loss is logged. The result of this is a lr vs. loss plot that can be used as guidance for choosing an optimal initial lr.\n", + "\n", + "\n", + "Warning: for the moment, this feature only works with models that have a single optimizer. LR finder support for DDP is not implemented yet; it is coming soon.\n", + "\n", + "\n", + "***auto_lr_find=***\n", + "\n", + "In the most basic use case, this feature can be enabled during trainer construction with Trainer(auto_lr_find=True).\n", + "When .fit(model) is called, the LR finder will automatically run before any training is done. The lr that is found and used will be written to the console and logged together with all other hyperparameters of the model."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iuhve9RBOfFh" + }, + "outputs": [], + "source": [ + "# default used by the Trainer (no learning rate finder)\n", + "trainer = pl.Trainer(mnist_model, auto_lr_find=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BL-gjXNCPDXk" + }, + "source": [ + "This flag sets your learning rate which can be accessed via self.lr or self.learning_rate.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wEb-vIMmPJQf" + }, + "outputs": [], + "source": [ + "class LitModel(LightningModule):\n", + "\n", + " def __init__(self, learning_rate):\n", + " self.learning_rate = learning_rate\n", + "\n", + " def configure_optimizers(self):\n", + " return Adam(self.parameters(), lr=(self.lr or self.learning_rate))\n", + "\n", + "# finds learning rate automatically\n", + "# sets hparams.lr or hparams.learning_rate to that learning rate\n", + "trainer = pl.Trainer(mnist_model, auto_lr_find=True)\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RweqvpnVPPSh" + }, + "source": [ + "To use an arbitrary value set it as auto_lr_find\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4LKI39IfPLJv" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(mnist_model, auto_lr_find='my_value')\n", + "\n", + "trainer.tune(model, train_dataloader=train_loader, val_dataloaders=val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9VAhPRKbPX-m" + }, + "source": [ + "Under the hood, when you call tune it runs the learning rate finder.\n", + "\n", + "If you want to inspect the results of the learning rate finder before doing any actual training or just play around with the parameters of the algorithm, this can be done by invoking the lr_find method of the trainer. A typical example of this would look like\n", + "\n", + "\n", + "```\n", + "trainer = pl.Trainer(auto_lr_find=True)\n", + "\n", + "# Run learning rate finder\n", + "lr_finder = trainer.lr_find(model)\n", + "\n", + "# Results can be found in\n", + "lr_finder.results\n", + "\n", + "# Plot with\n", + "fig = lr_finder.plot(suggest=True)\n", + "fig.show()\n", + "\n", + "# Pick point based on plot, or get suggestion\n", + "new_lr = lr_finder.suggestion()\n", + "\n", + "# update hparams of the model\n", + "model.hparams.lr = new_lr\n", + "\n", + "# Fit model\n", + "trainer.fit(model)\n", + "```\n", + "\n", + "The figure produced by lr_finder.plot() should look something like the figure below. It is recommended to not pick the learning rate that achieves the lowest loss, but instead something in the middle of the sharpest downward slope (red point). 
This is the point returned by lr_finder.suggestion().\n", + "\n", + "*(Figure: the plot produced by lr_finder.plot(suggest=True): loss vs. learning rate, with the suggested point marked in red. Base64 image data omitted.)*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tn1RV-jfOjt1" + }, + "source": [ + "# `benchmark`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rsmTl5zfwjM3" + }, + "source": [ + "You can try to speed up your system by setting `benchmark=True`, which enables cudnn.benchmark. This flag is likely to increase the speed of your system if your input sizes don’t change. This flag makes the cudnn auto-tuner look for the optimal set of algorithms for the given hardware configuration. This usually leads to faster runtime.\n", + "But if your input sizes change at each iteration, then cudnn will benchmark every time a new size appears, possibly leading to worse runtime performance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dWr-OCBgQCeb" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1, benchmark=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qwAvSKYGa24K" + }, + "source": [ + "# `deterministic`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tl5mfmafwmat" + }, + "source": [ + "PyTorch does not guarantee reproducible results, even when using identical seeds. To guarantee reproducible results, you can remove most of the randomness from your process by setting the `deterministic` flag to True.\n", + "\n", + "Note that it might make your system slower." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mhv5LZ3HbNCK" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gpus=1, deterministic=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u_5eJSvTf60f" + }, + "source": [ + "# Exploding and vanishing gradients" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B6drjh4pq6Jv" + }, + "source": [ + "## track_grad_norm\n", + "\n", + "You can debug your grad norm to identify exploding or vanishing gradients using the `track_grad_norm` flag.\n", + "\n", + "Set it to 2 to track the 2-norm, or to p to track the p-norm."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2taHUir8rflR" + }, + "outputs": [], + "source": [ + "# track the 2-norm\n", + "trainer = pl.Trainer(track_grad_norm=2)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3vHKxmruk62f" + }, + "source": [ + "May be set to ‘inf’ infinity-norm." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "g7TbD6SxlAjP" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(track_grad_norm='inf')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TcMlRe7ywpe6" + }, + "source": [ + "## Gradient clipping\n", + "\n", + "\n", + "Exploding gradients refer to the problem that the gradients get too large and overflow in training, making the model unstable. Gradient clipping will ‘clip’ the gradients or cap them to a Threshold value to prevent the gradients from getting too large. To avoid this, we can set `gradient_clip_val` (default is set to 0.0).\n", + "\n", + "[when to use it, what are relevant values]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jF9JwmbOgOWF" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(gradient_clip_val=0.1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ggb4MkkQrr1h" + }, + "source": [ + "# truncated_bptt_steps\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s1Iu6PyAw9_r" + }, + "source": [ + "If you have a large recurrent model, you can use truncated_bptt_steps flag to split up the backprop over portions of the sequence. 
This flag will automatically truncate your batches and the trainer will apply Truncated Backprop to them.\n", + "\n", + "Make sure your batches have a sequence dimension.\n", + "\n", + "Lightning takes care of splitting your batch along the time-dimension:\n", + "```\n", + "# we use the second dimension as the time dimension\n", + "# (batch, time, ...)\n", + "sub_batch = batch[0, 0:t, ...]\n", + "```\n", + "\n", + "Using this feature requires updating your LightningModule’s pytorch_lightning.core.LightningModule.training_step() to include a hiddens argument, which carries the hidden state from the previous truncated backprop step:\n", + "\n", + "```\n", + "# Truncated back-propagation through time\n", + "def training_step(self, batch, batch_idx, hiddens):\n", + " # hiddens are the hiddens from the previous truncated backprop step\n", + " out, hiddens = self.lstm(data, hiddens)\n", + "\n", + " return {\n", + " \"loss\": ...,\n", + " \"hiddens\": hiddens # remember to detach() this\n", + " }\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WiTF1VMtruMU" + }, + "outputs": [], + "source": [ + "# backprop every 5 steps in a batch\n", + "trainer = pl.Trainer(truncated_bptt_steps=5)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8XI_kEWkS-nT" + }, + "source": [ + "To modify how the batch is split, override pytorch_lightning.core.LightningModule.tbptt_split_batch():\n", + "\n", + "```\n", + "class LitMNIST(LightningModule):\n", + " def tbptt_split_batch(self, batch, split_size):\n", + " # do your own splitting on the batch\n", + " return splits\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oLbEmbmupwQ8" + }, + "source": [ + "# reload_dataloaders_every_epoch\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CLdNGVv9xD_L" + }, + "source": [ + "Set to True to reload dataloaders every epoch (instead of loading just once at the beginning of training).\n", + "\n", + "```\n", + "# if False (default)\n", + "train_loader = model.train_dataloader()\n", + "for epoch in epochs:\n", + " for batch in train_loader:\n", + " ...\n", + "\n", + "# if True\n", + "for epoch in epochs:\n", + " train_loader = model.train_dataloader()\n", + " for batch in train_loader:\n", + " ...\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "10AXthXxp311" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(reload_dataloaders_every_epoch=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f513EYl0bmmL" + }, + "source": [ + "# Callbacks\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2pt7iGh4xNs5" + }, + "source": [ + "\n", + "Lightning Callbacks are self-contained programs that can be reused across projects.\n", + "Callbacks should capture NON-ESSENTIAL logic that is NOT required for your LightningModule to run. Lightning includes a few built-in callbacks, like early stopping and model checkpointing, that can be enabled with Trainer flags, but you can also create your own callbacks to add any functionality to your models.\n", + "\n", + "The callback API includes hooks that allow you to add logic at every point of your training:\n", + "setup, teardown, on_epoch_start, on_epoch_end, on_batch_start, on_batch_end, on_init_start, on_keyboard_interrupt, etc.
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1t84gvDNsUuh" + }, + "source": [ + "## callbacks\n", + "\n", + "Use **callbacks=** to pass a list of user defined callbacks. These callbacks DO NOT replace the built-in callbacks (loggers or EarlyStopping). \n", + "\n", + "In this example, we create a dummy callback that prints a message when training starts and ends, using on_train_start and on_train_end hooks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oIXZYabub3f0" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import Callback\n", + "\n", + "class PrintCallback(Callback):\n", + " def on_train_start(self, trainer, pl_module):\n", + " print(\"Training is started!\")\n", + " def on_train_end(self, trainer, pl_module):\n", + " print(\"Training is done.\")\n", + "\n", + "# a list of callbacks\n", + "callbacks = [PrintCallback()]\n", + "trainer = pl.Trainer(callbacks=callbacks)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cNF74CLYfJJu" + }, + "source": [ + "# Model checkpointing\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2blgquBrxLtS" + }, + "source": [ + "Checkpoints capture the exact value of all parameters used by a model.\n", + "\n", + "Checkpointing your training allows you to resume a training process in case it was interrupted, fine-tune a model or use a pre-trained model for inference without having to retrain the model.\n", + "\n", + "Lightning automates saving and loading checkpoints so you restore a training session, saving all the required parameters including: \n", + "* 16-bit scaling factor (apex)\n", + "* Current epoch\n", + "* Global step\n", + "* Model state_dict\n", + "* State of all optimizers\n", + "* State of all learningRate schedulers\n", + "* State of all callbacks\n", + "* The hyperparameters used for that model if passed in as hparams (Argparse.Namespace)\n", + "\n", + "By default Lightning will save a checkpoint in the working directory, which will be updated every epoch.\n", + "\n", + "### Automatic saving\n", + "By default Lightning will save a checkpoint in the end of the first epoch in the working directory, which will be updated every epoch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XGu0JULrg9l7" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(default_root_path=os.getcwd())\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3s9OjkGuhq1W" + }, + "source": [ + "To change the checkpoint path pass in **default_root_dir=**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DgdxkrIQhvfw" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(default_root_dir='/your/path/to/save/checkpoints')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Qyvj_bkWrJiE" + }, + "source": [ + "\n", + "You can also have Lightning update your checkpoint based on a specific metric that you are logging (using self.log), by passing the key to `monitor=`. 
+  { + "cell_type": "markdown", + "metadata": { + "id": "Qyvj_bkWrJiE" + }, + "source": [ + "\n", + "You can also have Lightning update your checkpoint based on a specific metric that you are logging (using self.log), by passing the key to `monitor=`. For example, if we want to save checkpoints based on the validation loss, logged as `val_loss`, you can pass:\n", + "\n", + "\n", + "```\n", + "checkpoint_callback = ModelCheckpoint(\n", + "    filepath=os.getcwd(),\n", + "    save_top_k=1,\n", + "    verbose=True,\n", + "    monitor='val_loss',\n", + "    mode='min',\n", + "    prefix=''\n", + ")\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YzYMivw1rO1O" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import ModelCheckpoint\n", + "\n", + "trainer = pl.Trainer(callbacks=[ModelCheckpoint(monitor='val_loss')])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5hYs_FV8iDMn" + }, + "source": [ + "You can modify the behavior of checkpointing by creating your own callback, and passing it to the trainer. \n", + "You can control\n", + "* filepath - where checkpoints are saved\n", + "* save_top_k - save the top k models\n", + "* verbose\n", + "* monitor - the metric to monitor\n", + "* mode\n", + "* prefix\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Tb1K2VYDiNTu" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import ModelCheckpoint\n", + "\n", + "# DEFAULTS used by the Trainer\n", + "checkpoint_callback = ModelCheckpoint(\n", + "    filepath=os.getcwd(),\n", + "    save_top_k=3,\n", + "    verbose=True,\n", + "    monitor='val_loss',\n", + "    mode='min',\n", + "    prefix='',\n", + ")\n", + "\n", + "trainer = pl.Trainer(callbacks=[checkpoint_callback])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YKhZ6xRojJcl" + }, + "source": [ + "You can disable checkpointing by passing\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Yt8zd2ZFjOXX" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(checkpoint_callback=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HcLy8asCjrj9" + }, + "source": [ + "### Manual saving\n", + "\n", + "You can manually save checkpoints and restore your model from the checkpointed state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kZSkMJf0jR4x" + }, + "outputs": [], + "source": [ + "trainer.fit(model)\n", + "trainer.save_checkpoint(\"example.ckpt\")\n", + "new_model = LitAutoEncoder.load_from_checkpoint(checkpoint_path=\"example.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X2d9cjVPj7CP" + }, + "source": [ + "### Checkpoint Loading\n", + "To load a model along with its weights, biases and module_arguments use the following method:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BpAFfg5zkFmH" + }, + "outputs": [], + "source": [ + "model = LitAutoEncoder.load_from_checkpoint(PATH)\n", + "\n", + "print(model.learning_rate)\n", + "# prints the learning_rate you used in this checkpoint\n", + "\n", + "model.eval()\n", + "y_hat = model(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jTQ3mxSJkhFN" + }, + "source": [ + "But if you don’t want to use the values saved in the checkpoint, pass in your own here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IoMcOh9-kfUP" + }, + "outputs": [], + "source": [ + "class LitAutoEncoder(LightningModule):\n", + "\n", + "    def __init__(self, in_dim, out_dim):\n", + "        
super().__init__()\n", + " self.save_hyperparameters()\n", + " self.l1 = nn.Linear(self.hparams.in_dim, self.hparams.out_dim)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ITPVY8mNknut" + }, + "source": [ + "you can restore the model like this\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "H7XeRJzVkuY8" + }, + "outputs": [], + "source": [ + "# if you train and save the model like this it will use these values when loading\n", + "# the weights. But you can overwrite this\n", + "LitAutoEncoder(in_dim=32, out_dim=10)\n", + "\n", + "# uses in_dim=32, out_dim=10\n", + "model = LitAutoEncoder.load_from_checkpoint(PATH)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "14WwGpnVk0a4" + }, + "outputs": [], + "source": [ + "# uses in_dim=128, out_dim=10\n", + "model = LitAutoEncoder.load_from_checkpoint(PATH, in_dim=128, out_dim=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bY5s6wP_k1CU" + }, + "source": [ + "\n", + "\n", + "## Restoring Training State (resume_from_checkpoint)\n", + "If your training was cut short for some reason, you can resume exactly from where you left off using the `resume_from_checkpoint` flag, which will automatically restore model, epoch, step, LR schedulers, apex, etc..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9zfhHtyrk3rO" + }, + "outputs": [], + "source": [ + "model = LitAutoEncoder()\n", + "trainer = pl.Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')\n", + "\n", + "# automatically restores model, epoch, step, LR schedulers, apex, etc...\n", + "trainer.fit(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xkKdvALFsmT2" + }, + "source": [ + "## weights_save_path\n", + "You can specify a directory for saving weights file using `weights_save_path`.\n", + "\n", + "(If you are using a custom checkpoint callback, the checkpoint callback will override this flag)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9OwHHFcCsrgT" + }, + "outputs": [], + "source": [ + "# save to your custom path\n", + "trainer = pl.Trainer(weights_save_path='my/path')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PbNtlJ9Wsscf" + }, + "outputs": [], + "source": [ + "# if checkpoint callback used, then overrides the weights path\n", + "# **NOTE: this saves weights to some/path NOT my/path\n", + "checkpoint = ModelCheckpoint(filepath='some/path')\n", + "trainer = pl.Trainer(\n", + " callbacks=[checkpoint],\n", + " weights_save_path='my/path'\n", + ")\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uDdxCuyHdWQt" + }, + "source": [ + "# Early stopping\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fqAy3ihRxTfR" + }, + "source": [ + "The EarlyStopping callback can be used to monitor a validation metric and stop the training when no improvement is observed, to help you avoid overfitting.\n", + "\n", + "To enable Early Stopping you can init the EarlyStopping callback, and pass it to `callbacks=` trainer flag. 
The callback will look for a logged metric to early stop on.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lFx976CheH93" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", + "\n", + "trainer = pl.Trainer(callbacks=[EarlyStopping('val_loss')])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MwpJfTvjeOwF" + }, + "source": [ + "You can customize the callback using the following parameters:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "V6I9h6HteK2U" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n", + "\n", + "early_stop_callback = EarlyStopping(\n", + "   monitor='val_accuracy',\n", + "   min_delta=0.00,\n", + "   patience=3,\n", + "   verbose=False,\n", + "   mode='max'\n", + ")\n", + "trainer = pl.Trainer(callbacks=[early_stop_callback])\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7TAIerPYe_Q1" + }, + "source": [ + "The EarlyStopping callback runs at the end of every validation epoch, which, under the default configuration, happens after every training epoch. However, the frequency of validation can be modified by setting various parameters on the Trainer, for example check_val_every_n_epoch and val_check_interval. Note that the patience parameter counts the number of validation epochs with no improvement, and not the number of training epochs. Therefore, with parameters check_val_every_n_epoch=10 and patience=3, the trainer will perform at least 40 training epochs before being stopped, as shown in the sketch below." + ] + },
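+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of that combination (validation runs every 10 training epochs, and training stops only after 3 validation epochs in a row show no improvement):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pytorch_lightning.callbacks.early_stopping import EarlyStopping\n",
+    "\n",
+    "# validate every 10 epochs; patience counts validation epochs, not training epochs\n",
+    "early_stop = EarlyStopping(monitor='val_loss', patience=3)\n",
+    "trainer = pl.Trainer(callbacks=[early_stop], check_val_every_n_epoch=10)\n",
+    "\n",
+    "trainer.fit(model, train_loader, val_loader)"
+   ]
+  },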
+  { + "cell_type": "markdown", + "metadata": { + "id": "VoKrX2ENh9Fg" + }, + "source": [ + "# Logging" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-CQTPKd7iKLm" + }, + "source": [ + "Lightning has built-in integration with various loggers such as TensorBoard, wandb, Comet, etc.\n", + "\n", + "\n", + "You can pass any metrics you want to log during training to `self.log`, such as loss or accuracy. Similarly, pass any metric you want to log during the validation step to `self.log`.\n", + "\n", + "These values will be passed to the logger of your choice.\n", + "\n", + "\n", + "\n", + "Use the `logger=` trainer flag to pass in a Logger, or iterable collection of Loggers, for experiment tracking.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ty5VPS3AiS8L" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.loggers import TensorBoardLogger\n", + "\n", + "# default logger used by trainer\n", + "logger = TensorBoardLogger(\n", + "    save_dir=os.getcwd(),\n", + "    version=1,\n", + "    name='lightning_logs'\n", + ")\n", + "trainer = pl.Trainer(logger=logger)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jc5oWNpoiuuc" + }, + "source": [ + "Lightning supports the use of multiple loggers, just pass a list to the Trainer.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BlYwMRRyivp_" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.loggers import TensorBoardLogger, TestTubeLogger\n", + "logger1 = TensorBoardLogger('tb_logs', name='my_model')\n", + "logger2 = TestTubeLogger('tb_logs', name='my_model')\n", + "trainer = pl.Trainer(logger=[logger1, logger2])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a7EyspQPh7iQ" + }, + "source": [ + "## flush_logs_every_n_steps\n", + "\n", + "Use this flag to determine when logging to disk should happen." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Em_XvsmyiBbk" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(flush_logs_every_n_steps=100)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_vDeKE98qsl1" + }, + "source": [ + "## log_every_n_steps\n", + "How often to add logging rows (does not write to disk)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HkqD7D_0w1Tt" + }, + "outputs": [], + "source": [ + "trainer = pl.Trainer(log_every_n_steps=1000)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9uw0gfe422CT" + }, + "source": [ + "# info logging" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dQXpt0aatDGo" + }, + "source": [ + "## default_root_dir\n", + "\n", + "Default path for logs and weights when no logger or pytorch_lightning.callbacks.ModelCheckpoint callback passed. On certain clusters you might want to separate where logs and checkpoints are stored. If you don’t, then use this argument for convenience. Paths can be local paths or remote paths such as s3://bucket/path or ‘hdfs://path/’. Credentials will need to be set up to use remote filepaths." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CMmID2Bts5W3" + }, + "source": [ + "## weights_summary\n", + "Prints a summary of the weights when training begins. The default is `top`, which prints a summary of the top-level modules.\n", + "\n", + "Options: ‘full’, ‘top’, None."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KTl6EdwDs6j2" + }, + "outputs": [], + "source": [ + "\n", + "# print full summary of all modules and submodules\n", + "trainer = pl.Trainer(weights_summary='full')\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R57cSLl9w9ma" + }, + "outputs": [], + "source": [ + "# don't print a summary\n", + "trainer = Trainer(weights_summary=None)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bSc2hU5AotAP" + }, + "source": [ + "# progress bar" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GgvbyDsBxcH6" + }, + "source": [ + "## process_position\n", + "\n", + "Orders the progress bar. Useful when running multiple trainers on the same node.\n", + "\n", + "(This argument is ignored if a custom callback is passed to callbacks)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6ekz8Es8owDn" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(process_position=0)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "itivQFgEphBU" + }, + "source": [ + "## progress_bar_refresh_rate\n", + "\n", + "How often to refresh the progress bar (in steps). In notebooks, faster refresh rates (lower number) is known to crash them because of their screen refresh rates, so raise it to 50 or more." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GKe6eVxmplL5" + }, + "outputs": [], + "source": [ + "# default used by the Trainer\n", + "trainer = pl.Trainer(progress_bar_refresh_rate=1)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8rDHJOJbxNtf" + }, + "outputs": [], + "source": [ + "# disable progress bar\n", + "trainer = Trainer(progress_bar_refresh_rate=0)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NCNvYLwjpWne" + }, + "source": [ + "# profiler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pRknrG_zpY6M" + }, + "outputs": [], + "source": [ + "# to profile standard training events\n", + "trainer = pl.Trainer(profiler=True)\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ji6aWpU73kMM" + }, + "source": [ + "You can also use Lightning AdvancedProfiler if you want more detailed information about time spent in each function call recorded during a given action. The output is quite verbose and you should only use this if you want very detailed reports.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "layG55pt316C" + }, + "outputs": [], + "source": [ + "from pytorch_lightning.profiler import AdvancedProfiler\n", + "\n", + "trainer = Trainer(profiler=AdvancedProfiler())\n", + "\n", + "trainer.fit(model, train_loader, val_loader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

Congratulations - Time to Join the Community!

\n", + "
\n", + "\n", + "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the Lightning movement, you can do so in the following ways!\n", + "\n", + "### Star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) on GitHub\n", + "The easiest way to help our community is just by starring the GitHub repos! This helps raise awareness of the cool tools we're building.\n", + "\n", + "* Please, star [Lightning](https://github.com/PyTorchLightning/pytorch-lightning)\n", + "\n", + "### Join our [Slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-f6bl2l0l-JYMK3tbAgAmGRrlNr00f1A)!\n", + "The best way to keep up to date on the latest advancements is to join our community! Make sure to introduce yourself and share your interests in `#general` channel\n", + "\n", + "### Interested by SOTA AI models ! Check out [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "Bolts has a collection of state-of-the-art models, all implemented in [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) and can be easily integrated within your own projects.\n", + "\n", + "* Please, star [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts)\n", + "\n", + "### Contributions !\n", + "The best way to contribute to our community is to become a code contributor! At any time you can go to [Lightning](https://github.com/PyTorchLightning/pytorch-lightning) or [Bolt](https://github.com/PyTorchLightning/pytorch-lightning-bolts) GitHub Issues page and filter for \"good first issue\". \n", + "\n", + "* [Lightning good first issue](https://github.com/PyTorchLightning/pytorch-lightning/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* [Bolt good first issue](https://github.com/PyTorchLightning/pytorch-lightning-bolts/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22)\n", + "* You can also contribute your own notebooks with useful examples !\n", + "\n", + "### Great thanks from the entire Pytorch Lightning Team for your interest !\n", + "\n", + "" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "05-trainer-flags-overview.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } From bb356a73cb37a8061e5bfdc5154fd135dc67d515 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 7 Nov 2020 14:18:45 -0500 Subject: [PATCH 65/88] added trainer api docs (#4569) --- docs/source/trainer.rst | 1670 ++++++++++++++++++++++++- pytorch_lightning/trainer/__init__.py | 1538 ----------------------- 2 files changed, 1651 insertions(+), 1557 deletions(-) diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 792d50f15d856..d7cbae7a56a3b 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -1,26 +1,1658 @@ .. role:: hidden :class: hidden-section +.. testsetup:: * + + import os + from pytorch_lightning.trainer.trainer import Trainer + from pytorch_lightning.core.lightning import LightningModule + from pytorch_lightning.utilities.seed import seed_everything + .. _trainer: Trainer ======= -.. 
automodule:: pytorch_lightning.trainer -   :members: fit, test -   :noindex: -   :exclude-members: -       setup_training, -       _abc_impl, -       set_random_port, -       _Trainer__set_root_gpu, -       _Trainer__init_optimizers, -       _Trainer__parse_gpu_ids, -       _Trainer__configure_schedulers, -       data_parallel, -       num_gpus, -       slurm_job_id, -       tng_tqdm_dic, -       training_tqdm_dict, -       progress_bar_dict, -       init_optimizers, -       configure_schedulers + +Once you've organized your PyTorch code into a LightningModule, +the Trainer automates everything else. + +.. raw:: html + + + +| + +This abstraction achieves the following: + +1. You maintain control over all aspects via PyTorch code without an added abstraction. + +2. The trainer uses best practices embedded by contributors and users +   from top AI labs such as Facebook AI Research, NYU, MIT, Stanford, etc... + +3. The trainer allows overriding any key part that you don't want automated. + +| + +----------- + +Basic use +--------- + +This is the basic use of the trainer: + +.. code-block:: python + +    model = MyLightningModule() + +    trainer = Trainer() +    trainer.fit(model, train_dataloader, val_dataloader) + + +-------- + +Trainer in Python scripts +------------------------- +In Python scripts, it's recommended you use a main function to call the Trainer. + +.. code-block:: python + +    from argparse import ArgumentParser + +    def main(hparams): +        model = LightningModule() +        trainer = Trainer(gpus=hparams.gpus) +        trainer.fit(model) + +    if __name__ == '__main__': +        parser = ArgumentParser() +        parser.add_argument('--gpus', default=None) +        args = parser.parse_args() + +        main(args) + +So you can run it like so: + +.. code-block:: bash + +    python main.py --gpus 2 + +.. note:: + +    Pro-tip: You don't need to define all flags manually. Lightning can add them automatically + +.. code-block:: python + +    from argparse import ArgumentParser + +    def main(args): +        model = LightningModule() +        trainer = Trainer.from_argparse_args(args) +        trainer.fit(model) + +    if __name__ == '__main__': +        parser = ArgumentParser() +        parser = Trainer.add_argparse_args(parser) +        args = parser.parse_args() + +        main(args) + +So you can run it like so: + +.. code-block:: bash + +    python main.py --gpus 2 --max_steps 10 --limit_train_batches 10 --any_trainer_arg x + +.. note:: +    If you want to stop a training run early, you can press "Ctrl + C" on your keyboard. +    The trainer will catch the `KeyboardInterrupt` and attempt a graceful shutdown, including +    running callbacks such as `on_train_end`. The trainer object will also set an attribute +    `interrupted` to `True` in such cases. If you have a callback which shuts down compute +    resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs. + +------------ + +Testing +------- +Once you're done training, feel free to run the test set! +(Only right before publishing your paper or pushing to production) + +.. code-block:: python + +    trainer.test(test_dataloaders=test_dataloader) + +------------ + +Deployment / prediction +----------------------- +You just trained a LightningModule which is also just a torch.nn.Module. +Use it to do whatever! + +.. code-block:: python + +    # load model +    pretrained_model = LightningModule.load_from_checkpoint(PATH) +    pretrained_model.freeze() + +    # use it for finetuning +    def forward(self, x): +        features = pretrained_model(x) +        classes = classifier(features) + +    # or for prediction +    out = pretrained_model(x) +    api_write({'response': out}) + + +You may wish to run the model on a variety of devices. Instead of moving the data +manually to the correct device, decorate the forward method (or any other method you use for inference) +with :func:`~pytorch_lightning.core.decorators.auto_move_data` and Lightning will take care of the rest.
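+
+For example, a minimal sketch (``LitModel`` and its ``self.net`` submodule are placeholders):
+
+.. code-block:: python
+
+    from pytorch_lightning.core.decorators import auto_move_data
+
+    class LitModel(LightningModule):
+
+        @auto_move_data
+        def forward(self, x):
+            return self.net(x)
+
+    # the input is moved to the model's device before forward runs
+    out = pretrained_model(x)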
+ +------------ + +Reproducibility +--------------- + +To ensure full reproducibility from run to run you need to set seeds for pseudo-random generators, +and set ``deterministic`` flag in ``Trainer``. + +Example:: + +    from pytorch_lightning import Trainer, seed_everything + +    seed_everything(42) +    # sets seeds for numpy, torch, python.random and PYTHONHASHSEED. +    model = Model() +    trainer = Trainer(deterministic=True) + + +------- + +Trainer flags +------------- + +accelerator +^^^^^^^^^^^ + +.. raw:: html + + + +| + +The accelerator backend to use (previously known as distributed_backend). + +- (```dp```) is DataParallel (split batch among GPUs of same machine) +- (```ddp```) is DistributedDataParallel (each gpu on each node trains, and syncs grads) +- (```ddp_cpu```) is DistributedDataParallel on CPU (same as `ddp`, but does not use GPUs. +  Useful for multi-node CPU training or single-node debugging. Note that this will **not** give +  a speedup on a single node, since Torch already makes efficient use of multiple CPUs on a single +  machine.) +- (```ddp2```) dp on node, ddp across nodes. Useful for things like increasing +  the number of negative samples + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(accelerator=None) + +Example:: + +    # dp = DataParallel +    trainer = Trainer(gpus=2, accelerator='dp') + +    # ddp = DistributedDataParallel +    trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp') + +    # ddp2 = DistributedDataParallel + dp +    trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp2') + +.. note:: this option does not apply to TPU. TPUs use ```ddp``` by default (over each core) + +You can also modify hardware behavior by subclassing an existing accelerator to adjust for your needs. + +Example:: + +    class MyOwnDDP(DDPAccelerator): +        ... + +    Trainer(accelerator=MyOwnDDP()) + +.. warning:: Passing in custom accelerators is experimental but work is in progress to enable full compatibility. + +accumulate_grad_batches +^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Accumulates grads every k batches or as set up in the dict. +Trainer also calls ``optimizer.step()`` for the last indivisible step number. + +.. testcode:: + +    # default used by the Trainer (no accumulation) +    trainer = Trainer(accumulate_grad_batches=1) + +Example:: + +    # accumulate every 4 batches (effective batch size is batch*4) +    trainer = Trainer(accumulate_grad_batches=4) + +    # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that +    trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20}) + +amp_backend +^^^^^^^^^^^ + +.. raw:: html + + + +| + +Use PyTorch AMP ('native') (available PyTorch 1.6+), or NVIDIA apex ('apex'). + +.. testcode:: + +    # using PyTorch built-in AMP, default used by the Trainer +    trainer = Trainer(amp_backend='native') + +    # using NVIDIA Apex +    trainer = Trainer(amp_backend='apex') + +amp_level +^^^^^^^^^ + +.. raw:: html + + + +| + +The optimization level to use (O1, O2, etc...) +for 16-bit GPU precision (using NVIDIA apex under the hood). + +Check `NVIDIA apex docs `_ for the level + +Example:: + +    # default used by the Trainer +    trainer = Trainer(amp_level='O2') + +automatic_optimization +^^^^^^^^^^^^^^^^^^^^^^ +When set to False, Lightning does not automate the optimization process. This means you are responsible for your own
This means you are responsible for your own +optimizer behavior + +Example:: + + def training_step(self, batch, batch_idx): + opt = self.optimizers() + + loss = ... + self.manual_backward(loss, opt) + opt.step() + opt.zero_grad() + +This is not recommended when using a single optimizer, instead it's recommended when using 2+ optimizers +AND you are an expert user. Most useful for research like RL, sparse coding and GAN research. + +In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizers directly + +Example:: + + def training_step(self, batch, batch_idx, optimizer_idx): + (opt_a, opt_b) = self.optimizers() + + gen_loss = ... + self.manual_backward(gen_loss, opt_a) + opt_a.step() + opt_a.zero_grad() + + disc_loss = ... + self.manual_backward(disc_loss, opt_b) + opt_b.step() + opt_b.zero_grad() + +auto_scale_batch_size +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Automatically tries to find the largest batch size that fits into memory, +before any training. + +.. code-block:: + + # default used by the Trainer (no scaling of batch size) + trainer = Trainer(auto_scale_batch_size=None) + + # run batch size scaling, result overrides hparams.batch_size + trainer = Trainer(auto_scale_batch_size='binsearch') + + # call tune to find the batch size + trainer.tune(model) + +auto_select_gpus +^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +If enabled and `gpus` is an integer, pick available gpus automatically. +This is especially useful when GPUs are configured to be in "exclusive mode", +such that only one process at a time can access them. + +Example:: + + # no auto selection (picks first 2 gpus on system, may fail if other process is occupying) + trainer = Trainer(gpus=2, auto_select_gpus=False) + + # enable auto selection (will find two available gpus on system) + trainer = Trainer(gpus=2, auto_select_gpus=True) + + # specifies all GPUs regardless of its availability + Trainer(gpus=-1, auto_select_gpus=False) + + # specifies all available GPUs (if only one GPU is not occupied, uses one gpu) + Trainer(gpus=-1, auto_select_gpus=True) + +auto_lr_find +^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Runs a learning rate finder algorithm (see this `paper `_) +when calling trainer.tune(), to find optimal initial learning rate. + +.. code-block:: python + + # default used by the Trainer (no learning rate finder) + trainer = Trainer(auto_lr_find=False) + +Example:: + + # run learning rate finder, results override hparams.learning_rate + trainer = Trainer(auto_lr_find=True) + + # call tune to find the lr + trainer.tune(model) + +Example:: + + # run learning rate finder, results override hparams.my_lr_arg + trainer = Trainer(auto_lr_find='my_lr_arg') + + # call tune to find the lr + trainer.tune(model) + +.. note:: + See the :ref:`learning rate finder guide `. + +benchmark +^^^^^^^^^ + +.. raw:: html + + + +| + +If true enables cudnn.benchmark. +This flag is likely to increase the speed of your system if your +input sizes don't change. However, if it does, then it will likely +make your system slower. + +The speedup comes from allowing the cudnn auto-tuner to find the best +algorithm for the hardware `[see discussion here] +`_. + +Example:: + + # default used by the Trainer + trainer = Trainer(benchmark=False) + +deterministic +^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +If true enables cudnn.deterministic. +Might make your system slower, but ensures reproducibility. +Also sets ``$HOROVOD_FUSION_THRESHOLD=0``. + +For more info check `[pytorch docs] +`_. 
+ +Example:: + +    # default used by the Trainer +    trainer = Trainer(deterministic=False) + +callbacks +^^^^^^^^^ + +.. raw:: html + + + +| + +Add a list of :class:`~pytorch_lightning.callbacks.Callback`. + +.. code-block:: python + +    # a list of callbacks +    callbacks = [PrintCallback()] +    trainer = Trainer(callbacks=callbacks) + +Example:: + +    from pytorch_lightning.callbacks import Callback + +    class PrintCallback(Callback): +        def on_train_start(self, trainer, pl_module): +            print("Training is started!") +        def on_train_end(self, trainer, pl_module): +            print("Training is done.") + +check_val_every_n_epoch +^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Check val every n train epochs. + +Example:: + +    # default used by the Trainer +    trainer = Trainer(check_val_every_n_epoch=1) + +    # run val loop every 10 training epochs +    trainer = Trainer(check_val_every_n_epoch=10) + +checkpoint_callback +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +By default Lightning saves a checkpoint for you in your current working directory, with the state of your last training epoch. +Checkpoints capture the exact value of all parameters used by a model. +To disable automatic checkpointing, set this to `False`. + +.. code-block:: python + +    # default used by Trainer +    trainer = Trainer(checkpoint_callback=True) + +    # turn off automatic checkpointing +    trainer = Trainer(checkpoint_callback=False) + + +You can override the default behavior by initializing the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` +callback, and adding it to the :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks` list. +See :ref:`Saving and Loading Weights ` for how to customize checkpointing. + + +.. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since +    v1.1.0 and will be unsupported from v1.3.0. + + +default_root_dir +^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Default path for logs and weights when no logger or +:class:`pytorch_lightning.callbacks.ModelCheckpoint` callback passed. On +certain clusters you might want to separate where logs and checkpoints are +stored. If you don't, then use this argument for convenience. Paths can be local +paths or remote paths such as `s3://bucket/path` or 'hdfs://path/'. Credentials +will need to be set up to use remote filepaths. + +Example:: + +    # default used by the Trainer +    trainer = Trainer(default_root_dir=os.getcwd()) + +distributed_backend +^^^^^^^^^^^^^^^^^^^ +This has been renamed "accelerator". + +fast_dev_run +^^^^^^^^^^^^ + +.. raw:: html + + + +| + +.. raw:: html + + + +| + +Runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test). + +Under the hood the pseudocode looks like this: + +.. code-block:: python + +    # loading +    __init__() +    prepare_data + +    # test training step +    training_batch = next(train_dataloader) +    training_step(training_batch) + +    # test val step +    val_batch = next(val_dataloader) +    out = validation_step(val_batch) +    validation_epoch_end([out]) + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(fast_dev_run=False) + +    # runs 1 train, val, test batch and program ends +    trainer = Trainer(fast_dev_run=True) + +gpus +^^^^ + +.. raw:: html + + + +| + +- Number of GPUs to train on (int) +- or which GPUs to train on (list) +- can handle strings + +.. 
testcode:: + + # default used by the Trainer (ie: train on CPU) + trainer = Trainer(gpus=None) + + # equivalent + trainer = Trainer(gpus=0) + +Example:: + + # int: train on 2 gpus + trainer = Trainer(gpus=2) + + # list: train on GPUs 1, 4 (by bus ordering) + trainer = Trainer(gpus=[1, 4]) + trainer = Trainer(gpus='1, 4') # equivalent + + # -1: train on all gpus + trainer = Trainer(gpus=-1) + trainer = Trainer(gpus='-1') # equivalent + + # combine with num_nodes to train on multiple GPUs across nodes + # uses 8 gpus in total + trainer = Trainer(gpus=2, num_nodes=4) + + # train only on GPUs 1 and 4 across nodes + trainer = Trainer(gpus=[1, 4], num_nodes=4) + +See Also: + - :ref:`Multi-GPU training guide `. + +gradient_clip_val +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Gradient clipping value + +- 0 means don't clip. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(gradient_clip_val=0.0) + + +limit_test_batches +^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of test dataset to check. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(limit_test_batches=1.0) + + # run through only 25% of the test set each epoch + trainer = Trainer(limit_test_batches=0.25) + + # run for only 10 batches + trainer = Trainer(limit_test_batches=10) + +In the case of multiple test dataloaders, the limit applies to each dataloader individually. + +limit_val_batches +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of validation dataset to check. +Useful when debugging or testing something that happens at the end of an epoch. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(limit_val_batches=1.0) + + # run through only 25% of the validation set each epoch + trainer = Trainer(limit_val_batches=0.25) + + # run for only 10 batches + trainer = Trainer(limit_val_batches=10) + +In the case of multiple validation dataloaders, the limit applies to each dataloader individually. + +log_gpu_memory +^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Options: + +- None +- 'min_max' +- 'all' + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(log_gpu_memory=None) + + # log all the GPUs (on master node only) + trainer = Trainer(log_gpu_memory='all') + + # log only the min and max memory on the master node + trainer = Trainer(log_gpu_memory='min_max') + +.. note:: Might slow performance because it uses the output of nvidia-smi. + +flush_logs_every_n_steps +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Writes logs to disk this often. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(flush_logs_every_n_steps=100) + +See Also: + - :ref:`logging` + +logger +^^^^^^ + +.. raw:: html + + + +| + +:ref:`Logger ` (or iterable collection of loggers) for experiment tracking. + +.. testcode:: + + from pytorch_lightning.loggers import TensorBoardLogger + + # default logger used by trainer + logger = TensorBoardLogger( + save_dir=os.getcwd(), + version=1, + name='lightning_logs' + ) + Trainer(logger=logger) + +max_epochs +^^^^^^^^^^ + +.. raw:: html + + + +| + +Stop training once this number of epochs is reached + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(max_epochs=1000) + +min_epochs +^^^^^^^^^^ + +.. raw:: html + + + +| + +Force training for at least these many epochs + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(min_epochs=1) + +max_steps +^^^^^^^^^ + +.. 
raw:: html + + + +| + +Stop training after this number of steps. +Training will stop when either max_steps or max_epochs is reached (whichever comes first). + +.. testcode:: + +    # Default (disabled) +    trainer = Trainer(max_steps=None) + +    # Stop after 100 steps +    trainer = Trainer(max_steps=100) + +min_steps +^^^^^^^^^ + +.. raw:: html + + + +| + +Force training for at least this number of steps. +The Trainer will train the model for at least min_steps or min_epochs (whichever comes last). + +.. testcode:: + +    # Default (disabled) +    trainer = Trainer(min_steps=None) + +    # Run at least for 100 steps (disable min_epochs) +    trainer = Trainer(min_steps=100, min_epochs=0) + +num_nodes +^^^^^^^^^ + +.. raw:: html + + + +| + +Number of GPU nodes for distributed training. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(num_nodes=1) + +    # to train on 8 nodes +    trainer = Trainer(num_nodes=8) + +num_processes +^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Number of processes to train with. Automatically set to the number of GPUs +when using ``accelerator="ddp"``. Set to a number greater than 1 when +using ``accelerator="ddp_cpu"`` to mimic distributed training on a +machine without GPUs. This is useful for debugging, but **will not** provide +any speedup, since single-process Torch already makes efficient use of multiple +CPUs. + +.. testcode:: + +    # Simulate DDP for debugging on your GPU-less laptop +    trainer = Trainer(accelerator="ddp_cpu", num_processes=2) + +num_sanity_val_steps +^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Sanity check runs n batches of val before starting the training routine. +This catches any bugs in your validation without having to wait for the first validation check. +The Trainer uses 2 steps by default. Turn it off or modify it here. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(num_sanity_val_steps=2) + +    # turn it off +    trainer = Trainer(num_sanity_val_steps=0) + +    # check all validation data +    trainer = Trainer(num_sanity_val_steps=-1) + + +This option will reset the validation dataloader unless ``num_sanity_val_steps=0``. + + +plugins +^^^^^^^ + +.. raw:: html + + + +| + +Plugins allow you to connect arbitrary backends, precision libraries, SLURM, etc... For example: + +- DDP +- SLURM +- TorchElastic +- Apex + +To define your own behavior, subclass the relevant class and pass it in. Here's an example linking up your own cluster. + +.. code-block:: python + +    from pytorch_lightning.cluster_environments import ClusterEnvironment + +    class MyCluster(ClusterEnvironment): + +        def master_address(self): +            return your_master_address + +        def master_port(self): +            return your_master_port + +        def world_size(self): +            return the_world_size + +    trainer = Trainer(plugins=[MyCluster()]) + +prepare_data_per_node +^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +If True will call `prepare_data()` on LOCAL_RANK=0 for every node. +If False will only call from NODE_RANK=0, LOCAL_RANK=0 + +.. testcode:: + +    # default +    Trainer(prepare_data_per_node=True) + +    # use only NODE_RANK=0, LOCAL_RANK=0 +    Trainer(prepare_data_per_node=False) + +tpu_cores +^^^^^^^^^ + +.. raw:: html + + + +| + +- How many TPU cores to train on (1 or 8). +- Which TPU core to train on [1-8] + +A single TPU v2 or v3 has 8 cores. A TPU pod has +up to 2048 cores. A slice of a POD means you get as many cores +as you request. + +Your effective batch size is batch_size * total tpu cores. + +.. note:: No need to add a DistributedDataSampler, Lightning automatically does it for you.
+ +This parameter can be either 1 or 8. + +.. testcode:: + +    # your_trainer_file.py + +    # default used by the Trainer (ie: train on CPU) +    trainer = Trainer(tpu_cores=None) + +    # int: train on a single core +    trainer = Trainer(tpu_cores=1) + +    # list: train on a single selected core +    trainer = Trainer(tpu_cores=[2]) + +    # int: train on all 8 cores +    trainer = Trainer(tpu_cores=8) + +    # for 8+ cores must submit via xla script with +    # a max of 8 cores specified. The XLA script +    # will duplicate script onto each TPU in the POD +    trainer = Trainer(tpu_cores=8) + +To train on more than 8 cores (ie: a POD), +submit this script using the xla_dist script. + +Example:: + +    python -m torch_xla.distributed.xla_dist +    --tpu=$TPU_POD_NAME +    --conda-env=torch-xla-nightly +    --env=XLA_USE_BF16=1 +    -- python your_trainer_file.py + +overfit_batches +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Uses this much data of the training set. If nonzero, will use the same training set for validation and testing. +If the training dataloaders have `shuffle=True`, Lightning will automatically disable it. + +Useful for quickly debugging or trying to overfit on purpose. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(overfit_batches=0.0) + +    # use only 1% of the train set (and use the train set for val and test) +    trainer = Trainer(overfit_batches=0.01) + +    # overfit on 10 of the same batches +    trainer = Trainer(overfit_batches=10) + +precision +^^^^^^^^^ + +.. raw:: html + + + +| + +Full precision (32), half precision (16). +Can be used on CPU, GPU or TPUs. + +If used on TPU will use torch.bfloat16 but tensor printing +will still show torch.float32. + +.. testcode:: +    :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE + +    # default used by the Trainer +    trainer = Trainer(precision=32) + +    # 16-bit precision +    trainer = Trainer(precision=16) + +Example:: + +    # one day +    trainer = Trainer(precision=8|4|2) + +process_position +^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Orders the progress bar. Useful when running multiple trainers on the same node. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(process_position=0) + +Note: +    This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`. + +profiler +^^^^^^^^ + +.. raw:: html + + + +| + +To profile individual steps during training and assist in identifying bottlenecks. + +See the :ref:`profiler documentation ` for more details. + +.. testcode:: + +    from pytorch_lightning.profiler import SimpleProfiler, AdvancedProfiler + +    # default used by the Trainer +    trainer = Trainer(profiler=None) + +    # to profile standard training events, equivalent to `profiler=SimpleProfiler()` +    trainer = Trainer(profiler="simple") + +    # advanced profiler for function-level stats, equivalent to `profiler=AdvancedProfiler()` +    trainer = Trainer(profiler="advanced") + +progress_bar_refresh_rate +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How often to refresh the progress bar (in steps). +In notebooks, faster refresh rates (lower number) are known to crash them +because of their screen refresh rates, so raise it to 50 or more. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(progress_bar_refresh_rate=1) + +    # disable progress bar +    trainer = Trainer(progress_bar_refresh_rate=0) + +Note: +    This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`. + +reload_dataloaders_every_epoch +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
raw:: html + + + +| + +Set to True to reload dataloaders every epoch. + +.. code-block:: python + +    # if False (default) +    train_loader = model.train_dataloader() +    for epoch in epochs: +        for batch in train_loader: +            ... + +    # if True +    for epoch in epochs: +        train_loader = model.train_dataloader() +        for batch in train_loader: +            ... + +replace_sampler_ddp +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Enables auto adding of distributed sampler. By default it will add ``shuffle=True`` +for train sampler and ``shuffle=False`` for val/test sampler. If you want to customize +it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler. +If ``replace_sampler_ddp=True`` and a distributed sampler was already added, +Lightning will not replace the existing one. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(replace_sampler_ddp=True) + +By setting to False, you have to add your own distributed sampler: + +.. code-block:: python + +    # default used by the Trainer +    sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True) +    dataloader = DataLoader(dataset, batch_size=32, sampler=sampler) + +resume_from_checkpoint +^^^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +To resume training from a specific checkpoint pass in the path here. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(resume_from_checkpoint=None) + +    # resume from a specific checkpoint +    trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt') + +log_every_n_steps +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + + +How often to add logging rows (does not write to disk) + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(log_every_n_steps=50) + +See Also: +    - :ref:`logging` + + +sync_batchnorm +^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Enable synchronization between batchnorm layers across all GPUs. + +.. testcode:: + +    trainer = Trainer(sync_batchnorm=True) + +track_grad_norm +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +- no tracking (-1) +- Otherwise tracks that norm (2 for 2-norm) + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(track_grad_norm=-1) + +    # track the 2-norm +    trainer = Trainer(track_grad_norm=2) + +limit_train_batches +^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How much of training dataset to check. +Useful when debugging or testing something that happens at the end of an epoch. + +.. testcode:: + +    # default used by the Trainer +    trainer = Trainer(limit_train_batches=1.0) + +Example:: + +    # default used by the Trainer +    trainer = Trainer(limit_train_batches=1.0) + +    # run through only 25% of the training set each epoch +    trainer = Trainer(limit_train_batches=0.25) + +    # run through only 10 batches of the training set each epoch +    trainer = Trainer(limit_train_batches=10) + +truncated_bptt_steps +^^^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Truncated backprop performs backprop every k steps of +a much longer sequence. + +If this is enabled, your batches will automatically get truncated +and the trainer will apply truncated backprop to them. + +(`Williams et al. "An efficient gradient-based algorithm for on-line training of +recurrent network trajectories." +`_) + +.. testcode:: + +    # default used by the Trainer (ie: disabled) +    trainer = Trainer(truncated_bptt_steps=None) + +    # backprop every 5 steps in a batch +    trainer = Trainer(truncated_bptt_steps=5) + +.. note:: Make sure your batches have a sequence dimension.
+ +Lightning takes care to split your batch along the time-dimension. + +.. code-block:: python + + # we use the second as the time dimension + # (batch, time, ...) + sub_batch = batch[0, 0:t, ...] + +Using this feature requires updating your LightningModule's +:meth:`pytorch_lightning.core.LightningModule.training_step` to include a `hiddens` arg +with the hidden + +.. code-block:: python + + # Truncated back-propagation through time + def training_step(self, batch, batch_idx, hiddens): + # hiddens are the hiddens from the previous truncated backprop step + out, hiddens = self.lstm(data, hiddens) + + return { + "loss": ..., + "hiddens": hiddens # remember to detach() this + } + +To modify how the batch is split, +override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`: + +.. testcode:: + + class LitMNIST(LightningModule): + def tbptt_split_batch(self, batch, split_size): + # do your own splitting on the batch + return splits + +val_check_interval +^^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +How often within one training epoch to check the validation set. +Can specify as float or int. + +- use (float) to check within a training epoch +- use (int) to check every n steps (batches) + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(val_check_interval=1.0) + + # check validation set 4 times during a training epoch + trainer = Trainer(val_check_interval=0.25) + + # check validation set every 1000 training batches + # use this when using iterableDataset and your dataset has no length + # (ie: production cases with streaming data) + trainer = Trainer(val_check_interval=1000) + + +weights_save_path +^^^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Directory of where to save weights if specified. + +.. testcode:: + + # default used by the Trainer + trainer = Trainer(weights_save_path=os.getcwd()) + + # save to your custom path + trainer = Trainer(weights_save_path='my/path') + +Example:: + + # if checkpoint callback used, then overrides the weights path + # **NOTE: this saves weights to some/path NOT my/path + checkpoint = ModelCheckpoint(dirpath='some/path') + trainer = Trainer( + callbacks=[checkpoint], + weights_save_path='my/path' + ) + +weights_summary +^^^^^^^^^^^^^^^ + +.. raw:: html + + + +| + +Prints a summary of the weights when training begins. +Options: 'full', 'top', None. + +.. testcode:: + + # default used by the Trainer (ie: print summary of top level modules) + trainer = Trainer(weights_summary='top') + + # print full summary of all modules and submodules + trainer = Trainer(weights_summary='full') + + # don't print a summary + trainer = Trainer(weights_summary=None) + +----- + +Trainer class API +----------------- + +Methods +^^^^^^^ + +init +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.__init__ + :noindex: + +fit +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.fit + :noindex: + +test +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.test + :noindex: + +tune +**** + +.. automethod:: pytorch_lightning.trainer.Trainer.tune + :noindex: + +Properties +^^^^^^^^^^ + +callback_metrics +**************** + +The metrics available to callbacks. These are automatically set when you log via `self.log` + +.. code-block:: python + + def training_step(self, batch, batch_idx): + self.log('a_val', 2) + + + callback_metrics = trainer.callback_metrics + assert callback_metrics['a_val'] == 2 + +current_epoch +************* + +The current epoch + +.. 
code-block:: python + + def training_step(self, batch, batch_idx): + current_epoch = self.trainer.current_epoch + if current_epoch > 100: + # do something + pass + + +logger (p) +********** + +The current logger being used. Here's an example using tensorboard + +.. code-block:: python + + def training_step(self, batch, batch_idx): + logger = self.trainer.logger + tensorboard = logger.experiment + + +logged_metrics +************** + +The metrics sent to the logger (visualizer). + +.. code-block:: python + + def training_step(self, batch, batch_idx): + self.log('a_val', 2, log=True) + + + logged_metrics = trainer.logged_metrics + assert logged_metrics['a_val'] == 2 + + +is_global_zero +************** + +Whether this process is the global zero in multi-node training + +.. code-block:: python + + def training_step(self, batch, batch_idx): + if self.trainer.is_global_zero: + print('in node 0, accelerator 0') + +progress_bar_metrics +******************** + +The metrics sent to the progress bar. + +.. code-block:: python + + def training_step(self, batch, batch_idx): + self.log('a_val', 2, prog_bar=True) + + + progress_bar_metrics = trainer.progress_bar_metrics + assert progress_bar_metrics['a_val'] == 2 + diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index c0a3744f162fd..98abd994b531d 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -12,1544 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -.. testsetup:: * - - import os - from pytorch_lightning.trainer.trainer import Trainer - from pytorch_lightning.core.lightning import LightningModule - from pytorch_lightning.utilities.seed import seed_everything - - -Once you've organized your PyTorch code into a LightningModule, -the Trainer automates everything else. - -.. raw:: html - - - -| - -This abstraction achieves the following: - -1. You maintain control over all aspects via PyTorch code without an added abstraction. - -2. The trainer uses best practices embedded by contributors and users - from top AI labs such as Facebook AI Research, NYU, MIT, Stanford, etc... - -3. The trainer allows overriding any key part that you don't want automated. - -| - ------------ - -Basic use ---------- - -This is the basic use of the trainer: - -.. code-block:: python - - model = MyLightningModule() - - trainer = Trainer() - trainer.fit(model, train_dataloader, val_dataloader) - - --------- - -Trainer in Python scripts -------------------------- -In Python scripts, it's recommended you use a main function to call the Trainer. - -.. code-block:: python - - from argparse import ArgumentParser - - def main(hparams): - model = LightningModule() - trainer = Trainer(gpus=hparams.gpus) - trainer.fit(model) - - if __name__ == '__main__': - parser = ArgumentParser() - parser.add_argument('--gpus', default=None) - args = parser.parse_args() - - main(args) - -So you can run it like so: - -.. code-block:: bash - - python main.py --gpus 2 - -.. note:: - - Pro-tip: You don't need to define all flags manually. Lightning can add them automatically - -.. code-block:: python - - from argparse import ArgumentParser - - def main(args): - model = LightningModule() - trainer = Trainer.from_argparse_args(args) - trainer.fit(model) - - if __name__ == '__main__': - parser = ArgumentParser() - parser = Trainer.add_argparse_args(parser) - args = parser.parse_args() - - main(args) - -So you can run it like so: - -.. 
code-block:: bash - - python main.py --gpus 2 --max_steps 10 --limit_train_batches 10 --any_trainer_arg x - -.. note:: - If you want to stop a training run early, you can press "Ctrl + C" on your keyboard. - The trainer will catch the `KeyboardInterrupt` and attempt a graceful shutdown, including - running callbacks such as `on_train_end`. The trainer object will also set an attribute - `interrupted` to `True` in such cases. If you have a callback which shuts down compute - resources, for example, you can conditionally run the shutdown logic for only uninterrupted runs. - ------------- - -Testing -------- -Once you're done training, feel free to run the test set! -(Only right before publishing your paper or pushing to production) - -.. code-block:: python - - trainer.test(test_dataloader=test_dataloader) - ------------- - -Deployment / prediction ------------------------ -You just trained a LightningModule which is also just a torch.nn.Module. -Use it to do whatever! - -.. code-block:: python - - # load model - pretrained_model = LightningModule.load_from_checkpoint(PATH) - pretrained_model.freeze() - - # use it for finetuning - def forward(self, x): - features = pretrained_model(x) - classes = classifier(features) - - # or for prediction - out = pretrained_model(x) - api_write({'response': out} - - -You may wish to run the model on a variety of devices. Instead of moving the data -manually to the correct device, decorate the forward method (or any other method you use for inference) -with :func:`~pytorch_lightning.core.decorators.auto_move_data` and Lightning will take care of the rest. - ------------- - -Reproducibility ---------------- - -To ensure full reproducibility from run to run you need to set seeds for pseudo-random generators, -and set ``deterministic`` flag in ``Trainer``. - -Example:: - - from pytorch_lightning import Trainer, seed_everything - - seed_everything(42) - # sets seeds for numpy, torch, python.random and PYTHONHASHSEED. - model = Model() - trainer = Trainer(deterministic=True) - - -------- - -Trainer flags -------------- - -accelerator -^^^^^^^^^^^ - -.. raw:: html - - - -| - -The accelerator backend to use (previously known as distributed_backend). - -- (```dp```) is DataParallel (split batch among GPUs of same machine) -- (```ddp```) is DistributedDataParallel (each gpu on each node trains, and syncs grads) -- (```ddp_cpu```) is DistributedDataParallel on CPU (same as `ddp`, but does not use GPUs. - Useful for multi-node CPU training or single-node debugging. Note that this will **not** give - a speedup on a single node, since Torch already makes effient use of multiple CPUs on a single - machine.) -- (```ddp2```) dp on node, ddp across nodes. Useful for things like increasing - the number of negative samples - -.. testcode:: - - # default used by the Trainer - trainer = Trainer(accelerator=None) - -Example:: - - # dp = DataParallel - trainer = Trainer(gpus=2, accelerator='dp') - - # ddp = DistributedDataParallel - trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp') - - # ddp2 = DistributedDataParallel + dp - trainer = Trainer(gpus=2, num_nodes=2, accelerator='ddp2') - -.. note:: this option does not apply to TPU. TPUs use ```ddp``` by default (over each core) - -You can also modify hardware behavior by subclassing an existing accelerator to adjust for your needs. - -Example:: - - class MyOwnDDP(DDPAccelerator): - ... - - Trainer(accelerator=MyOwnDDP()) - -.. 
warning:: Passing in custom accelerators is experimental but work is in progress to enable full compatibility.
-
-accumulate_grad_batches
-^^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Accumulates grads every k batches or as set up in the dict.
-The Trainer also calls ``optimizer.step()`` for the final accumulation window, even
-when it contains fewer than k batches.
-
-.. testcode::
-
-    # default used by the Trainer (no accumulation)
-    trainer = Trainer(accumulate_grad_batches=1)
-
-Example::
-
-    # accumulate every 4 batches (effective batch size is batch*4)
-    trainer = Trainer(accumulate_grad_batches=4)
-
-    # no accumulation for epochs 1-4. accumulate 3 for epochs 5-10. accumulate 20 after that
-    trainer = Trainer(accumulate_grad_batches={5: 3, 10: 20})
-
-amp_backend
-^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Use PyTorch AMP ('native') (available PyTorch 1.6+), or NVIDIA apex ('apex').
-
-.. testcode::
-
-    # using PyTorch built-in AMP, default used by the Trainer
-    trainer = Trainer(amp_backend='native')
-
-    # using NVIDIA Apex
-    trainer = Trainer(amp_backend='apex')
-
-amp_level
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-The optimization level to use (O1, O2, etc...)
-for 16-bit GPU precision (using NVIDIA apex under the hood).
-
-Check the `NVIDIA apex docs `_ for the available levels.
-
-Example::
-
-    # default used by the Trainer
-    trainer = Trainer(amp_level='O2')
-
-automatic_optimization
-^^^^^^^^^^^^^^^^^^^^^^
-When set to False, Lightning does not automate the optimization process. This means you are responsible for your own
-optimizer behavior.
-
-Example::
-
-    def training_step(self, batch, batch_idx):
-        opt = self.optimizers()
-
-        loss = ...
-        self.manual_backward(loss, opt)
-        opt.step()
-        opt.zero_grad()
-
-This is not recommended when using a single optimizer; it is intended for expert users
-running 2+ optimizers. Most useful for research like RL, sparse coding and GAN research.
-
-In the multi-optimizer case, ignore the optimizer_idx flag and use the optimizers directly.
-
-Example::
-
-    def training_step(self, batch, batch_idx, optimizer_idx):
-        (opt_a, opt_b) = self.optimizers()
-
-        gen_loss = ...
-        self.manual_backward(gen_loss, opt_a)
-        opt_a.step()
-        opt_a.zero_grad()
-
-        disc_loss = ...
-        self.manual_backward(disc_loss, opt_b)
-        opt_b.step()
-        opt_b.zero_grad()
-
-auto_scale_batch_size
-^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Automatically tries to find the largest batch size that fits into memory,
-before any training.
-
-.. code-block::
-
-    # default used by the Trainer (no scaling of batch size)
-    trainer = Trainer(auto_scale_batch_size=None)
-
-    # run batch size scaling, result overrides hparams.batch_size
-    trainer = Trainer(auto_scale_batch_size='binsearch')
-
-    # call tune to find the batch size
-    trainer.tune(model)
-
-auto_select_gpus
-^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-If enabled and `gpus` is an integer, pick available gpus automatically.
-This is especially useful when GPUs are configured to be in "exclusive mode",
-such that only one process at a time can access them.
-
-Example::
-
-    # no auto selection (picks first 2 gpus on system, may fail if other process is occupying)
-    trainer = Trainer(gpus=2, auto_select_gpus=False)
-
-    # enable auto selection (will find two available gpus on system)
-    trainer = Trainer(gpus=2, auto_select_gpus=True)
-
-    # specifies all GPUs regardless of their availability
-    Trainer(gpus=-1, auto_select_gpus=False)
-
-    # specifies all available GPUs (if only one GPU is not occupied, uses one gpu)
-    Trainer(gpus=-1, auto_select_gpus=True)
-
-auto_lr_find
-^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Runs a learning rate finder algorithm (see this `paper `_)
-when calling trainer.tune(), to find the optimal initial learning rate.
-
-.. code-block:: python
-
-    # default used by the Trainer (no learning rate finder)
-    trainer = Trainer(auto_lr_find=False)
-
-Example::
-
-    # run learning rate finder, results override hparams.learning_rate
-    trainer = Trainer(auto_lr_find=True)
-
-    # call tune to find the lr
-    trainer.tune(model)
-
-Example::
-
-    # run learning rate finder, results override hparams.my_lr_arg
-    trainer = Trainer(auto_lr_find='my_lr_arg')
-
-    # call tune to find the lr
-    trainer.tune(model)
-
-.. note::
-    See the :ref:`learning rate finder guide `.
-
-benchmark
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-If true enables cudnn.benchmark.
-This flag is likely to increase the speed of your system if your
-input sizes don't change. However, if they do change, it will likely
-make your system slower.
-
-The speedup comes from allowing the cudnn auto-tuner to find the best
-algorithm for the hardware `[see discussion here]
-`_.
-
-Example::
-
-    # default used by the Trainer
-    trainer = Trainer(benchmark=False)
-
-deterministic
-^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-If true enables cudnn.deterministic.
-Might make your system slower, but ensures reproducibility.
-Also sets ``$HOROVOD_FUSION_THRESHOLD=0``.
-
-For more info check `[pytorch docs]
-`_.
-
-Example::
-
-    # default used by the Trainer
-    trainer = Trainer(deterministic=False)
-
-callbacks
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Add a list of :class:`~pytorch_lightning.callbacks.Callback`.
-
-.. code-block:: python
-
-    # a list of callbacks
-    callbacks = [PrintCallback()]
-    trainer = Trainer(callbacks=callbacks)
-
-Example::
-
-    from pytorch_lightning.callbacks import Callback
-
-    class PrintCallback(Callback):
-        def on_train_start(self, trainer, pl_module):
-            print("Training is started!")
-        def on_train_end(self, trainer, pl_module):
-            print("Training is done.")
-
-check_val_every_n_epoch
-^^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Check val every n train epochs.
-
-Example::
-
-    # default used by the Trainer
-    trainer = Trainer(check_val_every_n_epoch=1)
-
-    # run val loop every 10 training epochs
-    trainer = Trainer(check_val_every_n_epoch=10)
-
-checkpoint_callback
-^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-By default Lightning saves a checkpoint for you in your current working directory, with the state of your last training epoch.
-Checkpoints capture the exact value of all parameters used by a model.
-To disable automatic checkpointing, set this to `False`.
-
-.. 
code-block:: python
-
-    # default used by Trainer
-    trainer = Trainer(checkpoint_callback=True)
-
-    # turn off automatic checkpointing
-    trainer = Trainer(checkpoint_callback=False)
-
-
-You can override the default behavior by initializing the :class:`~pytorch_lightning.callbacks.ModelCheckpoint`
-callback, and adding it to the :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks` list.
-See :ref:`Saving and Loading Weights ` for how to customize checkpointing.
-
-
-.. warning:: Passing a ModelCheckpoint instance to this argument is deprecated since
-    v1.1.0 and will be unsupported from v1.3.0.
-
-
-default_root_dir
-^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Default path for logs and weights when no logger or
-:class:`pytorch_lightning.callbacks.ModelCheckpoint` callback passed. On
-certain clusters you might want to separate where logs and checkpoints are
-stored. If you don't, then use this argument for convenience. Paths can be local
-paths or remote paths such as `s3://bucket/path` or `hdfs://path/`. Credentials
-will need to be set up to use remote filepaths.
-
-Example::
-
-    # default used by the Trainer
-    trainer = Trainer(default_root_dir=os.getcwd())
-
-distributed_backend
-^^^^^^^^^^^^^^^^^^^
-This has been renamed "accelerator".
-
-fast_dev_run
-^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-.. raw:: html
-
-
-|
-
-Runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).
-
-Under the hood the pseudocode looks like this:
-
-.. code-block:: python
-
-    # loading
-    __init__()
-    prepare_data
-
-    # test training step
-    training_batch = next(train_dataloader)
-    training_step(training_batch)
-
-    # test val step
-    val_batch = next(val_dataloader)
-    out = validation_step(val_batch)
-    validation_epoch_end([out])
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(fast_dev_run=False)
-
-    # runs 1 train, val, test batch and program ends
-    trainer = Trainer(fast_dev_run=True)
-
-gpus
-^^^^
-
-.. raw:: html
-
-
-|
-
-- Number of GPUs to train on (int)
-- or which GPUs to train on (list)
-- can handle strings
-
-.. testcode::
-
-    # default used by the Trainer (ie: train on CPU)
-    trainer = Trainer(gpus=None)
-
-    # equivalent
-    trainer = Trainer(gpus=0)
-
-Example::
-
-    # int: train on 2 gpus
-    trainer = Trainer(gpus=2)
-
-    # list: train on GPUs 1, 4 (by bus ordering)
-    trainer = Trainer(gpus=[1, 4])
-    trainer = Trainer(gpus='1, 4')  # equivalent
-
-    # -1: train on all gpus
-    trainer = Trainer(gpus=-1)
-    trainer = Trainer(gpus='-1')  # equivalent
-
-    # combine with num_nodes to train on multiple GPUs across nodes
-    # uses 8 gpus in total
-    trainer = Trainer(gpus=2, num_nodes=4)
-
-    # train only on GPUs 1 and 4 across nodes
-    trainer = Trainer(gpus=[1, 4], num_nodes=4)
-
-See Also:
-    - :ref:`Multi-GPU training guide `.
-
-gradient_clip_val
-^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Gradient clipping value
-
-- 0 means don't clip.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(gradient_clip_val=0.0)
-
-
-limit_test_batches
-^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-How much of test dataset to check.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(limit_test_batches=1.0)
-
-    # run through only 25% of the test set each epoch
-    trainer = Trainer(limit_test_batches=0.25)
-
-    # run for only 10 batches
-    trainer = Trainer(limit_test_batches=10)
-
-In the case of multiple test dataloaders, the limit applies to each dataloader individually.
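-
-These ``limit_*`` flags can be combined. A hedged sketch of a quick smoke-test
-configuration (the fractions below are arbitrary, chosen only for illustration):
-
-.. code-block:: python
-
-    # tiny run over a fraction of every split while debugging
-    trainer = Trainer(
-        limit_train_batches=0.1,
-        limit_val_batches=0.05,
-        limit_test_batches=0.05,
-    )
-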
-limit_val_batches
-^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-How much of validation dataset to check.
-Useful when debugging or testing something that happens at the end of an epoch.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(limit_val_batches=1.0)
-
-    # run through only 25% of the validation set each epoch
-    trainer = Trainer(limit_val_batches=0.25)
-
-    # run for only 10 batches
-    trainer = Trainer(limit_val_batches=10)
-
-In the case of multiple validation dataloaders, the limit applies to each dataloader individually.
-
-log_gpu_memory
-^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Options:
-
-- None
-- 'min_max'
-- 'all'
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(log_gpu_memory=None)
-
-    # log all the GPUs (on master node only)
-    trainer = Trainer(log_gpu_memory='all')
-
-    # log only the min and max memory on the master node
-    trainer = Trainer(log_gpu_memory='min_max')
-
-.. note:: Might slow performance because it uses the output of nvidia-smi.
-
-flush_logs_every_n_steps
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Writes logs to disk this often.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(flush_logs_every_n_steps=100)
-
-See Also:
-    - :ref:`logging`
-
-logger
-^^^^^^
-
-.. raw:: html
-
-
-|
-
-:ref:`Logger ` (or iterable collection of loggers) for experiment tracking.
-
-.. testcode::
-
-    from pytorch_lightning.loggers import TensorBoardLogger
-
-    # default logger used by trainer
-    logger = TensorBoardLogger(
-        save_dir=os.getcwd(),
-        version=1,
-        name='lightning_logs'
-    )
-    Trainer(logger=logger)
-
-max_epochs
-^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Stop training once this number of epochs is reached.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(max_epochs=1000)
-
-min_epochs
-^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Force training for at least this many epochs.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(min_epochs=1)
-
-max_steps
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Stop training after this number of steps.
-Training stops when either max_steps or max_epochs is reached, whichever comes first.
-
-.. testcode::
-
-    # Default (disabled)
-    trainer = Trainer(max_steps=None)
-
-    # Stop after 100 steps
-    trainer = Trainer(max_steps=100)
-
-min_steps
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Force training for at least this number of steps.
-The Trainer trains the model for at least min_steps or min_epochs, whichever ends later.
-
-.. testcode::
-
-    # Default (disabled)
-    trainer = Trainer(min_steps=None)
-
-    # Run at least for 100 steps (disable min_epochs)
-    trainer = Trainer(min_steps=100, min_epochs=0)
-
-num_nodes
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Number of GPU nodes for distributed training.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(num_nodes=1)
-
-    # to train on 8 nodes
-    trainer = Trainer(num_nodes=8)
-
-num_processes
-^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Number of processes to train with. Automatically set to the number of GPUs
-when using ``accelerator="ddp"``. Set to a number greater than 1 when
-using ``accelerator="ddp_cpu"`` to mimic distributed training on a
-machine without GPUs. This is useful for debugging, but **will not** provide
-any speedup, since single-process Torch already makes efficient use of multiple
-CPUs.
-
-.. testcode::
-
-    # Simulate DDP for debugging on your GPU-less laptop
-    trainer = Trainer(accelerator="ddp_cpu", num_processes=2)
-
-num_sanity_val_steps
-^^^^^^^^^^^^^^^^^^^^
-
-.. 
raw:: html
-
-
-|
-
-Sanity check runs n batches of val before starting the training routine.
-This catches any bugs in your validation without having to wait for the first validation check.
-The Trainer uses 2 steps by default. Turn it off or modify it here.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(num_sanity_val_steps=2)
-
-    # turn it off
-    trainer = Trainer(num_sanity_val_steps=0)
-
-    # check all validation data
-    trainer = Trainer(num_sanity_val_steps=-1)
-
-
-This option will reset the validation dataloader unless ``num_sanity_val_steps=0``.
-
-
-plugins
-^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Plugins allow you to connect arbitrary backends, precision libraries, SLURM, etc... For example:
-
-- DDP
-- SLURM
-- TorchElastic
-- Apex
-
-To define your own behavior, subclass the relevant class and pass it in. Here's an example linking up your own cluster.
-
-.. code-block:: python
-
-    from pytorch_lightning.cluster_environments import ClusterEnvironment
-
-    class MyCluster(ClusterEnvironment):
-
-        def master_address(self):
-            return your_master_address
-
-        def master_port(self):
-            return your_master_port
-
-        def world_size(self):
-            return the_world_size
-
-    trainer = Trainer(plugins=[MyCluster()])
-
-prepare_data_per_node
-^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-If True, will call `prepare_data()` on LOCAL_RANK=0 for every node.
-If False, will only call from NODE_RANK=0, LOCAL_RANK=0.
-
-.. testcode::
-
-    # default
-    Trainer(prepare_data_per_node=True)
-
-    # use only NODE_RANK=0, LOCAL_RANK=0
-    Trainer(prepare_data_per_node=False)
-
-tpu_cores
-^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-- How many TPU cores to train on (1 or 8).
-- Which TPU core to train on [1-8]
-
-A single TPU v2 or v3 has 8 cores. A TPU pod has
-up to 2048 cores. A slice of a pod means you get as many cores
-as you request.
-
-Your effective batch size is batch_size * total tpu cores.
-
-.. note:: No need to add a DistributedDataSampler, Lightning automatically does it for you.
-
-This parameter can be either 1 or 8.
-
-.. testcode::
-
-    # your_trainer_file.py
-
-    # default used by the Trainer (ie: train on CPU)
-    trainer = Trainer(tpu_cores=None)
-
-    # int: train on a single core
-    trainer = Trainer(tpu_cores=1)
-
-    # list: train on a single selected core
-    trainer = Trainer(tpu_cores=[2])
-
-    # int: train on all 8 cores
-    trainer = Trainer(tpu_cores=8)
-
-    # for 8+ cores must submit via xla script with
-    # a max of 8 cores specified. The XLA script
-    # will duplicate script onto each TPU in the POD
-    trainer = Trainer(tpu_cores=8)
-
-To train on more than 8 cores (ie: a POD),
-submit this script using the xla_dist script.
-
-Example::
-
-    python -m torch_xla.distributed.xla_dist
-    --tpu=$TPU_POD_NAME
-    --conda-env=torch-xla-nightly
-    --env=XLA_USE_BF16=1
-    -- python your_trainer_file.py
-
-overfit_batches
-^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Uses this much data of the training set. If nonzero, will use the same training set for validation and testing.
-If the training dataloaders have `shuffle=True`, Lightning will automatically disable it.
-
-Useful for quickly debugging or trying to overfit on purpose.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(overfit_batches=0.0)
-
-    # use only 1% of the train set (and use the train set for val and test)
-    trainer = Trainer(overfit_batches=0.01)
-
-    # overfit on 10 of the same batches
-    trainer = Trainer(overfit_batches=10)
-
-precision
-^^^^^^^^^
-
-.. 
raw:: html
-
-
-|
-
-Full precision (32), half precision (16).
-Can be used on CPU, GPU or TPUs.
-
-If used on TPU, it will use torch.bfloat16, but tensor printing
-will still show torch.float32.
-
-.. testcode::
-    :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE
-
-    # default used by the Trainer
-    trainer = Trainer(precision=32)
-
-    # 16-bit precision
-    trainer = Trainer(precision=16)
-
-Example::
-
-    # one day
-    trainer = Trainer(precision=8|4|2)
-
-process_position
-^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Orders the progress bar. Useful when running multiple trainers on the same node.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(process_position=0)
-
-Note:
-    This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`.
-
-profiler
-^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Use this to profile individual steps during training and identify bottlenecks.
-
-See the :ref:`profiler documentation ` for more details.
-
-.. testcode::
-
-    from pytorch_lightning.profiler import SimpleProfiler, AdvancedProfiler
-
-    # default used by the Trainer
-    trainer = Trainer(profiler=None)
-
-    # to profile standard training events, equivalent to `profiler=SimpleProfiler()`
-    trainer = Trainer(profiler="simple")
-
-    # advanced profiler for function-level stats, equivalent to `profiler=AdvancedProfiler()`
-    trainer = Trainer(profiler="advanced")
-
-progress_bar_refresh_rate
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-How often to refresh progress bar (in steps).
-In notebooks, faster refresh rates (lower number) are known to crash them
-because of their screen refresh rates, so raise it to 50 or more.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(progress_bar_refresh_rate=1)
-
-    # disable progress bar
-    trainer = Trainer(progress_bar_refresh_rate=0)
-
-Note:
-    This argument is ignored if a custom callback is passed to :paramref:`~Trainer.callbacks`.
-
-reload_dataloaders_every_epoch
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Set to True to reload dataloaders every epoch.
-
-.. code-block:: python
-
-    # if False (default)
-    train_loader = model.train_dataloader()
-    for epoch in epochs:
-        for batch in train_loader:
-            ...
-
-    # if True
-    for epoch in epochs:
-        train_loader = model.train_dataloader()
-        for batch in train_loader:
-            ...
-
-replace_sampler_ddp
-^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Enables auto adding of distributed sampler. By default it will add ``shuffle=True``
-for the train sampler and ``shuffle=False`` for the val/test sampler. If you want to customize
-it, you can set ``replace_sampler_ddp=False`` and add your own distributed sampler.
-If ``replace_sampler_ddp=True`` and a distributed sampler was already added,
-Lightning will not replace the existing one.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(replace_sampler_ddp=True)
-
-If you set this to False, you have to add your own distributed sampler:
-
-.. code-block:: python
-
-    # manually add the distributed sampler
-    sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=True)
-    dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)
-
-resume_from_checkpoint
-^^^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-To resume training from a specific checkpoint, pass in the path here.
-
-.. 
testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(resume_from_checkpoint=None)
-
-    # resume from a specific checkpoint
-    trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')
-
-log_every_n_steps
-^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-
-How often to add logging rows (does not write to disk).
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(log_every_n_steps=50)
-
-See Also:
-    - :ref:`logging`
-
-
-sync_batchnorm
-^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Enable synchronization between batchnorm layers across all GPUs.
-
-.. testcode::
-
-    trainer = Trainer(sync_batchnorm=True)
-
-track_grad_norm
-^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-- no tracking (-1)
-- Otherwise tracks that norm (2 for 2-norm)
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(track_grad_norm=-1)
-
-    # track the 2-norm
-    trainer = Trainer(track_grad_norm=2)
-
-limit_train_batches
-^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-How much of training dataset to check.
-Useful when debugging or testing something that happens at the end of an epoch.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(limit_train_batches=1.0)
-
-Example::
-
-    # default used by the Trainer
-    trainer = Trainer(limit_train_batches=1.0)
-
-    # run through only 25% of the training set each epoch
-    trainer = Trainer(limit_train_batches=0.25)
-
-    # run through only 10 batches of the training set each epoch
-    trainer = Trainer(limit_train_batches=10)
-
-truncated_bptt_steps
-^^^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Truncated backprop performs backprop every k steps of
-a much longer sequence.
-
-If this is enabled, your batches will automatically get truncated
-and the trainer will apply Truncated Backprop to it.
-
-(`Williams et al. "An efficient gradient-based algorithm for on-line training of
-recurrent network trajectories."
-`_)
-
-.. testcode::
-
-    # default used by the Trainer (ie: disabled)
-    trainer = Trainer(truncated_bptt_steps=None)
-
-    # backprop every 5 steps in a batch
-    trainer = Trainer(truncated_bptt_steps=5)
-
-.. note:: Make sure your batches have a sequence dimension.
-
-Lightning takes care to split your batch along the time-dimension.
-
-.. code-block:: python
-
-    # we use the second dimension as the time dimension
-    # (batch, time, ...)
-    sub_batch = batch[0, 0:t, ...]
-
-Using this feature requires updating your LightningModule's
-:meth:`pytorch_lightning.core.LightningModule.training_step` to accept a `hiddens` argument
-carrying the hidden state from the previous step:
-
-.. code-block:: python
-
-    # Truncated back-propagation through time
-    def training_step(self, batch, batch_idx, hiddens):
-        # hiddens are the hiddens from the previous truncated backprop step
-        out, hiddens = self.lstm(data, hiddens)
-
-        return {
-            "loss": ...,
-            "hiddens": hiddens  # remember to detach() this
-        }
-
-To modify how the batch is split,
-override :meth:`pytorch_lightning.core.LightningModule.tbptt_split_batch`:
-
-.. testcode::
-
-    class LitMNIST(LightningModule):
-        def tbptt_split_batch(self, batch, split_size):
-            # do your own splitting on the batch
-            return splits
-
-val_check_interval
-^^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-How often within one training epoch to check the validation set.
-Can specify as float or int.
-
-- use (float) to check within a training epoch
-- use (int) to check every n steps (batches)
-
-.. 
testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(val_check_interval=1.0)
-
-    # check validation set 4 times during a training epoch
-    trainer = Trainer(val_check_interval=0.25)
-
-    # check validation set every 1000 training batches
-    # use this when using an IterableDataset and your dataset has no length
-    # (ie: production cases with streaming data)
-    trainer = Trainer(val_check_interval=1000)
-
-
-weights_save_path
-^^^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Directory where weights are saved, if specified.
-
-.. testcode::
-
-    # default used by the Trainer
-    trainer = Trainer(weights_save_path=os.getcwd())
-
-    # save to your custom path
-    trainer = Trainer(weights_save_path='my/path')
-
-Example::
-
-    # if checkpoint callback used, then overrides the weights path
-    # **NOTE: this saves weights to some/path NOT my/path
-    checkpoint = ModelCheckpoint(dirpath='some/path')
-    trainer = Trainer(
-        callbacks=[checkpoint],
-        weights_save_path='my/path'
-    )
-
-weights_summary
-^^^^^^^^^^^^^^^
-
-.. raw:: html
-
-
-|
-
-Prints a summary of the weights when training begins.
-Options: 'full', 'top', None.
-
-.. testcode::
-
-    # default used by the Trainer (ie: print summary of top level modules)
-    trainer = Trainer(weights_summary='top')
-
-    # print full summary of all modules and submodules
-    trainer = Trainer(weights_summary='full')
-
-    # don't print a summary
-    trainer = Trainer(weights_summary=None)
-
-Trainer class API
------------------
 """

From e0bdf8124b7ec91dad19554b413052a5942164c3 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sat, 7 Nov 2020 14:53:04 -0500
Subject: [PATCH 66/88] updated trainer docs (#4570)

---
 docs/source/lightning_module.rst |  6 ++++--
 docs/source/trainer.rst          | 36 ++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst
index d13b449614113..11641fc35e8a0 100644
--- a/docs/source/lightning_module.rst
+++ b/docs/source/lightning_module.rst
@@ -172,10 +172,11 @@ Under the hood, Lightning does the following (pseudocode):
     model.train()
     torch.set_grad_enabled(True)

-    outs = []
+    losses = []
     for batch in train_dataloader:
         # forward
-        out = training_step(val_batch)
+        loss = training_step(batch)
+        losses.append(loss.detach())

         # backward
         loss.backward()
@@ -184,6 +185,7 @@
         optimizer.step()
         optimizer.zero_grad()

+
 Training epoch-level metrics
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 If you want to calculate epoch-level metrics and log them, use the `.log` method
diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst
index d7cbae7a56a3b..0076c84e96a39 100644
--- a/docs/source/trainer.rst
+++ b/docs/source/trainer.rst
@@ -48,6 +48,42 @@ This is the basic use of the trainer:

     trainer = Trainer()
     trainer.fit(model, train_dataloader, val_dataloader)

+--------
+
+Under the hood
+--------------
+Under the hood, the Lightning Trainer handles the training loop details for you. Some examples include:
+
+- Automatically enabling/disabling grads
+- Running the training, validation and test dataloaders
+- Calling the Callbacks at the appropriate times
+- Putting batches and computations on the correct devices
+
+Here's the pseudocode for what the trainer does under the hood (showing the train loop only):
+
+.. 
code-block:: python + + # put model in train mode + model.train() + torch.set_grad_enabled(True) + + losses = [] + for batch in train_dataloader: + # calls hooks like this one + on_train_batch_start() + + # train step + loss = training_step(batch) + + # backward + loss.backward() + + # apply and clear grads + optimizer.step() + optimizer.zero_grad() + + losses.append(loss) + -------- From f63fec9323c319720d78b452e9fe84b97ce7644e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 7 Nov 2020 15:41:02 -0500 Subject: [PATCH 67/88] updated trainer docs (#4571) --- docs/source/new-project.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index fabc5cf6e5766..e5ba47351a98b 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -608,7 +608,13 @@ Here's an example adding a not-so-fancy learning rate decay rule: new_lr_group.append(new_lr) param_group['lr'] = new_lr self.old_lrs[opt_idx] = new_lr_group - + +And pass the callback to the Trainer + +.. code-block:: python + + decay_callback = DecayLearningRate() + trainer = Trainer(callbacks=[decay_callback]) Things you can do with a callback: From 09a51697ed2a437f4b5b41aa7dc6fba18b01d554 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 8 Nov 2020 12:16:22 -0500 Subject: [PATCH 68/88] Adds shortcut for path to log (#4573) * added log_dir shortcut to trainer properties for writing logs * added log_dir shortcut * added log_dir shortcut * added log_dir shortcut * added log_dir shortcut * added log_dir shortcut * added log_dir shortcut * added log_dir shortcut * added log_dir shortcut --- .pyrightconfig.json | 1 + docs/source/loggers.rst | 9 ++ docs/source/trainer.rst | 11 ++ .../callbacks/model_checkpoint.py | 2 +- pytorch_lightning/trainer/properties.py | 23 ++++ tests/trainer/properties/__init__.py | 0 tests/trainer/properties/log_dir.py | 125 ++++++++++++++++++ 7 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 tests/trainer/properties/__init__.py create mode 100644 tests/trainer/properties/log_dir.py diff --git a/.pyrightconfig.json b/.pyrightconfig.json index 3f00d9a3e4454..416b33f6ad6d2 100644 --- a/.pyrightconfig.json +++ b/.pyrightconfig.json @@ -30,6 +30,7 @@ "pytorch_lightning/trainer/training_tricks.py", "pytorch_lightning/trainer/batch_size_scaling.py", "pytorch_lightning/trainer/distrib_data_parallel.py", + "pytorch_lightning/trainer/properties.py", "pytorch_lightning/trainer/lr_scheduler_connector.py", "pytorch_lightning/trainer/training_loop_temp.py", "pytorch_lightning/trainer/connectors/checkpoint_connector.py", diff --git a/docs/source/loggers.rst b/docs/source/loggers.rst index 1c97fa8e2cc3c..a3b85450e6233 100644 --- a/docs/source/loggers.rst +++ b/docs/source/loggers.rst @@ -19,6 +19,15 @@ but you can pass to the :class:`~pytorch_lightning.trainer.trainer.Trainer` any Read more about :ref:`logging` options. +To log arbitrary artifacts like images or audio samples use the `trainer.log_dir` property to resolve +the path. + +.. code-block:: python + + def training_step(self, batch, batch_idx): + img = ... + log_image(img, self.trainer.log_dir) + Comet.ml ======== diff --git a/docs/source/trainer.rst b/docs/source/trainer.rst index 0076c84e96a39..e82b0871ef85b 100644 --- a/docs/source/trainer.rst +++ b/docs/source/trainer.rst @@ -1666,6 +1666,17 @@ The metrics sent to the logger (visualizer). 
logged_metrics = trainer.logged_metrics assert logged_metrics['a_val'] == 2 +log_dir +******* +The directory for the current experiment. Use this to save images to, etc... + +.. code-block:: python + + def training_step(self, batch, batch_idx): + img = ... + save_img(img, self.trainer.log_dir) + + is_global_zero ************** diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 413e96a39afcd..0d8d4cec81518 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -291,7 +291,7 @@ def __init_ckpt_dir(self, filepath, dirpath, filename, save_top_k): if dirpath and self._fs.protocol == 'file': dirpath = os.path.realpath(dirpath) - self.dirpath = dirpath or None + self.dirpath: Union[str, None] = dirpath or None self.filename = filename or None def __init_monitor_mode(self, monitor, mode): diff --git a/pytorch_lightning/trainer/properties.py b/pytorch_lightning/trainer/properties.py index 42b5f7a36641c..af06b1bbc1352 100644 --- a/pytorch_lightning/trainer/properties.py +++ b/pytorch_lightning/trainer/properties.py @@ -26,6 +26,9 @@ from pytorch_lightning.utilities import argparse_utils from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.model_utils import is_overridden +from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.loggers.tensorboard import TensorBoardLogger class TrainerProperties(ABC): @@ -44,10 +47,30 @@ class TrainerProperties(ABC): limit_val_batches: int _default_root_dir: str _weights_save_path: str + default_root_path: str + accelerator_backend: Accelerator + logger: LightningLoggerBase model_connector: ModelConnector checkpoint_connector: CheckpointConnector callbacks: List[Callback] + @property + def log_dir(self): + if self.checkpoint_callback is not None: + dir = self.checkpoint_callback.dirpath + dir = os.path.split(dir)[0] + elif self.logger is not None: + if isinstance(self.logger, TensorBoardLogger): + dir = self.logger.log_dir + else: + dir = self.logger.save_dir + else: + dir = self._default_root_dir + + if self.accelerator_backend is not None: + dir = self.accelerator_backend.broadcast(dir) + return dir + @property def use_amp(self) -> bool: return self.precision == 16 diff --git a/tests/trainer/properties/__init__.py b/tests/trainer/properties/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/trainer/properties/log_dir.py b/tests/trainer/properties/log_dir.py new file mode 100644 index 0000000000000..021bb04a7c917 --- /dev/null +++ b/tests/trainer/properties/log_dir.py @@ -0,0 +1,125 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import torch +import pytest +from tests.base.boring_model import BoringModel, RandomDataset +from pytorch_lightning import Trainer +from pytorch_lightning.utilities import APEX_AVAILABLE +from pytorch_lightning.utilities.exceptions import MisconfigurationException + + +def test_logdir(tmpdir): + """ + Tests that the path is correct when checkpoint and loggers are used + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + + expected = os.path.join(self.trainer.default_root_dir, 'lightning_logs', 'version_0') + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + ) + + trainer.fit(model) + + +def test_logdir_no_checkpoint_cb(tmpdir): + """ + Tests that the path is correct with no checkpoint + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + expected = os.path.join(self.trainer.default_root_dir, 'lightning_logs', 'version_0') + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + checkpoint_callback=False + ) + + trainer.fit(model) + + +def test_logdir_no_logger(tmpdir): + """ + Tests that the path is correct even when there is no logger + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + expected = os.path.join(self.trainer.default_root_dir) + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + logger=False, + ) + + trainer.fit(model) + + +def test_logdir_no_logger_no_checkpoint(tmpdir): + """ + Tests that the path is correct even when there is no logger + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + expected = os.path.join(self.trainer.default_root_dir) + assert self.trainer.log_dir == expected + return {"loss": loss} + + model = TestModel() + + limit_train_batches = 2 + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + logger=False, + checkpoint_callback=False + ) + + trainer.fit(model) From 0f64f15f525ae887bb1db7b643684d6f4e1b2d1e Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 8 Nov 2020 14:28:55 -0500 Subject: [PATCH 69/88] ref: unify slurm and TE under backendPlugin 1/n (#4578) * ref: unify slurm and TE under backendPlugin * ref: unify slurm and TE under backendPlugin --- .../accelerators/ddp2_accelerator.py | 13 +---------- .../accelerators/ddp_cpu_slurm_accelerator.py | 4 ++-- .../ddp_cpu_torchelastic_accelerator.py | 4 ++-- .../accelerators/ddp_slurm_accelerator.py | 11 +++------- .../ddp_torchelastic_accelerator.py | 7 +++--- .../cluster_environment.py | 4 ++++ .../cluster_environments/slurm_environment.py | 3 +++ .../torchelastic_environment.py | 3 +++ tests/backends/test_accelerator_connector.py | 22 ++++++++++++++----- 9 
files changed, 38 insertions(+), 33 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 452c15ba3bc38..db4ccbda01bf0 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -46,19 +46,8 @@ def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): self.nickname = 'ddp2' def setup(self, model): - self._resolve_task_idx() self.trainer.model = model - - def _resolve_task_idx(self): - if self.trainer.is_slurm_managing_tasks: - self.task_idx = int(os.environ['SLURM_LOCALID']) - else: - # torchelastic or general non_slurm ddp2 - try: - self.task_idx = int(os.environ['LOCAL_RANK']) - except Exception as exp: - m = 'ddp2 only works in SLURM or via torchelastic with the WORLD_SIZE, LOCAL_RANK, GROUP_RANK flags' - raise MisconfigurationException(m) from exp + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model diff --git a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py index c80e8a4ec355c..dbaa4a244f629 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py @@ -53,7 +53,7 @@ def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): def setup(self, model): self.trainer.model = model - self.task_idx = int(os.environ['SLURM_LOCALID']) + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model @@ -118,7 +118,7 @@ def ddp_train(self, process_idx, model): self.set_world_ranks(process_idx) # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # set warning rank diff --git a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py index a90d7750eaeea..016468a4517cf 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py @@ -52,7 +52,7 @@ def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): def setup(self, model): self.trainer.model = model - self.task_idx = int(os.environ['LOCAL_RANK']) + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model @@ -117,7 +117,7 @@ def ddp_train(self, process_idx, model): self.set_world_ranks(process_idx) # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # set warning rank diff --git a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py index 1ea4461c3c3cc..6bdd1930ecfe0 100644 --- a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_slurm_accelerator.py @@ -25,7 +25,6 @@ from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available -from pytorch_lightning.utilities.seed import 
seed_everything try: from hydra.utils import to_absolute_path, get_original_cwd @@ -52,7 +51,7 @@ def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): def setup(self, model): self.trainer.model = model - self.task_idx = int(os.environ['SLURM_LOCALID']) + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model @@ -88,7 +87,7 @@ def test_step(self, args): output = self.training_step(args) return output - def barrier(self, name: str = None): + def barrier(self, name: Optional[str] = None): if torch_distrib.is_initialized(): torch_distrib.barrier() @@ -115,15 +114,11 @@ def ddp_train(self, process_idx, model): Dict with evaluation results """ - seed = os.environ.get("PL_GLOBAL_SEED") - if seed is not None: - seed_everything(int(seed)) - # determine which process we are and world size self.set_world_ranks(process_idx) # toggle prog bar - if self.trainer.global_rank != 0 and self.trainer.progress_bar_callback is not None: + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # set warning rank diff --git a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py index e54ad905de80e..53e784ee949d4 100644 --- a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py @@ -24,8 +24,7 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning.utilities.distributed import sync_ddp_if_available +from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available try: @@ -53,7 +52,7 @@ def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): def setup(self, model): self.trainer.model = model - self.task_idx = int(os.environ['LOCAL_RANK']) + self.task_idx = self.cluster_environment.local_rank() def train(self): model = self.trainer.model @@ -120,7 +119,7 @@ def ddp_train(self, process_idx, model): self.set_world_ranks(process_idx) # toggle prog bar - if self.trainer.global_rank == 0 and self.trainer.progress_bar_callback is not None: + if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: self.trainer.progress_bar_callback.disable() # set warning rank diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index ff3436e66204c..08fbbf4095ca3 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ class ClusterEnvironment: def __init__(self): @@ -25,3 +26,6 @@ def master_port(self): def world_size(self): return self._world_size + + def local_rank(self): + pass diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 44cdc2207899c..6df1cf680c57f 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -67,6 +67,9 @@ def master_port(self): def world_size(self): return self._world_size + def local_rank(self): + return int(os.environ['SLURM_LOCALID']) + def _resolve_root_node_address(self, root_node): if '[' in root_node: name, numbers = root_node.split('[', maxsplit=1) diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index d50a10a782dbb..a4d769518d252 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -46,3 +46,6 @@ def master_port(self): def world_size(self): return os.environ.get('WORLD_SIZE') + + def local_rank(self): + return int(os.environ['LOCAL_RANK']) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index cbc96b0793062..57ccdfc594238 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -104,7 +104,7 @@ def on_fit_start(self, trainer, pl_module): "SLURM_NTASKS": "2", "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", - "SLURM_LOCALID": "0" + "SLURM_LOCALID": "10" }) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp_slurm(tmpdir): @@ -113,6 +113,8 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert isinstance(trainer.accelerator_backend, accelerators.DDPSLURMAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx raise SystemExit() model = BoringModel() @@ -133,7 +135,7 @@ def on_fit_start(self, trainer, pl_module): "SLURM_JOB_NAME": "SOME_NAME", "SLURM_NODEID": "0", "LOCAL_RANK": "0", - "SLURM_LOCALID": "0" + "SLURM_LOCALID": "10" }) @mock.patch('torch.cuda.device_count', return_value=2) def test_accelerator_choice_ddp2_slurm(tmpdir): @@ -142,6 +144,9 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + raise SystemExit() model = BoringModel() @@ -159,7 +164,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2", - "LOCAL_RANK": "0", + "LOCAL_RANK": "10", "NODE_RANK": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) @@ -169,6 +174,8 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert isinstance(trainer.accelerator_backend, accelerators.DDPTorchElasticAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) + assert 
trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx raise SystemExit() model = BoringModel() @@ -186,7 +193,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { "CUDA_VISIBLE_DEVICES": "0,1", "WORLD_SIZE": "2", - "LOCAL_RANK": "0", + "LOCAL_RANK": "10", "NODE_RANK": "0" }) @mock.patch('torch.cuda.device_count', return_value=2) @@ -196,6 +203,8 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp2 assert isinstance(trainer.accelerator_backend, accelerators.DDP2Accelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx raise SystemExit() model = BoringModel() @@ -212,7 +221,7 @@ def on_fit_start(self, trainer, pl_module): @mock.patch.dict(os.environ, { "WORLD_SIZE": "1", - "LOCAL_RANK": "0", + "LOCAL_RANK": "10", "NODE_RANK": "0" }) @mock.patch('torch.cuda.device_count', return_value=0) @@ -222,6 +231,9 @@ def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUTorchElasticAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) + assert trainer.accelerator_backend.task_idx == 10 + assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx + raise SystemExit() model = BoringModel() From bfaf014096c01a464ed14402f4b8c2e6f4f716c5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 8 Nov 2020 15:07:16 -0500 Subject: [PATCH 70/88] ref: unify slurm and TE under backendPlugin 2/n (#4580) --- pytorch_lightning/accelerators/__init__.py | 3 +- .../accelerators/accelerator_connector.py | 4 +- ..._accelerator.py => ddp_hpc_accelerator.py} | 8 +- .../ddp_torchelastic_accelerator.py | 209 ------------------ tests/backends/test_accelerator_connector.py | 4 +- 5 files changed, 7 insertions(+), 221 deletions(-) rename pytorch_lightning/accelerators/{ddp_slurm_accelerator.py => ddp_hpc_accelerator.py} (96%) delete mode 100644 pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 65f3161675e4f..13709753da7df 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -20,8 +20,7 @@ from pytorch_lightning.accelerators.gpu_accelerator import GPUAccelerator from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator -from pytorch_lightning.accelerators.ddp_slurm_accelerator import DDPSLURMAccelerator -from pytorch_lightning.accelerators.ddp_torchelastic_accelerator import DDPTorchElasticAccelerator +from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator from pytorch_lightning.accelerators.ddp_cpu_torchelastic_accelerator import DDPCPUTorchElasticAccelerator from pytorch_lightning.accelerators.ddp_cpu_slurm_accelerator import DDPCPUSLURMAccelerator from pytorch_lightning.accelerators.accelerator import Accelerator diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index df70d90755632..173ff971f063b 100644 --- 
a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -227,7 +227,7 @@ def select_accelerator(self): ) elif use_slurm_ddp: - accelerator_backend = accelerators.DDPSLURMAccelerator( + accelerator_backend = accelerators.DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin @@ -241,7 +241,7 @@ def select_accelerator(self): ) elif use_torchelastic_ddp: - accelerator_backend = accelerators.DDPTorchElasticAccelerator( + accelerator_backend = accelerators.DDPHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin diff --git a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py similarity index 96% rename from pytorch_lightning/accelerators/ddp_slurm_accelerator.py rename to pytorch_lightning/accelerators/ddp_hpc_accelerator.py index 6bdd1930ecfe0..0e0822d5747d8 100644 --- a/pytorch_lightning/accelerators/ddp_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -26,6 +26,7 @@ from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available + try: from hydra.utils import to_absolute_path, get_original_cwd from hydra.core.hydra_config import HydraConfig @@ -35,12 +36,7 @@ HYDRA_AVAILABLE = True -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE SLURM FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# ------------------------------------------- -class DDPSLURMAccelerator(Accelerator): +class DDPHPCAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): super().__init__(trainer, cluster_environment, ddp_plugin) diff --git a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py deleted file mode 100644 index 53e784ee949d4..0000000000000 --- a/pytorch_lightning/accelerators/ddp_torchelastic_accelerator.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available - - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - - -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE SLURM FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# ------------------------------------------- -class DDPTorchElasticAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self._has_spawned_children = False - self.dist = LightningDistributed() - self.nickname = 'ddp' - - def setup(self, model): - self.trainer.model = model - self.task_idx = self.cluster_environment.local_rank() - - def train(self): - model = self.trainer.model - self.ddp_train(process_idx=self.task_idx, model=model) - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - self.trainer.root_gpu = process_idx - torch.cuda.set_device(self.trainer.root_gpu) - model.cuda(self.trainer.root_gpu) - - def get_device_ids(self): - device_ids = [self.trainer.root_gpu] - return device_ids - - def training_step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def validation_step(self, args): - output = self.training_step(args) - return output - - def test_step(self, args): - output = self.training_step(args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - dist.all_reduce(stop, op=dist.reduce_op.SUM) - dist.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # toggle prog bar - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank 
- - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (on SLURM)') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. 
- - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index 57ccdfc594238..a0349096cb1f5 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -111,7 +111,7 @@ def test_accelerator_choice_ddp_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) assert trainer.accelerator_backend.task_idx == 10 assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx @@ -172,7 +172,7 @@ def test_accelerator_choice_ddp_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPTorchElasticAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) assert trainer.accelerator_backend.task_idx == 10 assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx From 624f5b59388703bf6cceecdb592b94e5278d9029 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 8 Nov 2020 15:32:37 -0500 Subject: [PATCH 71/88] ref: unify slurm and TE under backendPlugin 3/n (#4581) --- pytorch_lightning/accelerators/__init__.py | 3 +- .../accelerators/accelerator_connector.py | 4 +- ...elerator.py => ddp_cpu_hpc_accelerator.py} | 7 +- .../ddp_cpu_torchelastic_accelerator.py | 207 ------------------ tests/backends/test_accelerator_connector.py | 8 +- 5 files changed, 8 insertions(+), 221 deletions(-) rename pytorch_lightning/accelerators/{ddp_cpu_slurm_accelerator.py => ddp_cpu_hpc_accelerator.py} (96%) delete mode 100644 pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py diff --git a/pytorch_lightning/accelerators/__init__.py b/pytorch_lightning/accelerators/__init__.py index 13709753da7df..ec6e90caa6452 100644 --- a/pytorch_lightning/accelerators/__init__.py +++ b/pytorch_lightning/accelerators/__init__.py @@ -21,6 +21,5 @@ from pytorch_lightning.accelerators.tpu_accelerator import TPUAccelerator from pytorch_lightning.accelerators.horovod_accelerator import HorovodAccelerator from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator -from pytorch_lightning.accelerators.ddp_cpu_torchelastic_accelerator import DDPCPUTorchElasticAccelerator -from pytorch_lightning.accelerators.ddp_cpu_slurm_accelerator import DDPCPUSLURMAccelerator +from pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator import DDPCPUHPCAccelerator from pytorch_lightning.accelerators.accelerator import Accelerator diff --git a/pytorch_lightning/accelerators/accelerator_connector.py b/pytorch_lightning/accelerators/accelerator_connector.py index 173ff971f063b..f8d90945e9e77 100644 --- 
a/pytorch_lightning/accelerators/accelerator_connector.py +++ b/pytorch_lightning/accelerators/accelerator_connector.py @@ -220,7 +220,7 @@ def select_accelerator(self): ) elif use_ddp_cpu_slurm: - accelerator_backend = accelerators.DDPCPUSLURMAccelerator( + accelerator_backend = accelerators.DDPCPUHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin @@ -234,7 +234,7 @@ def select_accelerator(self): ) elif use_ddp_cpu_torch_elastic: - accelerator_backend = accelerators.DDPCPUTorchElasticAccelerator( + accelerator_backend = accelerators.DDPCPUHPCAccelerator( self.trainer, cluster_env, self.trainer.plugin_connector.ddp_plugin diff --git a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py similarity index 96% rename from pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py rename to pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index dbaa4a244f629..4b04c17d505e5 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_slurm_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -37,12 +37,7 @@ HYDRA_AVAILABLE = True -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE TE FROM DDP -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# ------------------------------------------- -class DDPCPUSLURMAccelerator(Accelerator): +class DDPCPUHPCAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): super().__init__(trainer, cluster_environment, ddp_plugin) diff --git a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py deleted file mode 100644 index 016468a4517cf..0000000000000 --- a/pytorch_lightning/accelerators/ddp_cpu_torchelastic_accelerator.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -import os -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.distributed.dist import LightningDistributed -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning.utilities.distributed import sync_ddp_if_available - -try: - from hydra.utils import to_absolute_path, get_original_cwd - from hydra.core.hydra_config import HydraConfig -except ImportError: - HYDRA_AVAILABLE = False -else: - HYDRA_AVAILABLE = True - - -# ------------------------------------------- -# !!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!! -# TEMP CLASS WHILE WE DECOUPLE TE FROM DDP -# !!!!!!!!!!!!!! 
NOTE !!!!!!!!!!!!!!!!!!!!!! -# ------------------------------------------- -class DDPCPUTorchElasticAccelerator(Accelerator): - - def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): - super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self._has_spawned_children = False - self.dist = LightningDistributed() - self.nickname = 'ddp_cpu' - - def setup(self, model): - self.trainer.model = model - self.task_idx = self.cluster_environment.local_rank() - - def train(self): - model = self.trainer.model - self.ddp_train(process_idx=self.task_idx, model=model) - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - - def model_to_device(self, model, process_idx): - model.cpu() - - def get_device_ids(self): - device_ids = None - return device_ids - - def training_step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def validation_step(self, args): - output = self.training_step(args) - return output - - def test_step(self, args): - output = self.training_step(args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - dist.all_reduce(stop, op=dist.reduce_op.SUM) - dist.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # toggle prog bar - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (TORCH_ELASTIC)') - log.info(f'All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/tests/backends/test_accelerator_connector.py b/tests/backends/test_accelerator_connector.py index a0349096cb1f5..7eeada3d5ddd1 100644 --- a/tests/backends/test_accelerator_connector.py +++ b/tests/backends/test_accelerator_connector.py @@ -229,7 +229,7 @@ def test_accelerator_choice_ddp_cpu_te(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUTorchElasticAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, TorchElasticEnvironment) assert trainer.accelerator_backend.task_idx == 10 assert trainer.accelerator_backend.cluster_environment.local_rank() == trainer.accelerator_backend.task_idx @@ -260,7 +260,7 @@ def test_accelerator_choice_ddp_cpu_slurm(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) assert isinstance(trainer.accelerator_backend.cluster_environment, SLURMEnvironment) raise SystemExit() @@ -295,7 +295,7 @@ def master_address(self): class CB(Callback): def on_fit_start(self, trainer, pl_module): assert trainer.use_ddp - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) assert 
isinstance(trainer.accelerator_backend.cluster_environment, CustomCluster) raise SystemExit() @@ -353,7 +353,7 @@ def on_fit_start(self, trainer, pl_module): def test_dist_backend_accelerator_mapping(tmpdir): class CB(Callback): def on_fit_start(self, trainer, pl_module): - assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUSLURMAccelerator) + assert isinstance(trainer.accelerator_backend, accelerators.DDPCPUHPCAccelerator) raise SystemExit() model = BoringModel() From 3ba48d3bc46405bb9cf1bf683adfd8c89589604b Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sun, 8 Nov 2020 16:20:19 -0500 Subject: [PATCH 72/88] ref: unify slurm and TE under backendPlugin 5/n" (#4582) * ref: unify slurm and TE under backendPlugin 4/n * ref: unify slurm and TE under backendPlugin 5/n --- .../accelerators/ddp2_accelerator.py | 14 +- .../accelerators/ddp_cpu_hpc_accelerator.py | 170 +----------------- .../accelerators/ddp_hpc_accelerator.py | 2 +- 3 files changed, 12 insertions(+), 174 deletions(-) diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index db4ccbda01bf0..4341a3b30e2fb 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -11,22 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License - import os import torch import torch.distributed as torch_distrib -from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.core.step_result import Result from pytorch_lightning.distributed.dist import LightningDistributed from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only +from pytorch_lightning.utilities.distributed import rank_zero_only, sync_ddp_if_available from torch.nn.parallel import DistributedDataParallel -from typing import List, Optional +from typing import List, Optional, Union, Any try: from hydra.utils import to_absolute_path, get_original_cwd @@ -203,3 +201,9 @@ def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) return model + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index 4b04c17d505e5..fab87750eb4ed 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -11,21 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License -import os -from typing import Any, List, Optional, Union - -import torch -import torch.distributed as torch_distrib -import torch.distributed as dist -from torch.nn.parallel import DistributedDataParallel - -from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp -from pytorch_lightning.core.lightning import LightningModule -from pytorch_lightning.utilities import AMPType -from pytorch_lightning.utilities.distributed import rank_zero_only -from pytorch_lightning.utilities.distributed import sync_ddp_if_available -from pytorch_lightning.distributed.dist import LightningDistributed +from pytorch_lightning.accelerators.ddp_hpc_accelerator import DDPHPCAccelerator try: @@ -37,167 +23,15 @@ HYDRA_AVAILABLE = True -class DDPCPUHPCAccelerator(Accelerator): +class DDPCPUHPCAccelerator(DDPHPCAccelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): super().__init__(trainer, cluster_environment, ddp_plugin) - self.task_idx = None - self._has_spawned_children = False - self.dist = LightningDistributed() self.nickname = 'ddp_cpu' - def setup(self, model): - self.trainer.model = model - self.task_idx = self.cluster_environment.local_rank() - - def train(self): - model = self.trainer.model - self.ddp_train(process_idx=self.task_idx, model=model) - - def set_world_ranks(self, process_idx): - self.trainer.local_rank = process_idx - self.trainer.global_rank = self.trainer.node_rank * self.trainer.num_processes + process_idx - self.trainer.world_size = self.trainer.num_nodes * self.trainer.num_processes - def model_to_device(self, model, process_idx): model.cpu() def get_device_ids(self): device_ids = None return device_ids - - def training_step(self, args): - if self.trainer.amp_backend == AMPType.NATIVE: - with torch.cuda.amp.autocast(): - output = self.trainer.model(*args) - else: - output = self.trainer.model(*args) - return output - - def validation_step(self, args): - output = self.training_step(args) - return output - - def test_step(self, args): - output = self.training_step(args) - return output - - def barrier(self, name: Optional[str] = None): - if torch_distrib.is_initialized(): - torch_distrib.barrier() - - def early_stopping_should_stop(self, pl_module): - stop = torch.tensor(int(self.trainer.should_stop), device=pl_module.device) - dist.all_reduce(stop, op=dist.reduce_op.SUM) - dist.barrier() - should_stop = stop == self.trainer.world_size - return should_stop - - def broadcast(self, obj, src=0): - return self.dist.broadcast(obj) - - def ddp_train(self, process_idx, model): - """ - Entry point for ddp - - Args: - process_idx: - mp_queue: multiprocessing queue - model: - - Returns: - Dict with evaluation results - - """ - # determine which process we are and world size - self.set_world_ranks(process_idx) - - # toggle prog bar - if (self.trainer.node_rank != 0 or process_idx != 0) and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() - - # set warning rank - rank_zero_only.rank = self.trainer.global_rank - - # set up server using proc 0's ip address - # try to init for 20 times at max in case ports are taken - # where to store ip_table - model.trainer = self.trainer - self.init_ddp_connection( - self.trainer.global_rank, - self.trainer.world_size, - self.trainer.is_slurm_managing_tasks - ) - - # call setup after the ddp process has connected - 
self.trainer.call_setup_hook(model) - - # on world_size=0 let everyone know training is starting - if self.trainer.is_global_zero and not torch.distributed.is_initialized(): - log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (TORCH_ELASTIC)') - log.info(f'All DDP processes registered. Starting ddp with {self.trainer.world_size} processes') - log.info('-' * 100) - - # call sync_bn before .cuda(), configure_apex and configure_ddp - if self.trainer.sync_batchnorm: - model = self.configure_sync_batchnorm(model) - - # move the model to the correct device - self.model_to_device(model, process_idx) - - # CHOOSE OPTIMIZER - # allow for lr schedulers as well - self.setup_optimizers(model) - - # set model properties before going into wrapper - self.trainer.model_connector.copy_trainer_model_properties(model) - - # 16-bit - model = self.trainer.precision_connector.connect(model) - - # device ids change depending on the DDP setup - device_ids = self.get_device_ids() - - # allow user to configure ddp - model = self.configure_ddp(model, device_ids) - - # set up training routine - self.trainer.train_loop.setup_training(model) - - # train or test - results = self.train_or_test() - - # clean up memory - torch.cuda.empty_cache() - - return results - - def configure_ddp( - self, model: LightningModule, device_ids: List[int] - ) -> DistributedDataParallel: - model = self.ddp_plugin.configure_ddp(model, device_ids) - return model - - def configure_sync_batchnorm(self, model: LightningModule) -> LightningModule: - """ - Add global batchnorm for a model spread across multiple GPUs and nodes. - - Override to synchronize batchnorm between specific process groups instead - of the whole world or use a different sync_bn like `apex`'s version. - - Args: - model: pointer to current :class:`LightningModule`. - - Return: - LightningModule with batchnorm layers synchronized between process groups - """ - model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model, process_group=None) - - return model - - def sync_tensor(self, - tensor: Union[torch.Tensor], - group: Optional[Any] = None, - reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: - return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index 0e0822d5747d8..23d7778c1fff7 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -136,7 +136,7 @@ def ddp_train(self, process_idx, model): # on world_size=0 let everyone know training is starting if self.trainer.is_global_zero and not torch.distributed.is_initialized(): log.info('-' * 100) - log.info(f'distributed_backend={self.trainer.distributed_backend} (on SLURM)') + log.info(f'distributed_backend={self.trainer.distributed_backend}') log.info(f'All DDP processes registered. 
Starting ddp with {self.trainer.world_size} processes')
         log.info('-' * 100)

From ee35907170977bf9cba43c2c921520a2cb8a7901 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Sun, 8 Nov 2020 17:24:41 -0500
Subject: [PATCH 73/88] Accelerator docs (#4583)

* accelerator docs

* accelerator docs
---
 docs/source/accelerators.rst                  | 182 ++++++++++++++++++
 docs/source/index.rst                         |   1 +
 pytorch_lightning/accelerators/accelerator.py |   2 +
 .../accelerators/cpu_accelerator.py           |   9 +
 .../accelerators/ddp2_accelerator.py          |   9 +
 .../accelerators/ddp_accelerator.py           |  12 ++
 .../accelerators/ddp_cpu_hpc_accelerator.py   |   9 +
 .../accelerators/ddp_cpu_spawn_accelerator.py |   9 +
 .../accelerators/ddp_hpc_accelerator.py       |   9 +
 .../accelerators/ddp_spawn_accelerator.py     |   9 +
 .../accelerators/dp_accelerator.py            |   9 +
 .../accelerators/gpu_accelerator.py           |   9 +
 .../accelerators/horovod_accelerator.py       |   9 +
 .../accelerators/tpu_accelerator.py           |  13 +-
 14 files changed, 287 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/accelerators.rst

diff --git a/docs/source/accelerators.rst b/docs/source/accelerators.rst
new file mode 100644
index 0000000000000..ee801f2dee28b
--- /dev/null
+++ b/docs/source/accelerators.rst
@@ -0,0 +1,182 @@
+############
+Accelerators
+############
+Accelerators connect a Lightning Trainer to arbitrary accelerators (CPUs, GPUs, TPUs, etc). Accelerators
+also manage distributed training strategies (like DP, DDP, or running on an HPC cluster).
+
+Accelerators can also be configured to run on arbitrary clusters using Plugins or to link up to arbitrary
+computational strategies like 16-bit precision via AMP and Apex.
+
+----------
+
+******************************
+Implement a custom accelerator
+******************************
+To link up arbitrary hardware, implement your own Accelerator subclass:
+
+.. code-block:: python
+
+    from pytorch_lightning.accelerators.accelerator import Accelerator
+
+    class MyAccelerator(Accelerator):
+        def __init__(self, trainer, cluster_environment=None):
+            super().__init__(trainer, cluster_environment)
+            self.nickname = 'my_accelerator'
+
+        def setup(self):
+            # find local rank, etc, custom things to implement
+
+        def train(self):
+            # implement what happens during training
+
+        def training_step(self):
+            # implement how to do a training_step on this accelerator
+
+        def validation_step(self):
+            # implement how to do a validation_step on this accelerator
+
+        def test_step(self):
+            # implement how to do a test_step on this accelerator
+
+        def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs):
+            # implement how to do a backward pass with this accelerator
+
+        def barrier(self, name: Optional[str] = None):
+            # implement this accelerator's barrier
+
+        def broadcast(self, obj, src=0):
+            # implement this accelerator's broadcast function
+
+        def sync_tensor(self,
+                        tensor: Union[torch.Tensor],
+                        group: Optional[Any] = None,
+                        reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor:
+            # implement how to sync tensors when reducing metrics across accelerators
+
+********
+Examples
+********
+The following examples illustrate customizing accelerators.
+
+Example 1: Arbitrary HPC cluster
+================================
+To link any accelerator with an arbitrary cluster (SLURM, Condor, etc), pass in a Cluster Plugin which will be passed
+into any accelerator.
+
+First, implement your own ClusterEnvironment. Here is the torch elastic implementation.
+
+..
code-block:: python
+
+    import os
+    from pytorch_lightning import _logger as log
+    from pytorch_lightning.utilities import rank_zero_warn
+    from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment
+
+    class TorchElasticEnvironment(ClusterEnvironment):
+
+        def __init__(self):
+            super().__init__()
+
+        def master_address(self):
+            if "MASTER_ADDR" not in os.environ:
+                rank_zero_warn(
+                    "MASTER_ADDR environment variable is not defined. Set as localhost"
+                )
+                os.environ["MASTER_ADDR"] = "127.0.0.1"
+            log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
+            master_address = os.environ.get('MASTER_ADDR')
+            return master_address
+
+        def master_port(self):
+            if "MASTER_PORT" not in os.environ:
+                rank_zero_warn(
+                    "MASTER_PORT environment variable is not defined. Set as 12910"
+                )
+                os.environ["MASTER_PORT"] = "12910"
+            log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")
+
+            port = os.environ.get('MASTER_PORT')
+            return port
+
+        def world_size(self):
+            return os.environ.get('WORLD_SIZE')
+
+        def local_rank(self):
+            return int(os.environ['LOCAL_RANK'])
+
+Now, pass it into the trainer, which will use Torch Elastic across your accelerator of choice.
+
+.. code-block:: python
+
+    cluster = TorchElasticEnvironment()
+    accelerator = MyAccelerator()
+    trainer = Trainer(plugins=[cluster], accelerator=accelerator)
+
+In this example, MyAccelerator can define arbitrary hardware (like IPUs or TPUs) and link it to an arbitrary
+compute cluster.
+
+------------
+
+**********************
+Available Accelerators
+**********************
+
+CPU Accelerator
+===============
+
+.. autoclass:: pytorch_lightning.accelerators.cpu_accelerator.CPUAccelerator
+    :noindex:
+
+DDP Accelerator
+===============
+
+.. autoclass:: pytorch_lightning.accelerators.ddp_accelerator.DDPAccelerator
+    :noindex:
+
+DDP2 Accelerator
+================
+
+.. autoclass:: pytorch_lightning.accelerators.ddp2_accelerator.DDP2Accelerator
+    :noindex:
+
+DDP CPU HPC Accelerator
+=======================
+
+.. autoclass:: pytorch_lightning.accelerators.ddp_cpu_hpc_accelerator.DDPCPUHPCAccelerator
+    :noindex:
+
+DDP CPU Spawn Accelerator
+=========================
+
+.. autoclass:: pytorch_lightning.accelerators.ddp_cpu_spawn_accelerator.DDPCPUSpawnAccelerator
+    :noindex:
+
+DDP HPC Accelerator
+===================
+
+.. autoclass:: pytorch_lightning.accelerators.ddp_hpc_accelerator.DDPHPCAccelerator
+    :noindex:
+
+DDP Spawn Accelerator
+=====================
+
+.. autoclass:: pytorch_lightning.accelerators.ddp_spawn_accelerator.DDPSpawnAccelerator
+    :noindex:
+
+GPU Accelerator
+===============
+
+.. autoclass:: pytorch_lightning.accelerators.gpu_accelerator.GPUAccelerator
+    :noindex:
+
+Horovod Accelerator
+===================
+
+.. autoclass:: pytorch_lightning.accelerators.horovod_accelerator.HorovodAccelerator
+    :noindex:
+
+TPU Accelerator
+===============
+
+..
autoclass:: pytorch_lightning.accelerators.tpu_accelerator.TPUAccelerator + :noindex: diff --git a/docs/source/index.rst b/docs/source/index.rst index 6ea7709a8e72f..8f0642c6ad771 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -39,6 +39,7 @@ PyTorch Lightning Documentation :name: docs :caption: Optional extensions + accelerators callbacks datamodules logging diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 408c979d1ff2e..dc0b0bf63a98d 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -221,11 +221,13 @@ def sync_tensor(self, reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: """ Function to reduce a tensor from several distributed processes to one aggregated tensor. + Args: tensor: the tensor to sync and reduce group: the process group to gather results from. Defaults to all processes (world) reduce_op: the reduction operation. Defaults to sum. Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + Return: reduced value """ diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py index 3ce6f7315feb8..083b5193ff8f3 100644 --- a/pytorch_lightning/accelerators/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/cpu_accelerator.py @@ -21,6 +21,15 @@ class CPUAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None): + """ + Runs training on CPU + + Example:: + + # default + trainer = Trainer(accelerator=CPUAccelerator()) + + """ super().__init__(trainer, cluster_environment) self.nickname = None diff --git a/pytorch_lightning/accelerators/ddp2_accelerator.py b/pytorch_lightning/accelerators/ddp2_accelerator.py index 4341a3b30e2fb..2da9747a9be92 100644 --- a/pytorch_lightning/accelerators/ddp2_accelerator.py +++ b/pytorch_lightning/accelerators/ddp2_accelerator.py @@ -38,6 +38,15 @@ class DDP2Accelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP2 strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDP2Accelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.task_idx = None self.dist = LightningDistributed() diff --git a/pytorch_lightning/accelerators/ddp_accelerator.py b/pytorch_lightning/accelerators/ddp_accelerator.py index b127fdd40c934..f99cd1149e5ae 100644 --- a/pytorch_lightning/accelerators/ddp_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_accelerator.py @@ -47,6 +47,15 @@ class DDPAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP strategy on a single machine (manually, not via cluster start) + + Example:: + + # default + trainer = Trainer(accelerator=DDPAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.task_idx = None self._has_spawned_children = False @@ -304,4 +313,7 @@ def sync_tensor(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + """ + + """ return sync_ddp_if_available(tensor, group, reduce_op) diff --git a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py index fab87750eb4ed..7b43dc9f6b68a 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py +++ 
b/pytorch_lightning/accelerators/ddp_cpu_hpc_accelerator.py @@ -26,6 +26,15 @@ class DDPCPUHPCAccelerator(DDPHPCAccelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP (with CPUs) strategy on a cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUHPCAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.nickname = 'ddp_cpu' diff --git a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py index 64e326b7ee0fc..221ed5769c35e 100644 --- a/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_cpu_spawn_accelerator.py @@ -40,6 +40,15 @@ class DDPCPUSpawnAccelerator(Accelerator): def __init__(self, trainer, nprocs, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP (on a single machine or manually on multiple machines), using mp.spawn + + Example:: + + # default + trainer = Trainer(accelerator=DDPCPUSpawnAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.mp_queue = None self.nprocs = nprocs diff --git a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py index 23d7778c1fff7..b6d813f978943 100644 --- a/pytorch_lightning/accelerators/ddp_hpc_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_hpc_accelerator.py @@ -39,6 +39,15 @@ class DDPHPCAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP on an HPC cluster + + Example:: + + # default + trainer = Trainer(accelerator=DDPHPCAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.task_idx = None self._has_spawned_children = False diff --git a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py index 2e0bac46c4c20..a30d266ec1b2f 100644 --- a/pytorch_lightning/accelerators/ddp_spawn_accelerator.py +++ b/pytorch_lightning/accelerators/ddp_spawn_accelerator.py @@ -43,6 +43,15 @@ class DDPSpawnAccelerator(Accelerator): def __init__(self, trainer, nprocs, cluster_environment=None, ddp_plugin=None): + """ + Runs training using DDP using mp.spawn via manual launch (not cluster launch) + + Example:: + + # default + trainer = Trainer(accelerator=DDPSpawnAccelerator()) + + """ super().__init__(trainer, cluster_environment, ddp_plugin) self.mp_queue = None self.nprocs = nprocs diff --git a/pytorch_lightning/accelerators/dp_accelerator.py b/pytorch_lightning/accelerators/dp_accelerator.py index 0a6eac607d79c..2f6c5dce97c46 100644 --- a/pytorch_lightning/accelerators/dp_accelerator.py +++ b/pytorch_lightning/accelerators/dp_accelerator.py @@ -26,6 +26,15 @@ class DataParallelAccelerator(Accelerator): def __init__(self, trainer, cluster_environment=None): + """ + Runs training using DP via manual start (not HPC cluster) + + Example:: + + # default + trainer = Trainer(accelerator=DataParallelAccelerator()) + + """ super().__init__(trainer, cluster_environment) self.model_autocast_original_forward = None self.dist = LightningDistributed() diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index e5611767547a1..e66f5bcb8b48c 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -23,6 +23,15 @@ class 
GPUAccelerator(Accelerator):
     amp_backend: AMPType

     def __init__(self, trainer, cluster_environment=None):
+        """
+        Runs training using a single GPU
+
+        Example::
+
+            # default
+            trainer = Trainer(accelerator=GPUAccelerator())
+
+        """
         super().__init__(trainer, cluster_environment)
         self.dist = LightningDistributed()
         self.nickname = None
diff --git a/pytorch_lightning/accelerators/horovod_accelerator.py b/pytorch_lightning/accelerators/horovod_accelerator.py
index e5314a983f9db..3d9191914566d 100644
--- a/pytorch_lightning/accelerators/horovod_accelerator.py
+++ b/pytorch_lightning/accelerators/horovod_accelerator.py
@@ -33,6 +33,15 @@ class HorovodAccelerator(Accelerator):
     amp_backend: AMPType

     def __init__(self, trainer, cluster_environment=None):
+        """
+        Runs training using horovod
+
+        Example::
+
+            # default
+            trainer = Trainer(accelerator=HorovodAccelerator())
+
+        """
         super().__init__(trainer, cluster_environment)
         self.nickname = 'horovod'
diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py
index b60cd5a9dfc9c..5f4e6cc22cacd 100644
--- a/pytorch_lightning/accelerators/tpu_accelerator.py
+++ b/pytorch_lightning/accelerators/tpu_accelerator.py
@@ -39,6 +39,15 @@ class TPUAccelerator(Accelerator):

     def __init__(self, trainer, cluster_environment=None):
+        """
+        Runs training using TPUs (colab, single machine or pod)
+
+        Example::
+
+            # default
+            trainer = Trainer(accelerator=TPUAccelerator())
+
+        """
         super().__init__(trainer, cluster_environment)
         self.start_method = None
         self.mp_queue = None
@@ -270,8 +279,6 @@ def early_stopping_should_stop(self, pl_module):
     def save_spawn_weights(self, model):
         """
         Dump a temporary checkpoint after ddp ends to get weights out of the process
-        :param model:
-        :return:
         """
         if self.trainer.is_global_zero:
             path = os.path.join(self.trainer.default_root_dir, '__temp_weight_distributed_end.ckpt')
@@ -282,8 +289,6 @@ def load_spawn_weights(self, original_model):
         """
         Load the temp weights saved in the process
         To recover the trained model from the ddp process we load the saved weights
-        :param model:
-        :return:
         """

         loaded_model = original_model

From 01a925d3331042201b4e0a6c2a0bcb4e154686aa Mon Sep 17 00:00:00 2001
From: Nicki Skafte
Date: Mon, 9 Nov 2020 11:30:28 +0100
Subject: [PATCH 74/88] [Docs] Note on running metric in dp (#4494)

* note

* Update docs/source/metrics.rst

Co-authored-by: chaton
Co-authored-by: Sean Naren
Co-authored-by: Jeff Yang
---
 docs/source/metrics.rst | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
index 4fadfaa507168..e41fdfa7d1c74 100644
--- a/docs/source/metrics.rst
+++ b/docs/source/metrics.rst
@@ -78,6 +78,26 @@ If ``on_epoch`` is True, the logger automatically logs the end of epoch metric v
         self.valid_acc(logits, y)
         self.log('valid_acc', self.valid_acc, on_step=True, on_epoch=True)

+.. note::
+    If using metrics in data parallel mode (dp), the metric update/logging should be done
+    in the ``<mode>_step_end`` method (where ``<mode>`` is either ``training``, ``validation``
+    or ``test``). This is because metric states would otherwise be destroyed after each forward pass,
+    leading to wrong accumulation. In practice, do the following:
+
+    .. code-block:: python
+
+        def training_step(self, batch, batch_idx):
+            data, target = batch
+            preds = self(data)
+            ...
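+            # note: the metric is not updated here, because in dp this forward
+            # runs on a per-GPU replica whose metric state is discarded after
+            # the pass; the update happens in training_step_end on the gathered outputs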
+ return {'loss' : loss, 'preds' : preds, 'target' : target} + + def training_step_end(self, outputs): + #update and log + self.metric(outputs['preds'], outputs['target']) + self.log('metric', self.metric) + + This metrics API is independent of PyTorch Lightning. Metrics can directly be used in PyTorch as shown in the example: .. code-block:: python From e01190e919e665c235338ae883b6d16f8a144d6b Mon Sep 17 00:00:00 2001 From: Jan Beitner Date: Mon, 9 Nov 2020 11:33:44 +0000 Subject: [PATCH 75/88] Adding pytorch-forecasting to community examples (#4575) PyTorch Forecasting is a new library that is designed for time series forecasting practitioners and researchers alike. It is based on the awesome work on PyTorch Lightning. Thanks a lot for creating such an asset! Have a look at the documentation for more information. Co-authored-by: chaton Co-authored-by: Jeff Yang --- docs/source/community_examples.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/community_examples.rst b/docs/source/community_examples.rst index 470c8e2dd8c9f..a89b2599b1a99 100644 --- a/docs/source/community_examples.rst +++ b/docs/source/community_examples.rst @@ -16,4 +16,5 @@ Community Examples - `VAE Library of over 18+ VAE flavors `_. - `Transformers Question Answering (SQuAD) `_. - `Atlas: End-to-End 3D Scene Reconstruction from Posed Images `_. -- `Self-Supervised Representation Learning (MoCo and BYOL) `_. \ No newline at end of file +- `Self-Supervised Representation Learning (MoCo and BYOL) `_. +- `pytorch-forecasting: Time series forecasting package `_. From 45a695969abb87feec70a60776c41ebffe470708 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Mon, 9 Nov 2020 22:22:47 +0900 Subject: [PATCH 76/88] Fix docstring (#4585) Co-authored-by: Jeff Yang --- pytorch_lightning/core/lightning.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 05eb8ee86be63..3d38f65892983 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -1399,7 +1399,6 @@ def _verify_is_manual_optimization(self, fn_name): @classmethod def _auto_collect_arguments(cls, frame=None) -> Tuple[Dict, Dict]: - """""" """ Collect all module arguments in the current constructor and all child constructors. The child constructors are all the ``__init__`` methods that reach the current class through From 4a6721af257fc398de4a40c1e8534596d4c35319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stef=20=7C=20=E3=82=B9=E3=83=86=E3=83=95?= Date: Mon, 9 Nov 2020 23:01:13 +0900 Subject: [PATCH 77/88] Missing TorchScript trace's update (#4586) Co-authored-by: stef-ubuntu Co-authored-by: Jeff Yang Co-authored-by: Nicki Skafte --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f4defbd5cc30..4495a8f961232 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) - Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) - Fixed TorchScript export when module includes Metrics ([#4428](https://github.com/PyTorchLightning/pytorch-lightning/pull/4428)) +- Fixed TorchScript trace method's data to device and docstring ([#4360](https://github.com/PyTorchLightning/pytorch-lightning/pull/4360)) - Fixed CSV logger warning ([#4419](https://github.com/PyTorchLightning/pytorch-lightning/pull/4419)) - Fixed skip DDP parameter sync ([#4301](https://github.com/PyTorchLightning/pytorch-lightning/pull/4301)) From 23719e3c05a3fe88da294d9cfb8bb246d4ad7984 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Mon, 9 Nov 2020 21:18:24 +0630 Subject: [PATCH 78/88] [dockers] install nvidia-dali-cudaXXX (#4532) * [dockers] install nvidia-dali-cuda100 * Apply suggestions from code review * build DALI * build DALI * build DALI * dali from source * dali from source * use binaries * qq Co-authored-by: Jirka Borovec Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- dockers/README.md | 6 +++--- dockers/base-conda/Dockerfile | 6 ++++-- dockers/base-cuda/Dockerfile | 7 +++++-- dockers/base-xla/Dockerfile | 2 +- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dockers/README.md b/dockers/README.md index 73c40635eb0a5..aab82a171641a 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -14,10 +14,10 @@ or with specific arguments ```bash git clone docker image build \ - -t pytorch-lightning:py3.8 \ - -f dockers/conda/Dockerfile \ + -t pytorch-lightning:py3.8-pt1.6 \ + -f dockers/base-cuda/Dockerfile \ --build-arg PYTHON_VERSION=3.8 \ - --build-arg PYTORCH_VERSION=1.4 \ + --build-arg PYTORCH_VERSION=1.6 \ . 
``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index d11e61d92edbd..ea8c6bc5d001d 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -35,7 +35,8 @@ SHELL ["/bin/bash", "-c"] ENV PATH="$PATH:/root/.local/bin" -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update -qq && \ + apt-get install -y --no-install-recommends \ build-essential \ cmake \ git \ @@ -104,6 +105,7 @@ RUN \ # Install remaining requirements pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \ pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \ + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \ rm requirements* RUN \ @@ -118,4 +120,4 @@ RUN \ conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index e22b5a862a7d7..f886ccc30be7a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -28,6 +28,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 ARG PYTHON_VERSION=3.7 ARG PYTORCH_VERSION=1.6 +ARG CMAKE_VERSION=3.18.4 SHELL ["/bin/bash", "-c"] # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ @@ -37,7 +38,7 @@ ENV TZ=Europe/Prague ENV PATH="$PATH:/root/.local/bin" ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" -RUN apt-get update && \ +RUN apt-get update -qq && \ apt-get install -y --no-install-recommends \ build-essential \ pkg-config \ @@ -93,6 +94,7 @@ RUN \ # Install all requirements pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \ + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \ rm -rf requirements* RUN \ @@ -105,5 +107,6 @@ RUN \ # Show what we have pip --version && \ pip list && \ + python -c 'from nvidia.dali.pipeline import Pipeline' && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 3eaabade428e6..8eb093295c37b 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -31,7 +31,7 @@ ENV CONDA_ENV=lightning # show system inforation RUN lsb_release -a && cat /etc/*-release -RUN apt-get update && \ +RUN apt-get update -qq && \ apt-get install -y --no-install-recommends \ build-essential \ cmake \ From 41c9bee4f0fab6701ea3b3e9bdf8c7ea216bb460 Mon Sep 17 00:00:00 2001 From: tarepan Date: Tue, 10 Nov 2020 02:26:38 +0900 Subject: [PATCH 79/88] Fix load disparity between normal and hpc (#4526) * Add missing load functionality in hpc * Add general file load for hpc * Add mark in CHANGELOG * Fix Typo Li**hg**tning Co-authored-by: Rohit Gupta * Refactor line separation Co-authored-by: Rohit Gupta * Fix entangled fixation commit * Fix naming of restore_model_states 
* Fix amp restore place Co-authored-by: Rohit Gupta Co-authored-by: chaton --- CHANGELOG.md | 2 +- .../connectors/checkpoint_connector.py | 44 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4495a8f961232..da64a70c3231c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,7 +47,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - +- Fixed feature-lack in hpc load ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526)) ## [1.0.5] - 2020-11-03 diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 3bf76e4c30630..3b44ce96c02ad 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -103,6 +103,20 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # load model state model = self.trainer.get_model() + # restore model and datamodule state + self.restore_model_state(model, checkpoint) + + if on_gpu: + model.cuda(self.trainer.root_gpu) + + # restore training state + self.restore_training_state(checkpoint) + + def restore_model_state(self, model: LightningModule, checkpoint) -> None: + """ + Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object + """ + # give the datamodule a chance to load something if self.trainer.datamodule is not None: self.trainer.datamodule.on_load_checkpoint(checkpoint) @@ -113,18 +127,6 @@ def restore(self, checkpoint_path: str, on_gpu: bool): # restore the state_dict on the model model.load_state_dict(checkpoint['state_dict']) - if on_gpu: - model.cuda(self.trainer.root_gpu) - - # restore amp scaling - if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: - self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) - elif self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: - amp.load_state_dict(checkpoint['amp_scaling_state']) - - # load training state (affects trainer only) - self.restore_training_state(checkpoint) - def restore_training_state(self, checkpoint): """ Restore trainer state. @@ -147,6 +149,12 @@ def restore_training_state(self, checkpoint): " where `model.ckpt` is your checkpoint file." 
) + # restore amp scaling + if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: + self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) + elif self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: + amp.load_state_dict(checkpoint['amp_scaling_state']) + # restore callback states self.trainer.on_load_checkpoint(checkpoint) @@ -336,19 +344,13 @@ def hpc_load(self, folderpath, on_gpu): filepath = '{}/hpc_ckpt_{}.ckpt'.format(folderpath, self.max_ckpt_in_folder(folderpath)) # load on CPU first - checkpoint = torch.load(filepath, map_location=lambda storage, loc: storage) + checkpoint = pl_load(filepath, map_location=lambda storage, loc: storage) # load model state model = self.trainer.get_model() - # load the state_dict on the model automatically - model.load_state_dict(checkpoint['state_dict']) - - # restore amp scaling - if self.trainer.amp_backend == AMPType.NATIVE and 'native_amp_scaling_state' in checkpoint: - self.trainer.scaler.load_state_dict(checkpoint['native_amp_scaling_state']) - elif self.trainer.amp_backend == AMPType.APEX and 'amp_scaling_state' in checkpoint: - amp.load_state_dict(checkpoint['amp_scaling_state']) + # restore states from 'PyTorch-Lightning checkpoint' dictionary object + self.restore_model_state(model, checkpoint) if self.trainer.root_gpu is not None: model.cuda(self.trainer.root_gpu) From 4f3160ba2ebc3a4de230e092e98bbbbf9651db21 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 10 Nov 2020 00:34:42 +0100 Subject: [PATCH 80/88] Skip tuner algorithms on fast dev (#3903) * skip on fast dev * fix error * changelog * fix recursive issue * combine tests * pep8 * move logic to base funcs * fix mistake * Update pytorch_lightning/tuner/lr_finder.py Co-authored-by: Rohit Gupta * pep Co-authored-by: William Falcon Co-authored-by: Nicki Skafte Co-authored-by: Rohit Gupta Co-authored-by: chaton --- CHANGELOG.md | 4 ++-- pytorch_lightning/tuner/batch_size_scaling.py | 4 ++++ pytorch_lightning/tuner/lr_finder.py | 9 ++++++++ tests/trainer/flags/test_fast_dev_run.py | 21 +++++++++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 tests/trainer/flags/test_fast_dev_run.py diff --git a/CHANGELOG.md b/CHANGELOG.md index da64a70c3231c..e2aeb69ee3108 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -30,12 +30,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) -- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) +- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) ### Changed - +- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903)) ### Deprecated diff --git a/pytorch_lightning/tuner/batch_size_scaling.py b/pytorch_lightning/tuner/batch_size_scaling.py index b468209171393..67a4704b628fc 100644 --- a/pytorch_lightning/tuner/batch_size_scaling.py +++ b/pytorch_lightning/tuner/batch_size_scaling.py @@ -69,6 +69,10 @@ def scale_batch_size(trainer, **fit_kwargs: remaining arguments to be passed to .fit(), e.g., dataloader or datamodule. 
""" + if trainer.fast_dev_run: + rank_zero_warn('Skipping batch size scaler since `fast_dev_run=True`', UserWarning) + return + if not lightning_hasattr(model, batch_arg_name): raise MisconfigurationException( f'Field {batch_arg_name} not found in both `model` and `model.hparams`') diff --git a/pytorch_lightning/tuner/lr_finder.py b/pytorch_lightning/tuner/lr_finder.py index 54e7e082bc0a8..b6d8c8178093b 100644 --- a/pytorch_lightning/tuner/lr_finder.py +++ b/pytorch_lightning/tuner/lr_finder.py @@ -29,6 +29,7 @@ from pytorch_lightning.loggers.base import DummyLogger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import lightning_hasattr, lightning_setattr +from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.cloud_io import get_filesystem # check if ipywidgets is installed before importing tqdm.auto @@ -42,6 +43,10 @@ def _run_lr_finder_internally(trainer, model: LightningModule): """ Call lr finder internally during Trainer.fit() """ lr_finder = lr_find(trainer, model) + + if lr_finder is None: + return + lr = lr_finder.suggestion() # TODO: log lr.results to self.logger @@ -131,6 +136,10 @@ def lr_find( trainer.fit(model) """ + if trainer.fast_dev_run: + rank_zero_warn('Skipping learning rate finder since `fast_dev_run=True`', UserWarning) + return + save_path = os.path.join(trainer.default_root_dir, 'lr_find_temp_model.ckpt') __lr_finder_dump_params(trainer, model) diff --git a/tests/trainer/flags/test_fast_dev_run.py b/tests/trainer/flags/test_fast_dev_run.py new file mode 100644 index 0000000000000..cbe4d4012227a --- /dev/null +++ b/tests/trainer/flags/test_fast_dev_run.py @@ -0,0 +1,21 @@ +import pytest +from pytorch_lightning import Trainer +from tests.base import EvalModelTemplate + + +@pytest.mark.parametrize('tuner_alg', ['batch size scaler', 'learning rate finder']) +def test_skip_on_fast_dev_run_batch_scaler(tmpdir, tuner_alg): + """ Test that tuner algorithms are skipped if fast dev run is enabled """ + + hparams = EvalModelTemplate.get_default_hparams() + model = EvalModelTemplate(**hparams) + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + auto_scale_batch_size=True if tuner_alg == 'batch size scaler' else False, + auto_lr_find=True if tuner_alg == 'learning rate finder' else False, + fast_dev_run=True + ) + expected_message = f'Skipping {tuner_alg} since `fast_dev_run=True`' + with pytest.warns(UserWarning, match=expected_message): + trainer.tune(model) From 465ec752f80057e4691b4662ff12e1fb85224c71 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 10 Nov 2020 09:16:31 +0100 Subject: [PATCH 81/88] Metric ddp bugfix (#4482) * changes * fix spelling * small note * trying to fix ddp test * fix ddp * fix for test * suggestion * CHANGELOG * Update pytorch_lightning/metrics/metric.py Co-authored-by: chaton Co-authored-by: Sean Naren Co-authored-by: Sean Naren --- CHANGELOG.md | 7 ++++ docs/source/metrics.rst | 6 ++++ pytorch_lightning/metrics/metric.py | 46 +++++++++++++++++++++----- tests/metrics/test_metric_lightning.py | 19 +++++++---- 4 files changed, 63 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e2aeb69ee3108..7e49a59c79f94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775))
 
 
+- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
+
+
 ### Changed
 
 - Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903))
@@ -49,6 +52,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed feature-lack in hpc load ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526))
 
+
+- Fixed metrics states being overridden in ddp mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
+
+
 ## [1.0.5] - 2020-11-03
 
 ### Added
diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
index e41fdfa7d1c74..d80b35f91abd1 100644
--- a/docs/source/metrics.rst
+++ b/docs/source/metrics.rst
@@ -131,6 +131,12 @@ This metrics API is independent of PyTorch Lightning. Metrics can directly be us
 
 It is highly recommended to re-initialize the metric per mode as shown in the examples above.
 
+.. note::
+
+    Metric states are by default added to the model's ``state_dict``.
+    To change this after the metric has been initialized, the method ``.persistent(mode)`` can
+    be used to enable (``mode=True``) or disable (``mode=False``) this behaviour.
+
 *********************
 Implementing a Metric
 *********************
diff --git a/pytorch_lightning/metrics/metric.py b/pytorch_lightning/metrics/metric.py
index 1a568bab37209..9fa479dfb567a 100644
--- a/pytorch_lightning/metrics/metric.py
+++ b/pytorch_lightning/metrics/metric.py
@@ -81,8 +81,9 @@ def __init__(
         self._forward_cache = None
 
         # initialize state
-        self._reductions = {}
         self._defaults = {}
+        self._persistent = {}
+        self._reductions = {}
 
     def add_state(
         self, name: str, default, dist_reduce_fx: Optional[Union[str, Callable]] = None, persistent: bool = True
@@ -138,16 +139,10 @@ def add_state(
                 "`dist_reduce_fx` must be callable or one of ['mean', 'sum', 'cat', None]"
             )
 
-        if isinstance(default, torch.Tensor):
-            if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
-                # persistent keyword is only supported in torch >= 1.6.0
-                self.register_buffer(name, default, persistent=persistent)
-            else:
-                self.register_buffer(name, default)
-        else:
-            setattr(self, name, default)
+        setattr(self, name, default)
 
         self._defaults[name] = deepcopy(default)
+        self._persistent[name] = persistent
         self._reductions[name] = dist_reduce_fx
 
     @torch.jit.unused
@@ -265,3 +260,36 @@ def __setstate__(self, state):
         self.__dict__.update(state)
         self.update = self._wrap_update(self.update)
         self.compute = self._wrap_compute(self.compute)
+
+    def _apply(self, fn):
+        """ Overwrite _apply function such that we can also move metric states
+            to the correct device when `.to`, `.cuda`, etc. methods are called
+        """
+        self = super()._apply(fn)
+        # Also apply fn to metric states
+        for key in self._defaults.keys():
+            current_val = getattr(self, key)
+            if isinstance(current_val, torch.Tensor):
+                setattr(self, key, fn(current_val))
+            elif isinstance(current_val, Sequence):
+                setattr(self, key, [fn(cur_v) for cur_v in current_val])
+            else:
+                raise TypeError('Expected metric state to be either a torch.Tensor '
+                                f'or a list of torch.Tensor, but encountered {current_val}')
+        return self
+
+    def persistent(self, mode: bool = True):
+        """ Method for post-init to change whether metric states
should be saved to + its state_dict + """ + for key in self._persistent.keys(): + self._persistent[key] = mode + + def state_dict(self, *args, **kwargs): + # Register metric states to be part of the state_dict + state_dict = super().state_dict() + for key in self._defaults.keys(): + if self._persistent[key]: + current_val = getattr(self, key) + state_dict.update({key: current_val}) + return state_dict diff --git a/tests/metrics/test_metric_lightning.py b/tests/metrics/test_metric_lightning.py index 3c6938734be10..a35562327d717 100644 --- a/tests/metrics/test_metric_lightning.py +++ b/tests/metrics/test_metric_lightning.py @@ -1,9 +1,11 @@ -import os +import pytest import torch + from pytorch_lightning import Trainer from pytorch_lightning.metrics import Metric from tests.base.boring_model import BoringModel +import tests.base.develop_utils as tutils class SumMetric(Metric): @@ -54,15 +56,19 @@ def test_metric_lightning_log(tmpdir): class TestModel(BoringModel): def __init__(self): super().__init__() - self.metric = SumMetric() + self.metric_step = SumMetric() + self.metric_epoch = SumMetric() self.sum = 0.0 def training_step(self, batch, batch_idx): x = batch - self.metric(x.sum()) + self.metric_step(x.sum()) self.sum += x.sum() - self.log("sum", self.metric, on_epoch=True, on_step=False) - return self.step(x) + self.log("sum_step", self.metric_step, on_epoch=True, on_step=False) + return {'loss': self.step(x), 'data': x} + + def training_epoch_end(self, outs): + self.log("sum_epoch", self.metric_epoch(torch.stack([o['data'] for o in outs]).sum())) model = TestModel() model.val_dataloader = None @@ -78,7 +84,8 @@ def training_step(self, batch, batch_idx): trainer.fit(model) logged = trainer.logged_metrics - assert torch.allclose(torch.tensor(logged["sum"]), model.sum) + assert torch.allclose(torch.tensor(logged["sum_step"]), model.sum) + assert torch.allclose(torch.tensor(logged["sum_epoch"]), model.sum) def test_scriptable(tmpdir): From 30ad3e2ad39f56813ab21ad0175df9f3e77e9502 Mon Sep 17 00:00:00 2001 From: Kai Zhang <1006921+kazhang@users.noreply.github.com> Date: Tue, 10 Nov 2020 01:44:43 -0800 Subject: [PATCH 82/88] Replace a MisconfigurationException with warning in ModelCheckpoint callback (#4560) * replace MisconfigurationException with warning * update test * check raising UserWarning Co-authored-by: chaton Co-authored-by: Nicki Skafte --- pytorch_lightning/callbacks/model_checkpoint.py | 6 +++--- tests/checkpointing/test_model_checkpoint.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py index 0d8d4cec81518..d257e1ea7cc0d 100644 --- a/pytorch_lightning/callbacks/model_checkpoint.py +++ b/pytorch_lightning/callbacks/model_checkpoint.py @@ -246,9 +246,9 @@ def __validate_init_configuration(self): ' configuration. No quantity for top_k to track.' ) if self.save_last: - raise MisconfigurationException( - 'ModelCheckpoint(save_last=True, monitor=None) is not a valid configuration.' - ' You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None)' + rank_zero_warn( + 'ModelCheckpoint(save_last=True, monitor=None) is a redundant configuration.' + ' You can save the last checkpoint with ModelCheckpoint(save_top_k=None, monitor=None).' 
) def __init_ckpt_dir(self, filepath, dirpath, filename, save_top_k): diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 0e5eb0997b57d..d9e99d463b57d 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -294,8 +294,8 @@ def test_none_monitor_top_k(tmpdir): def test_none_monitor_save_last(tmpdir): """ Test that a warning appears for save_last=True with monitor=None. """ - with pytest.raises( - MisconfigurationException, match=r'ModelCheckpoint\(save_last=True, monitor=None\) is not a valid.*' + with pytest.warns( + UserWarning, match=r'ModelCheckpoint\(save_last=True, monitor=None\) is a redundant.*' ): ModelCheckpoint(dirpath=tmpdir, save_last=True) # These should not fail From 11415faade196fb75b6eefe8798d391190176262 Mon Sep 17 00:00:00 2001 From: Roger Shieh Date: Tue, 10 Nov 2020 18:46:37 +0800 Subject: [PATCH 83/88] [req] Set min version for skimage for tests (#4598) Co-authored-by: Nicki Skafte --- requirements/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/test.txt b/requirements/test.txt index d98048568fa75..0ceb532ac2266 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -7,7 +7,7 @@ flake8>=3.6 flake8-black check-manifest twine==1.13.0 -scikit-image +scikit-image>=0.17.2 black>=20.8b1 pre-commit>=1.0 From 470e2945fc0fefa1b6d42f1d64fef9f22f5c75b3 Mon Sep 17 00:00:00 2001 From: Diedre Carmo Date: Tue, 10 Nov 2020 08:50:25 -0300 Subject: [PATCH 84/88] fix logged keys in mlflow logger (#4412) * [#4411] fix gpu_log_memory with mlflow logger * sanitize parenthesis instead of removing for all loggers * apply regex for mlflow key sanitization * replace ',' with '.' typo * add single warning and test Co-authored-by: Rohit Gupta Co-authored-by: chaton --- pytorch_lightning/loggers/mlflow.py | 9 +++++++++ tests/loggers/test_mlflow.py | 3 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/loggers/mlflow.py b/pytorch_lightning/loggers/mlflow.py index de915785dcb45..ee9f8f86cf247 100644 --- a/pytorch_lightning/loggers/mlflow.py +++ b/pytorch_lightning/loggers/mlflow.py @@ -16,6 +16,8 @@ MLflow ------ """ +import re +import warnings from argparse import Namespace from time import time from typing import Any, Dict, Optional, Union @@ -151,6 +153,13 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> if isinstance(v, str): log.warning(f'Discarding metric with string value {k}={v}.') continue + + new_k = re.sub("[^a-zA-Z0-9_/. -]+", "", k) + if k != new_k: + warnings.warn(("MLFlow only allows '_', '/', '.' 
and ' ' special characters in metric name.\n" f"Replacing {k} with {new_k}."))
+            k = new_k
+
             self.experiment.log_metric(self.run_id, k, v, timestamp_ms, step)
 
     @rank_zero_only
diff --git a/tests/loggers/test_mlflow.py b/tests/loggers/test_mlflow.py
index a200fbf549e6a..b220074d41816 100644
--- a/tests/loggers/test_mlflow.py
+++ b/tests/loggers/test_mlflow.py
@@ -137,7 +137,8 @@ def test_mlflow_logger_dirs_creation(tmpdir):
     assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
 
     model = EvalModelTemplate()
-    trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3)
+    trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=1, limit_val_batches=3,
+                      log_gpu_memory=True)
     trainer.fit(model)
     assert set(os.listdir(tmpdir / exp_id)) == {run_id, 'meta.yaml'}
     assert 'epoch' in os.listdir(tmpdir / exp_id / run_id / 'metrics')
From 343d19fa8635095a2ca982c284d61cdc404d47c5 Mon Sep 17 00:00:00 2001
From: maxjeblick
Date: Tue, 10 Nov 2020 13:01:20 +0000
Subject: [PATCH 85/88] Find parameters which are specified in the
 LightningDataModule, only (#4347)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* search for attribute in datamodule if not found elsewhere

* add test for datamodule

* add lightning_getattr test for datamodule

* Apply suggestions from code review

Co-authored-by: Adrian Wälchli

* Update CHANGELOG.md

* Update CHANGELOG.md

Co-authored-by: Jirka Borovec
Co-authored-by: Nicki Skafte
Co-authored-by: Adrian Wälchli
Co-authored-by: Rohit Gupta
---
 CHANGELOG.md                           |  3 +++
 pytorch_lightning/utilities/parsing.py | 23 ++++++++++------------
 tests/utilities/parsing.py             | 27 ++++++++++++++++++++++++--
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e49a59c79f94..bb286e82759c7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -56,6 +56,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed metrics states being overridden in ddp mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
 
 
+- Fixed `lightning_getattr`, `lightning_hasattr` not finding the correct attributes in datamodule ([#4347](https://github.com/PyTorchLightning/pytorch-lightning/pull/4347))
+
+
 ## [1.0.5] - 2020-11-03
 
 ### Added
diff --git a/pytorch_lightning/utilities/parsing.py b/pytorch_lightning/utilities/parsing.py
index c8230205752d4..348eec110c3a1 100644
--- a/pytorch_lightning/utilities/parsing.py
+++ b/pytorch_lightning/utilities/parsing.py
@@ -177,8 +177,9 @@ def __repr__(self):
 def lightning_hasattr(model, attribute):
     """ Special hasattr for lightning. Checks for attribute in model namespace,
         the old hparams namespace/dict, and the datamodule. """
-    trainer = model.trainer
+    trainer = getattr(model, 'trainer', None)
 
+    attr = False
     # Check if attribute in model
     if hasattr(model, attribute):
         attr = True
@@ -189,10 +190,8 @@ def lightning_hasattr(model, attribute):
         else:
             attr = hasattr(model.hparams, attribute)
     # Check if the attribute in datamodule (datamodule gets registered in Trainer)
-    elif trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute):
-        attr = getattr(trainer.datamodule, attribute)
-    else:
-        attr = False
+    if not attr and trainer is not None:
+        attr = hasattr(trainer.datamodule, attribute)
 
     return attr
 
 
@@ -200,18 +199,16 @@ def lightning_getattr(model, attribute):
     """ Special getattr for lightning.
Checks for attribute in model namespace, the old hparams namespace/dict, and the datamodule. """ - trainer = model.trainer + trainer = getattr(model, 'trainer', None) # Check if attribute in model if hasattr(model, attribute): attr = getattr(model, attribute) # Check if attribute in model.hparams, either namespace or dict - elif hasattr(model, 'hparams'): - if isinstance(model.hparams, dict): - attr = model.hparams[attribute] - else: - attr = getattr(model.hparams, attribute) - + elif hasattr(model, 'hparams') and isinstance(model.hparams, dict) and attribute in model.hparams: + attr = model.hparams[attribute] + elif hasattr(model, 'hparams') and hasattr(model.hparams, attribute): + attr = getattr(model.hparams, attribute) # Check if the attribute in datamodule (datamodule gets registered in Trainer) elif trainer is not None and trainer.datamodule is not None and hasattr(trainer.datamodule, attribute): attr = getattr(trainer.datamodule, attribute) @@ -230,7 +227,7 @@ def lightning_setattr(model, attribute, value): raise ValueError(f'{attribute} is neither stored in the model namespace' ' nor the `hparams` namespace/dict, nor the datamodule.') - trainer = model.trainer + trainer = getattr(model, 'trainer', None) # Check if attribute in model if hasattr(model, attribute): diff --git a/tests/utilities/parsing.py b/tests/utilities/parsing.py index 13cfeaa64b01a..056590f1a6d35 100644 --- a/tests/utilities/parsing.py +++ b/tests/utilities/parsing.py @@ -24,6 +24,7 @@ class TestHparamsNamespace: class TestModel1: # test for namespace learning_rate = 0 + model1 = TestModel1() class TestModel2: # test for hparams namespace @@ -41,12 +42,23 @@ class TestModel4: # fail case model4 = TestModel4() - return model1, model2, model3, model4 + class DataModule: + batch_size = 8 + + class Trainer: + datamodule = DataModule + + class TestModel5: # test for datamodule + trainer = Trainer + + model5 = TestModel5() + + return model1, model2, model3, model4, model5 def test_lightning_hasattr(tmpdir): """ Test that the lightning_hasattr works in all cases""" - model1, model2, model3, model4 = _get_test_cases() + model1, model2, model3, model4, model5 = _get_test_cases() assert lightning_hasattr(model1, 'learning_rate'), \ 'lightning_hasattr failed to find namespace variable' assert lightning_hasattr(model2, 'learning_rate'), \ @@ -55,6 +67,8 @@ def test_lightning_hasattr(tmpdir): 'lightning_hasattr failed to find hparams dict variable' assert not lightning_hasattr(model4, 'learning_rate'), \ 'lightning_hasattr found variable when it should not' + assert lightning_hasattr(model5, 'batch_size'), \ + 'lightning_hasattr failed to find batch_size in datamodule' def test_lightning_getattr(tmpdir): @@ -64,6 +78,10 @@ def test_lightning_getattr(tmpdir): value = lightning_getattr(m, 'learning_rate') assert value == i, 'attribute not correctly extracted' + model5 = models[4] + assert lightning_getattr(model5, 'batch_size') == 8, \ + 'batch_size not correctly extracted' + def test_lightning_setattr(tmpdir): """ Test that the lightning_setattr works in all cases""" @@ -72,3 +90,8 @@ def test_lightning_setattr(tmpdir): lightning_setattr(m, 'learning_rate', 10) assert lightning_getattr(m, 'learning_rate') == 10, \ 'attribute not correctly set' + + model5 = models[4] + lightning_setattr(model5, 'batch_size', 128) + assert lightning_getattr(model5, 'batch_size') == 128, \ + 'batch_size not correctly set' From abf1d4b992ea9c0ff4cbada136583d376d0e5848 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 10 Nov 2020 14:57:21 
+0100 Subject: [PATCH 86/88] fix mock pkgs in docs (#4591) * fix mock pkgs in docs * sphinx * CI Co-authored-by: chaton --- .github/workflows/docs-checks.yml | 3 ++- docs/source/conf.py | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index b060d1ace6600..2f91a4f5d43c8 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -8,7 +8,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] jobs: - check-docs: + sphinx-check: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 @@ -48,6 +48,7 @@ jobs: # python -m pip install --upgrade --user pip pip install --requirement requirements.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet pip install --requirement requirements/extra.txt + pip install --requirement requirements/loggers.txt pip install --requirement requirements/docs.txt python --version pip --version diff --git a/docs/source/conf.py b/docs/source/conf.py index c662e1e9c912e..38431c2264636 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -304,22 +304,23 @@ def package_list_from_file(file): return mocked_packages +# define mapping from PyPI names to python imports +PACKAGE_MAPPING = { + 'Pillow': 'PIL', + 'opencv-python': 'cv2', + 'PyYAML': 'yaml', + 'comet-ml': 'comet_ml', + 'neptune-client': 'neptune', +} MOCK_PACKAGES = [] if SPHINX_MOCK_REQUIREMENTS: # mock also base packages when we are on RTD since we don't install them there MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements/extra.txt')) MOCK_PACKAGES += package_list_from_file(os.path.join(PATH_ROOT, 'requirements/loggers.txt')) +MOCK_PACKAGES = [PACKAGE_MAPPING.get(pkg, pkg) for pkg in MOCK_PACKAGES] -MOCK_MANUAL_PACKAGES = [ - 'torchvision', - 'PIL', - # packages with different package name compare to import name - 'yaml', - 'comet_ml', - 'neptune', -] -autodoc_mock_imports = MOCK_PACKAGES + MOCK_MANUAL_PACKAGES +autodoc_mock_imports = MOCK_PACKAGES autosummary_generate = True From 7e08b0d710208e980f27896aa62a59f26f0cb3b3 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 19:44:51 +0000 Subject: [PATCH 87/88] [bug-fix] DDP and automatic_optimization=False (#4485) * resolve bug * add self._running_manual_optim * update * update tests * update lightning module * resolve bug * update tests * update * resolve pep8 * update * replace by `ddp_spawn` * temporary fix * update * update * move update to training_loop * make both ddp_spawn * introduce `manual_optimizer_step` * update changelog * added changelog wrong place * add force_optimizer_step * update docstring for tests * update optimizer_step * update zero_grad * resolve flake8 * move update into manual_optimizer_step * add zero_grad * remove zero_grad tests * remove manual_backward in AMP, it doesn't help * update * loosen tests * update * update doc * add TODO * Removed unnecessary get model from native amp * Remove try except with pytest raise * Add seed, clean up imports, remove try catch to reproduce error * update code * update test * revert back * formatting * Update pytorch_lightning/core/lightning.py Co-authored-by: Jirka Borovec Co-authored-by: SeanNaren Co-authored-by: Sean Naren Co-authored-by: Jirka Borovec --- .gitignore | 1 + CHANGELOG.md | 3 + 
 docs/source/lightning_module.rst              |   6 +
 docs/source/optimizers.rst                    |   7 +-
 pytorch_lightning/accelerators/accelerator.py |   9 +-
 pytorch_lightning/core/lightning.py           |  50 +++-
 pytorch_lightning/trainer/training_loop.py    |  28 +-
 .../optimization/test_manual_optimization.py  | 274 +++++++++++++++++-
 .../warnings_tests/test_flow_warnings.py      |  11 +-
 9 files changed, 366 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index fff549a718794..946d5f0f4c2ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,7 @@ timit_data/
 .Python
 ide_layouts/
 build/
+_build/
 develop-eggs/
 dist/
 downloads/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bb286e82759c7..16daa24aa2ed9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,6 +33,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775))
 
 
+- Added `manual_optimizer_step`, which works with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485))
+
+
 - Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482))
 
 
diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst
index 11641fc35e8a0..c26e0fc0351d1 100644
--- a/docs/source/lightning_module.rst
+++ b/docs/source/lightning_module.rst
@@ -1009,6 +1009,12 @@ manual_backward
 .. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward
     :noindex:
 
+manual_optimizer_step
+~~~~~~~~~~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_optimizer_step
+    :noindex:
+
 on_after_backward
 ~~~~~~~~~~~~~~~~~
diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst
index 1e7baadb64480..7f1bcc97662b4 100644
--- a/docs/source/optimizers.rst
+++ b/docs/source/optimizers.rst
@@ -36,8 +36,8 @@ to manually manage the optimization process. To do so, do the following:
 
     # use self.backward which will also handle scaling the loss when using amp
     self.manual_backward(loss_a, opt_g)
-    opt_g.step()
-    opt_g.zero_grad()
+    self.manual_optimizer_step(opt_g)
+
     # do anything you want
     loss_b = ...
 
@@ -45,8 +45,7 @@ to manually manage the optimization process.
To do so, do the following: # pass in any args that loss.backward() normally takes self.manual_backward(loss_b, opt_d, retain_graph=True) self.manual_backward(loss_b, opt_d) - opt_d.step() - opt_d.zero_grad() + self.manual_optimizer_step(opt_d) # log losses self.log('loss_a', loss_a) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index dc0b0bf63a98d..3b762e08ed5e6 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -109,10 +109,11 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): model_ref = self.trainer.get_model() is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE + using_native_amp = self.trainer.amp_backend == AMPType.NATIVE + automatic_optimization = self.trainer.train_loop.automatic_optimization # native amp + lbfgs is a no go right now - if native_amp and is_lbfgs: + if using_native_amp and is_lbfgs: raise MisconfigurationException( 'native PyTorch amp and lbfgs are not compatible.' ' To request, please file a Github issue in PyTorch and tag @mcarilli') @@ -125,12 +126,12 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): optimizer_idx=opt_idx, optimizer_closure=lambda_closure, on_tpu=False, # TPUAccelerator class sets this as True - using_native_amp=native_amp, + using_native_amp=using_native_amp, using_lbfgs=is_lbfgs ) # scale when native amp - if native_amp: + if automatic_optimization and using_native_amp: self.trainer.scaler.update() def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3d38f65892983..a332c0dcaa99a 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -111,6 +111,7 @@ def __init__(self, *args, **kwargs): self._datamodule = None self._results: Optional[Result] = None self._current_fx_name = '' + self._running_manual_backward = False self._current_hook_fx_name = None self._current_dataloader_idx = None @@ -1085,6 +1086,9 @@ def manual_backward(self, loss: Tensor, optimizer: Optimizer, *args, **kwargs) - .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set + and you use `model.manual_optimizer_step(optimizer)` + Example:: def training_step(...): @@ -1092,12 +1096,55 @@ def training_step(...): loss = ... # automatically applies scaling, etc... self.manual_backward(loss, opt_a) + self.manual_optimizer_step(opt_a) """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_backward') # backward + self._running_manual_backward = True self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) + self._running_manual_backward = False + + def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None: + """ + Call this directly from your training_step when doing optimizations manually. + By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you + + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set. 
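+
+        As a rough sketch of that interplay (assuming ``Trainer(accumulate_grad_batches=4)``)::
+
+            # called on every batch, but the underlying optimizer.step()
+            # only actually runs on every 4th call
+            self.manual_backward(loss, opt)
+            self.manual_optimizer_step(opt)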
+ + Args: + optimizer: Optimizer used to perform `.step()` call + + force_optimizer_step: Whether to force an optimizer step. Could be useful when having 2 optimizers + and one should use accumulated gradients but not the other one. + One could put its own logic to force an optimizer step. + + Example:: + + def training_step(...): + (opt_a, opt_b) = self.optimizers() + loss = ... + # automatically applies scaling, etc... + self.manual_backward(loss, opt_a) + # This will force an opt.step() even if accumulate_grad_batches is set. + self.manual_optimizer_step(opt_a, force_optimizer_step=True) + + """ + # make sure we're using manual opt + self._verify_is_manual_optimization('manual_optimizer_step') + + if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: + + # mock closure function as the user is responsible to call `manual_backward` + def mock_optimizer_closure(): + return + + self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) + + # update will be called after every optimizer_step call + if self.trainer.amp_backend == AMPType.NATIVE: + self.trainer.scaler.update() def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ @@ -1118,7 +1165,8 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - loss.backward(*args, **kwargs) + if self.trainer.train_loop.automatic_optimization or self._running_manual_backward: + loss.backward(*args, **kwargs) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): """ diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 2f66f5b1a600e..1cf06c3709e7e 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -306,6 +306,12 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): # when in dev debugging track the losses self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) + def _check_training_step_output(self, training_step_output): + if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization: + if training_step_output.grad_fn is None: + # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... + raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging model_ref = self.trainer.get_model() @@ -318,6 +324,8 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): training_step_output = self.trainer.accelerator_backend.training_step(args) self.trainer.logger_connector.cache_logged_metrics() + self._check_training_step_output(training_step_output) + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( @@ -690,6 +698,8 @@ def train_step_and_backward_closure(): if self._curr_step_result is None: # user decided to skip optimization + # make sure to zero grad. 
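+                # (note: in manual optimization the handler below clears the
+                # gradients of every configured optimizer, not just the current one)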
+ self.zero_grad_handler(batch_idx, optimizer, opt_idx) continue batch_outputs = self._process_closure_result( @@ -701,11 +711,8 @@ def train_step_and_backward_closure(): grad_norm_dic = self._cur_grad_norm_dict self._cur_grad_norm_dict = None - # hook - self.on_before_zero_grad(optimizer) - - # clear gradients - self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) + # hook + clear gradients + self.zero_grad_handler(batch_idx, optimizer, opt_idx) # update running loss + reset accumulated loss self.update_running_loss() @@ -929,3 +936,14 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() + + def zero_grad_handler(self, batch_idx, optimizer, opt_idx): + if self.automatic_optimization: + # hook + self.on_before_zero_grad(optimizer) + optimizers = enumerate([optimizer]) + else: + optimizers = self.get_optimizers_iterable() + + for idx, optimizer in optimizers: + self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 5f279c0b0a4db..d816c1e9bc5b1 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -11,13 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import collections import os -import torch + import pytest -from tests.base.boring_model import BoringModel, RandomDataset -from pytorch_lightning import Trainer +import torch + +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.utilities import APEX_AVAILABLE -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base.boring_model import BoringModel def test_multiple_optimizers_manual(tmpdir): @@ -355,3 +357,267 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls + + +class ManualOptimizationExtendedModel(BoringModel): + + count = 0 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update: + try: + assert not torch.equal(self.weight_before, after_before), self.count + except Exception: + # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4" + pass + else: + try: + assert torch.equal(self.weight_before, after_before) + except Exception: + # almost no diff between before and after + assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 + assert torch.all(self.layer.weight.grad == 0) + self.count += 1 + + def on_train_end(self): + assert 
self.called["training_step"] == 10 + assert self.called["on_train_batch_start"] == 10 + assert self.called["on_train_batch_end"] == 10 + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_manual_optimization_and_return_tensor(tmpdir): + """ + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + """ + + model = ManualOptimizationExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_manual_optimization_and_return_detached_tensor(tmpdir): + """ + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + When the tensor is detached, return MisConfiguration Error. + """ + + model = ManualOptimizationExtendedModel() + model.detach = True + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + expected_message = "In manual optimization, `training_step` should not return a Tensor" + with pytest.raises(Exception, match=expected_message): + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_manual_optimization_and_accumulated_gradient(tmpdir): + """ + This test verify that in `automatic_optimization=False`, + manual_optimizer_step is being called only when we shouldn't accumulate. 
+ """ + seed_everything(234) + + class ExtendedModel(BoringModel): + + count = 1 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + @property + def should_have_updated(self): + return self.count % 4 == 0 + + @property + def has_gradient(self): + return self.layer.weight.grad is not None + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update and self.should_have_updated: + assert not torch.equal(self.weight_before, after_before), self.count + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.equal(self.weight_before, after_before) + if self.count > 1: + if self.count % 4 == 1: + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.sum(self.layer.weight.grad) != 0 + self.count += 1 + + def on_train_end(self): + assert self.called["training_step"] == 20 + assert self.called["on_train_batch_start"] == 20 + assert self.called["on_train_batch_end"] == 20 + + model = ExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=20, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accumulate_grad_batches=4, + gpus=1, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_multiple_optimizers_manual_optimizer_step(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + """ + Tests that `manual_optimizer_step` works with several optimizers + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx, optimizer_idx): + # manual + (opt_a, opt_b) = self.optimizers() + x = batch[0] + + loss_1 = self(x) + loss_1 = self.loss(loss_1, loss_1) + + # make sure there are no grads + if self.layer.weight.grad is not None: + assert torch.all(self.layer.weight.grad == 0) + + self.manual_backward(loss_1, opt_a) + self.manual_optimizer_step(opt_a) + + # fake discriminator + loss_2 = self(x) + loss_2 = self.loss(loss_2, loss_2) + + # ensure we forward the correct params to the optimizer + # without retain_graph we can't do multiple backward passes + self.manual_backward(loss_2, opt_b, retain_graph=True) + self.manual_backward(loss_2, opt_a, retain_graph=True) + + assert self.layer.weight.grad is not None + self.manual_optimizer_step(opt_b) + + def training_epoch_end(self, outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return optimizer, optimizer_2 + + model = TestModel() + model.val_dataloader = None + + limit_train_batches = 2 + trainer = Trainer( + automatic_optimization=False, 
+ default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + precision=16, + amp_backend='native', + gpus=1 + ) + + trainer.fit(model) + + num_manual_backward_calls = 3 + assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls diff --git a/tests/trainer/warnings_tests/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py index 298237ad930dc..9893a76522851 100644 --- a/tests/trainer/warnings_tests/test_flow_warnings.py +++ b/tests/trainer/warnings_tests/test_flow_warnings.py @@ -17,17 +17,18 @@ import warnings +class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + return acc + + def test_no_depre_without_epoch_end(tmpdir): """ Tests that only training_step can be used """ os.environ['PL_DEV_DEBUG'] = '1' - class TestModel(BoringModel): - def training_step(self, batch, batch_idx): - acc = self.step(batch[0]) - return acc - model = TestModel() model.validation_epoch_end = None From 514cb22bd719e6ca056cacce730c8de875c9dbf6 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 21:13:41 +0000 Subject: [PATCH 88/88] [Fix] Move log value to cpu. (#4592) * move value to cpu to save memory * update * move to cpu * try something * update * update * add back out_dict.update({k: v}) * add move_metrics_to_cpu * update * Update pytorch_lightning/utilities/memory.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * resolve comments * Update pytorch_lightning/core/step_result.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py Co-authored-by: Jirka Borovec Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- pytorch_lightning/core/step_result.py | 6 ++++ .../logger_connector/epoch_result_store.py | 4 +++ .../logger_connector/logger_connector.py | 4 ++- pytorch_lightning/trainer/trainer.py | 32 ++++++++++++++++--- pytorch_lightning/trainer/training_loop.py | 2 ++ pytorch_lightning/utilities/memory.py | 9 ++++-- 6 files changed, 49 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 0eca72095e0e0..8f8a517d544f0 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -395,6 +395,12 @@ def detach(self): if isinstance(v, torch.Tensor): self.__setitem__(k, v.detach()) + def cpu(self): + """Move all self attributes to CPU.""" + for k, v in self.items(): + if isinstance(v, torch.Tensor): + self.__setitem__(k, v.cpu()) + def __repr__(self): self_copy = self.copy() diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 2980b037c95f7..9f8d029d9bef4 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -392,6 +392,10 @@ def cache_result(self) -> None: # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) + hook_result.detach() + if self.trainer.move_metrics_to_cpu: + hook_result.cpu() + self._internals[fx_name].append( hook_result, dataloader_idx=dataloader_idx, diff --git 
a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index 946064660f818..6a6a3229b8061 100644
--- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -93,7 +93,7 @@ def cache_logged_metrics(self) -> Union[EpochResultStore, None]:
         if self._current_stage is not None:
             self._cached_results[self._current_stage].cache_result()
 
-    def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
+    def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool):
         # logging
         self.configure_logger(logger)
         # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders
@@ -101,6 +101,8 @@ def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
         self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
         self.trainer.log_every_n_steps = log_every_n_steps
 
+        self.trainer.move_metrics_to_cpu = move_metrics_to_cpu
+
     @property
     def should_flush_logs(self):
         should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 2d4e2c0d9e4bd..4ef83dc7de544 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -60,6 +60,7 @@
 from pytorch_lightning.plugins.plugin_connector import PluginConnector
 from pytorch_lightning.accelerators.accelerator import Accelerator
 from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator
+from pytorch_lightning.utilities.memory import recursive_detach
 
 # warnings to ignore in trainer
 warnings.filterwarnings(
@@ -135,6 +136,7 @@ def __init__(
         amp_level: str = 'O2',
         distributed_backend: Optional[str] = None,
         automatic_optimization: bool = True,
+        move_metrics_to_cpu: bool = False,
     ):
         r"""
         Customize every aspect of training via flags
@@ -272,6 +274,9 @@ def __init__(
             stored in a different place than the logs written in `default_root_dir`.
             Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/'
             Defaults to `default_root_dir`.
+
+            move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu.
+                This can save some gpu memory, but can make training slower. Use with caution.
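+
+                Example (a rough sketch; assumes any ``LightningModule`` instance ``model``)::
+
+                    # keep logged tensors on the CPU, trading a little speed
+                    # for lower GPU memory usage
+                    trainer = Trainer(move_metrics_to_cpu=True)
+                    trainer.fit(model)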
""" super().__init__() @@ -363,7 +368,12 @@ def __init__( self.profile_connector.on_trainer_init(profiler) # init logger flags - self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps) + self.logger_connector.on_trainer_init( + logger, + flush_logs_every_n_steps, + log_every_n_steps, + move_metrics_to_cpu + ) # init debugging flags self.debugging_connector.on_init_start( @@ -603,12 +613,11 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): # log step metrics step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx) - if step_metrics is not None: - dl_step_metrics.append(step_metrics) + # track epoch level outputs + dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics) # track epoch level outputs - if output is not None: - dl_outputs.append(output) + dl_outputs = self.track_output_for_epoch_end(dl_outputs, output) self.evaluation_loop.outputs.append(dl_outputs) self.evaluation_loop.step_metrics.append(dl_step_metrics) @@ -634,6 +643,19 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): return eval_loop_results, deprecated_eval_results + def track_output_for_epoch_end(self, outputs, output): + if output is not None: + if isinstance(output, Result): + output.detach() + if self.move_metrics_to_cpu: + output.cpu() + elif isinstance(output, dict): + output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu) + elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu: + output = output.cpu() + outputs.append(output) + return outputs + def run_test(self): # only load test dataloader for testing # self.reset_test_dataloader(ref_model) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 1cf06c3709e7e..f705d82868da7 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -434,6 +434,8 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch): # track metrics without grads for epoch reduction training_step_output_for_epoch_end = copy(result) training_step_output_for_epoch_end.detach() + if self.trainer.move_metrics_to_cpu: + training_step_output_for_epoch_end.cpu() # what flows back into the system training_step_output = result diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 1d3b8d27807f0..16c0ede1e5413 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -17,7 +17,7 @@ import torch -def recursive_detach(in_dict: dict) -> dict: +def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict: """Detach all tensors in `in_dict`. May operate recursively if some of the values in `in_dict` are dictionaries @@ -26,6 +26,7 @@ def recursive_detach(in_dict: dict) -> dict: Args: in_dict: + to_cpu: Wheter to move tensor to cpu Return: out_dict: @@ -35,7 +36,11 @@ def recursive_detach(in_dict: dict) -> dict: if isinstance(v, dict): out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): - out_dict.update({k: v.detach()}) + # detach + v = v.detach() + if to_cpu: + v = v.cpu() + out_dict.update({k: v}) else: out_dict.update({k: v}) return out_dict