From 7e08b0d710208e980f27896aa62a59f26f0cb3b3 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 19:44:51 +0000 Subject: [PATCH 01/11] [bug-fix] DDP and automatic_optimization=False (#4485) * resolve bug * add self._running_manual_optim * update * update tests * update lightning module * resolve bug * update tests * update * resolve pep8 * update * replace by `ddp_spawn` * temporary fix * update * update * move update to training_loop * make both ddp_spawn * introduce `manual_optimizer_step` * update changelog * added changelog wrong place * add force_optimizer_step * update docstring for tests * update optimizer_step * update zero_grad * resolve flake8 * move update into manual_optimizer_step * add zero_grad * remove zero_grad tests * remove manual_backward in AMP, it doesn't help * update * loosen tests * update * update doc * add TODO * Removed unnecessary get model from native amp * Remove try except with pytest raise * Add seed, clean up imports, remove try catch to reproduce error * update code * update test * revert back * formatting * Update pytorch_lightning/core/lightning.py Co-authored-by: Jirka Borovec Co-authored-by: SeanNaren Co-authored-by: Sean Naren Co-authored-by: Jirka Borovec --- .gitignore | 1 + CHANGELOG.md | 3 + docs/source/lightning_module.rst | 6 + docs/source/optimizers.rst | 7 +- pytorch_lightning/accelerators/accelerator.py | 9 +- pytorch_lightning/core/lightning.py | 50 +++- pytorch_lightning/trainer/training_loop.py | 28 +- .../optimization/test_manual_optimization.py | 274 +++++++++++++++++- .../warnings_tests/test_flow_warnings.py | 11 +- 9 files changed, 366 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index fff549a7187945..946d5f0f4c2ca3 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ timit_data/ .Python ide_layouts/ build/ +_build/ develop-eggs/ dist/ downloads/ diff --git a/CHANGELOG.md b/CHANGELOG.md index bb286e82759c79..16daa24aa2ed9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) + + - Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) diff --git a/docs/source/lightning_module.rst b/docs/source/lightning_module.rst index 11641fc35e8a05..c26e0fc0351d16 100644 --- a/docs/source/lightning_module.rst +++ b/docs/source/lightning_module.rst @@ -1009,6 +1009,12 @@ manual_backward .. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_backward :noindex: +manual_optimizer_step +~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: pytorch_lightning.core.lightning.LightningModule.manual_optimizer_step + :noindex: + on_after_backward ~~~~~~~~~~~~~~~~~ diff --git a/docs/source/optimizers.rst b/docs/source/optimizers.rst index 1e7baadb64480c..7f1bcc97662b48 100644 --- a/docs/source/optimizers.rst +++ b/docs/source/optimizers.rst @@ -36,8 +36,8 @@ to manually manage the optimization process. To do so, do the following: # use self.backward which will also handle scaling the loss when using amp self.manual_backward(loss_a, opt_g) - opt_g.step() - opt_g.zero_grad() + self.manual_optimizer_step(opt_g) + # do anything you want loss_b = ... @@ -45,8 +45,7 @@ to manually manage the optimization process. To do so, do the following: # pass in any args that loss.backward() normally takes self.manual_backward(loss_b, opt_d, retain_graph=True) self.manual_backward(loss_b, opt_d) - opt_d.step() - opt_d.zero_grad() + self.manual_optimizer_step(opt_d) # log losses self.log('loss_a', loss_a) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index dc0b0bf63a98d6..3b762e08ed5e6d 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -109,10 +109,11 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): model_ref = self.trainer.get_model() is_lbfgs = isinstance(optimizer, torch.optim.LBFGS) - native_amp = self.trainer.amp_backend == AMPType.NATIVE + using_native_amp = self.trainer.amp_backend == AMPType.NATIVE + automatic_optimization = self.trainer.train_loop.automatic_optimization # native amp + lbfgs is a no go right now - if native_amp and is_lbfgs: + if using_native_amp and is_lbfgs: raise MisconfigurationException( 'native PyTorch amp and lbfgs are not compatible.' ' To request, please file a Github issue in PyTorch and tag @mcarilli') @@ -125,12 +126,12 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): optimizer_idx=opt_idx, optimizer_closure=lambda_closure, on_tpu=False, # TPUAccelerator class sets this as True - using_native_amp=native_amp, + using_native_amp=using_native_amp, using_lbfgs=is_lbfgs ) # scale when native amp - if native_amp: + if automatic_optimization and using_native_amp: self.trainer.scaler.update() def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 3d38f65892983d..a332c0dcaa99a1 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -111,6 +111,7 @@ def __init__(self, *args, **kwargs): self._datamodule = None self._results: Optional[Result] = None self._current_fx_name = '' + self._running_manual_backward = False self._current_hook_fx_name = None self._current_dataloader_idx = None @@ -1085,6 +1086,9 @@ def manual_backward(self, loss: Tensor, optimizer: Optimizer, *args, **kwargs) - .. tip:: In manual mode we still automatically clip grads if Trainer(gradient_clip_val=x) is set + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set + and you use `model.manual_optimizer_step(optimizer)` + Example:: def training_step(...): @@ -1092,12 +1096,55 @@ def training_step(...): loss = ... # automatically applies scaling, etc... self.manual_backward(loss, opt_a) + self.manual_optimizer_step(opt_a) """ # make sure we're using manual opt self._verify_is_manual_optimization('manual_backward') # backward + self._running_manual_backward = True self.trainer.train_loop.backward(loss, optimizer, -1, *args, **kwargs) + self._running_manual_backward = False + + def manual_optimizer_step(self, optimizer: Optimizer, force_optimizer_step:bool = False) -> None: + """ + Call this directly from your training_step when doing optimizations manually. + By using this we can ensure that all the proper scaling when using 16-bit etc has been done for you + + .. tip:: In manual mode we still automatically accumulate grad over batches if Trainer(accumulate_grad_batches=x) is set. + + Args: + optimizer: Optimizer used to perform `.step()` call + + force_optimizer_step: Whether to force an optimizer step. Could be useful when having 2 optimizers + and one should use accumulated gradients but not the other one. + One could put its own logic to force an optimizer step. + + Example:: + + def training_step(...): + (opt_a, opt_b) = self.optimizers() + loss = ... + # automatically applies scaling, etc... + self.manual_backward(loss, opt_a) + # This will force an opt.step() even if accumulate_grad_batches is set. + self.manual_optimizer_step(opt_a, force_optimizer_step=True) + + """ + # make sure we're using manual opt + self._verify_is_manual_optimization('manual_optimizer_step') + + if not self.trainer.train_loop.should_accumulate() or force_optimizer_step: + + # mock closure function as the user is responsible to call `manual_backward` + def mock_optimizer_closure(): + return + + self.trainer.train_loop.optimizer_step(optimizer, None, self.trainer.batch_idx, mock_optimizer_closure) + + # update will be called after every optimizer_step call + if self.trainer.amp_backend == AMPType.NATIVE: + self.trainer.scaler.update() def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args, **kwargs) -> None: """ @@ -1118,7 +1165,8 @@ def backward(self, loss, optimizer, optimizer_idx): loss.backward() """ - loss.backward(*args, **kwargs) + if self.trainer.train_loop.automatic_optimization or self._running_manual_backward: + loss.backward(*args, **kwargs) def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int): """ diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 2f66f5b1a600ec..1cf06c3709e7ec 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -306,6 +306,12 @@ def on_after_backward(self, training_step_output, batch_idx, untouched_loss): # when in dev debugging track the losses self.trainer.dev_debugger.track_train_loss_history(batch_idx, untouched_loss.detach()) + def _check_training_step_output(self, training_step_output): + if isinstance(training_step_output, torch.Tensor) and not self.automatic_optimization: + if training_step_output.grad_fn is None: + # TODO: Find why - RuntimeError: Expected to mark a variable ready only once ... + raise MisconfigurationException("In manual optimization, `training_step` should not return a Tensor") + def training_step(self, split_batch, batch_idx, opt_idx, hiddens): # give the PL module a result for logging model_ref = self.trainer.get_model() @@ -318,6 +324,8 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens): training_step_output = self.trainer.accelerator_backend.training_step(args) self.trainer.logger_connector.cache_logged_metrics() + self._check_training_step_output(training_step_output) + training_step_output = self.trainer.call_hook("training_step_end", training_step_output) training_step_output_for_epoch_end, training_step_output = self._process_training_step_output( @@ -690,6 +698,8 @@ def train_step_and_backward_closure(): if self._curr_step_result is None: # user decided to skip optimization + # make sure to zero grad. + self.zero_grad_handler(batch_idx, optimizer, opt_idx) continue batch_outputs = self._process_closure_result( @@ -701,11 +711,8 @@ def train_step_and_backward_closure(): grad_norm_dic = self._cur_grad_norm_dict self._cur_grad_norm_dict = None - # hook - self.on_before_zero_grad(optimizer) - - # clear gradients - self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) + # hook + clear gradients + self.zero_grad_handler(batch_idx, optimizer, opt_idx) # update running loss + reset accumulated loss self.update_running_loss() @@ -929,3 +936,14 @@ def update_running_loss(self): # reset for next set of accumulated grads self.accumulated_loss.reset() + + def zero_grad_handler(self, batch_idx, optimizer, opt_idx): + if self.automatic_optimization: + # hook + self.on_before_zero_grad(optimizer) + optimizers = enumerate([optimizer]) + else: + optimizers = self.get_optimizers_iterable() + + for idx, optimizer in optimizers: + self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) diff --git a/tests/trainer/optimization/test_manual_optimization.py b/tests/trainer/optimization/test_manual_optimization.py index 5f279c0b0a4db9..d816c1e9bc5b15 100644 --- a/tests/trainer/optimization/test_manual_optimization.py +++ b/tests/trainer/optimization/test_manual_optimization.py @@ -11,13 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import collections import os -import torch + import pytest -from tests.base.boring_model import BoringModel, RandomDataset -from pytorch_lightning import Trainer +import torch + +from pytorch_lightning import Trainer, seed_everything from pytorch_lightning.utilities import APEX_AVAILABLE -from pytorch_lightning.utilities.exceptions import MisconfigurationException +from tests.base.boring_model import BoringModel def test_multiple_optimizers_manual(tmpdir): @@ -355,3 +357,267 @@ def configure_optimizers(self): num_manual_backward_calls = 3 assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls + + +class ManualOptimizationExtendedModel(BoringModel): + + count = 0 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update: + try: + assert not torch.equal(self.weight_before, after_before), self.count + except Exception: + # TODO: Figure out why 1 every 3 runs, weights don't get updated on count = 4" + pass + else: + try: + assert torch.equal(self.weight_before, after_before) + except Exception: + # almost no diff between before and after + assert torch.abs(torch.sum(self.weight_before) - torch.sum(after_before)).item() < 10e-6 + assert torch.all(self.layer.weight.grad == 0) + self.count += 1 + + def on_train_end(self): + assert self.called["training_step"] == 10 + assert self.called["on_train_batch_start"] == 10 + assert self.called["on_train_batch_end"] == 10 + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_manual_optimization_and_return_tensor(tmpdir): + """ + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + """ + + model = ManualOptimizationExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_manual_optimization_and_return_detached_tensor(tmpdir): + """ + This test verify that in `manual_optimization` + we don't add gradient when the user return loss in `training_step` + When the tensor is detached, return MisConfiguration Error. + """ + + model = ManualOptimizationExtendedModel() + model.detach = True + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=10, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accelerator="ddp_spawn", + gpus=2, + ) + expected_message = "In manual optimization, `training_step` should not return a Tensor" + with pytest.raises(Exception, match=expected_message): + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_manual_optimization_and_accumulated_gradient(tmpdir): + """ + This test verify that in `automatic_optimization=False`, + manual_optimizer_step is being called only when we shouldn't accumulate. + """ + seed_everything(234) + + class ExtendedModel(BoringModel): + + count = 1 + called = collections.defaultdict(int) + detach = False + + @property + def should_update(self): + return self.count % 2 == 0 + + @property + def should_have_updated(self): + return self.count % 4 == 0 + + @property + def has_gradient(self): + return self.layer.weight.grad is not None + + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_start"] += 1 + self.weight_before = self.layer.weight.clone() + + def training_step(self, batch, batch_idx): + self.called["training_step"] += 1 + opt = self.optimizers() + output = self.layer(batch) + + loss = self.loss(batch, output) + loss /= loss.clone().detach() + loss *= 0.1 + + if self.should_update: + + self.manual_backward(loss, opt) + self.manual_optimizer_step(opt) + + return loss.detach() if self.detach else loss + + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + self.called["on_train_batch_end"] += 1 + after_before = self.layer.weight.clone() + if self.should_update and self.should_have_updated: + assert not torch.equal(self.weight_before, after_before), self.count + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.equal(self.weight_before, after_before) + if self.count > 1: + if self.count % 4 == 1: + assert torch.all(self.layer.weight.grad == 0) + else: + assert torch.sum(self.layer.weight.grad) != 0 + self.count += 1 + + def on_train_end(self): + assert self.called["training_step"] == 20 + assert self.called["on_train_batch_start"] == 20 + assert self.called["on_train_batch_end"] == 20 + + model = ExtendedModel() + model.training_step_end = None + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=20, + limit_test_batches=0, + limit_val_batches=0, + automatic_optimization=False, + precision=16, + amp_backend='native', + accumulate_grad_batches=4, + gpus=1, + ) + trainer.fit(model) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_multiple_optimizers_manual_optimizer_step(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + """ + Tests that `manual_optimizer_step` works with several optimizers + """ + class TestModel(BoringModel): + def training_step(self, batch, batch_idx, optimizer_idx): + # manual + (opt_a, opt_b) = self.optimizers() + x = batch[0] + + loss_1 = self(x) + loss_1 = self.loss(loss_1, loss_1) + + # make sure there are no grads + if self.layer.weight.grad is not None: + assert torch.all(self.layer.weight.grad == 0) + + self.manual_backward(loss_1, opt_a) + self.manual_optimizer_step(opt_a) + + # fake discriminator + loss_2 = self(x) + loss_2 = self.loss(loss_2, loss_2) + + # ensure we forward the correct params to the optimizer + # without retain_graph we can't do multiple backward passes + self.manual_backward(loss_2, opt_b, retain_graph=True) + self.manual_backward(loss_2, opt_a, retain_graph=True) + + assert self.layer.weight.grad is not None + self.manual_optimizer_step(opt_b) + + def training_epoch_end(self, outputs) -> None: + # outputs should be an array with an entry per optimizer + assert len(outputs) == 2 + + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + return optimizer, optimizer_2 + + model = TestModel() + model.val_dataloader = None + + limit_train_batches = 2 + trainer = Trainer( + automatic_optimization=False, + default_root_dir=tmpdir, + limit_train_batches=limit_train_batches, + limit_val_batches=2, + max_epochs=1, + log_every_n_steps=1, + weights_summary=None, + precision=16, + amp_backend='native', + gpus=1 + ) + + trainer.fit(model) + + num_manual_backward_calls = 3 + assert trainer.dev_debugger.count_events('backward_call') == limit_train_batches * num_manual_backward_calls diff --git a/tests/trainer/warnings_tests/test_flow_warnings.py b/tests/trainer/warnings_tests/test_flow_warnings.py index 298237ad930dc4..9893a765228515 100644 --- a/tests/trainer/warnings_tests/test_flow_warnings.py +++ b/tests/trainer/warnings_tests/test_flow_warnings.py @@ -17,17 +17,18 @@ import warnings +class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + return acc + + def test_no_depre_without_epoch_end(tmpdir): """ Tests that only training_step can be used """ os.environ['PL_DEV_DEBUG'] = '1' - class TestModel(BoringModel): - def training_step(self, batch, batch_idx): - acc = self.step(batch[0]) - return acc - model = TestModel() model.validation_epoch_end = None From 514cb22bd719e6ca056cacce730c8de875c9dbf6 Mon Sep 17 00:00:00 2001 From: chaton Date: Tue, 10 Nov 2020 21:13:41 +0000 Subject: [PATCH 02/11] [Fix] Move log value to cpu. (#4592) * move value to cpu to save memory * update * move to cpu * try something * update * update * add back out_dict.update({k: v}) * add move_metrics_to_cpu * update * Update pytorch_lightning/utilities/memory.py Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> * resolve comments * Update pytorch_lightning/core/step_result.py Co-authored-by: Jirka Borovec * Update pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py Co-authored-by: Jirka Borovec Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- pytorch_lightning/core/step_result.py | 6 ++++ .../logger_connector/epoch_result_store.py | 4 +++ .../logger_connector/logger_connector.py | 4 ++- pytorch_lightning/trainer/trainer.py | 32 ++++++++++++++++--- pytorch_lightning/trainer/training_loop.py | 2 ++ pytorch_lightning/utilities/memory.py | 9 ++++-- 6 files changed, 49 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 0eca72095e0e06..8f8a517d544f01 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -395,6 +395,12 @@ def detach(self): if isinstance(v, torch.Tensor): self.__setitem__(k, v.detach()) + def cpu(self): + """Move all self attributes to CPU.""" + for k, v in self.items(): + if isinstance(v, torch.Tensor): + self.__setitem__(k, v.cpu()) + def __repr__(self): self_copy = self.copy() diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 2980b037c95f71..9f8d029d9bef4c 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -392,6 +392,10 @@ def cache_result(self) -> None: # attach capture batch_size Result.attach_batch_size(self._batch_size, hook_result) + hook_result.detach() + if self.trainer.move_metrics_to_cpu: + hook_result.cpu() + self._internals[fx_name].append( hook_result, dataloader_idx=dataloader_idx, diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 946064660f818d..6a6a3229b8061b 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -93,7 +93,7 @@ def cache_logged_metrics(self) -> Union[EpochResultStore, None]: if self._current_stage is not None: self._cached_results[self._current_stage].cache_result() - def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps): + def on_trainer_init(self, logger, flush_logs_every_n_steps: int, log_every_n_steps: int, move_metrics_to_cpu: bool): # logging self.configure_logger(logger) # todo: IDE is complaining, these shall be initialized in the Trainer init at leas as placeholders @@ -101,6 +101,8 @@ def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps): self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps self.trainer.log_every_n_steps = log_every_n_steps + self.trainer.move_metrics_to_cpu = move_metrics_to_cpu + @property def should_flush_logs(self): should_flush = (self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0 diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 2d4e2c0d9e4bdc..4ef83dc7de544c 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -60,6 +60,7 @@ from pytorch_lightning.plugins.plugin_connector import PluginConnector from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.accelerators.cpu_accelerator import CPUAccelerator +from pytorch_lightning.utilities.memory import recursive_detach # warnings to ignore in trainer warnings.filterwarnings( @@ -135,6 +136,7 @@ def __init__( amp_level: str = 'O2', distributed_backend: Optional[str] = None, automatic_optimization: bool = True, + move_metrics_to_cpu: bool = False, ): r""" Customize every aspect of training via flags @@ -272,6 +274,9 @@ def __init__( stored in a different place than the logs written in `default_root_dir`. Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/' Defaults to `default_root_dir`. + + move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu. + This can save some gpu memory, but can make training slower. Use with attention. """ super().__init__() @@ -363,7 +368,12 @@ def __init__( self.profile_connector.on_trainer_init(profiler) # init logger flags - self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps) + self.logger_connector.on_trainer_init( + logger, + flush_logs_every_n_steps, + log_every_n_steps, + move_metrics_to_cpu + ) # init debugging flags self.debugging_connector.on_init_start( @@ -603,12 +613,11 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): # log step metrics step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx) - if step_metrics is not None: - dl_step_metrics.append(step_metrics) + # track epoch level outputs + dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics) # track epoch level outputs - if output is not None: - dl_outputs.append(output) + dl_outputs = self.track_output_for_epoch_end(dl_outputs, output) self.evaluation_loop.outputs.append(dl_outputs) self.evaluation_loop.step_metrics.append(dl_step_metrics) @@ -634,6 +643,19 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): return eval_loop_results, deprecated_eval_results + def track_output_for_epoch_end(self, outputs, output): + if output is not None: + if isinstance(output, Result): + output.detach() + if self.move_metrics_to_cpu: + output.cpu() + elif isinstance(output, dict): + output = recursive_detach(output, to_cpu=self.move_metrics_to_cpu) + elif isinstance(output, torch.Tensor) and output.is_cuda and self.move_metrics_to_cpu: + output = output.cpu() + outputs.append(output) + return outputs + def run_test(self): # only load test dataloader for testing # self.reset_test_dataloader(ref_model) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 1cf06c3709e7ec..f705d82868da78 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -434,6 +434,8 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch): # track metrics without grads for epoch reduction training_step_output_for_epoch_end = copy(result) training_step_output_for_epoch_end.detach() + if self.trainer.move_metrics_to_cpu: + training_step_output_for_epoch_end.cpu() # what flows back into the system training_step_output = result diff --git a/pytorch_lightning/utilities/memory.py b/pytorch_lightning/utilities/memory.py index 1d3b8d27807f05..16c0ede1e5413e 100644 --- a/pytorch_lightning/utilities/memory.py +++ b/pytorch_lightning/utilities/memory.py @@ -17,7 +17,7 @@ import torch -def recursive_detach(in_dict: dict) -> dict: +def recursive_detach(in_dict: dict, to_cpu: bool = False) -> dict: """Detach all tensors in `in_dict`. May operate recursively if some of the values in `in_dict` are dictionaries @@ -26,6 +26,7 @@ def recursive_detach(in_dict: dict) -> dict: Args: in_dict: + to_cpu: Wheter to move tensor to cpu Return: out_dict: @@ -35,7 +36,11 @@ def recursive_detach(in_dict: dict) -> dict: if isinstance(v, dict): out_dict.update({k: recursive_detach(v)}) elif callable(getattr(v, 'detach', None)): - out_dict.update({k: v.detach()}) + # detach + v = v.detach() + if to_cpu: + v = v.cpu() + out_dict.update({k: v}) else: out_dict.update({k: v}) return out_dict From 3d202f9ecc4137b08cb5b1ac15af276456fcfaaf Mon Sep 17 00:00:00 2001 From: chaton Date: Wed, 11 Nov 2020 17:05:24 +0000 Subject: [PATCH 03/11] [FEAT] Refactor logging 3/3 [v1] (#4552) * wip * wip check how many tests break * wip * resolve some bugs * resolve more bugs * resolve 2 bugs * resolve * temp fix * update * remove useless code * remove result * try to resolve bug * update changelog * formatting * remove pl Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- CHANGELOG.md | 7 +- .../logger_connector/epoch_result_store.py | 10 +- .../logger_connector/logger_connector.py | 151 ++----- pytorch_lightning/trainer/evaluation_loop.py | 91 ++--- pytorch_lightning/trainer/trainer.py | 53 +-- tests/models/test_hooks.py | 2 +- .../test_eval_loop_dict_return.py | 6 +- .../trainer/logging/test_logger_connector.py | 12 +- .../test_eval_loop_logging_1_0.py | 383 +++++++++++++++++- 9 files changed, 501 insertions(+), 214 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16daa24aa2ed9b..7da839b74e3432 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,8 +33,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) -- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) +- Added logging using `self.log` in train and evaluation for most callbacks and model hooks ( + [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552), + [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495), + [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439) +) +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) - Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py index 9f8d029d9bef4c..7ea2e8208a8748 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/epoch_result_store.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from copy import deepcopy from collections import defaultdict, ChainMap from enum import Enum from typing import Union, Tuple, Any, Dict, Optional, List @@ -419,13 +420,14 @@ def update_logger_connector(self, fx_name: str = None) -> None: logger_connector = self.trainer.logger_connector callback_metrics = {} + is_train = self._stage in LoggerStages.TRAIN.value if not self._has_batch_loop_finished: # get pbar batch_pbar_metrics = self.get_latest_batch_pbar_metrics() logger_connector.add_progress_bar_metrics(batch_pbar_metrics) - if self._stage in LoggerStages.TRAIN.value: + if is_train: # Only log and add to callback epoch step during evaluation, test. batch_log_metrics = self.get_latest_batch_log_metrics() logger_connector.logged_metrics.update(batch_log_metrics) @@ -443,6 +445,9 @@ def update_logger_connector(self, fx_name: str = None) -> None: epoch_log_metrics = self.get_epoch_log_metrics() logger_connector.logged_metrics.update(epoch_log_metrics) logger_connector.logged_metrics.update(epoch_dict) + if not self.trainer.running_sanity_check and not is_train: + if len(epoch_log_metrics) > 0: + self.trainer.dev_debugger.track_logged_metrics_history(deepcopy(epoch_log_metrics)) # get forked_metrics forked_metrics = self.get_forked_metrics() @@ -451,6 +456,9 @@ def update_logger_connector(self, fx_name: str = None) -> None: callback_metrics.update(epoch_log_metrics) callback_metrics.update(forked_metrics) + if not is_train: + logger_connector.evaluation_callback_metrics.update(callback_metrics) + # update callback_metrics logger_connector.callback_metrics.update(callback_metrics) logger_connector.callback_metrics.pop("epoch", None) diff --git a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index 6a6a3229b8061b..7116739c8aa1f8 100644 --- a/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -36,6 +36,7 @@ class LoggerConnector: def __init__(self, trainer): self.trainer = trainer self.callback_metrics = {} + self.evaluation_callback_metrics = {} self.logged_metrics = {} self.progress_bar_metrics = {} self.eval_loop_results = [] @@ -59,10 +60,9 @@ def check_logging_in_callbacks(self, hook_fx_name, on_step: bool = None, on_epoc on_epoch=on_epoch) def on_evaluation_batch_start(self, testing, batch, dataloader_idx, num_dataloaders): - # reset the result of the PL module model = self.trainer.get_model() + # set dataloader_idx only if multiple ones model._current_dataloader_idx = dataloader_idx if num_dataloaders > 1 else None - # track batch_size self.cached_results._batch_size = Result.extract_batch_size(batch) @@ -226,19 +226,41 @@ def add_progress_bar_metrics(self, metrics): self.trainer.dev_debugger.track_pbar_metrics_history(metrics) - def on_evaluation_epoch_end(self, deprecated_eval_results, epoch_logs, using_eval_result, test_mode): + def track_metrics_deprecated(self, deprecated_eval_results, using_eval_result, test_mode): self._track_callback_metrics(deprecated_eval_results, using_eval_result) - - # TODO: deprecate parts of this for 1.0 (when removing results) self.__process_eval_epoch_end_results_and_log_legacy(deprecated_eval_results, test_mode) - self._log_on_evaluation_epoch_end_metrics(epoch_logs) + def evaluation_epoch_end(self, testing): + # reset dataloader idx + model_ref = self.trainer.get_model() + model_ref._current_dataloader_idx = None + + # setting `has_batch_loop_finished` to True + # will perform Results reduction accross entire epoch. + self.cached_results.has_batch_loop_finished = True + + def add_to_eval_loop_results(self, dl_idx, has_been_initialized): + callback_metrics = deepcopy(self.evaluation_callback_metrics) + for key in list(callback_metrics.keys()): + if "dataloader_idx" in key: + if f"dataloader_idx_{dl_idx}" not in key: + # remove dl_idx from self.callback_metrics not belonging to this dataset. + del callback_metrics[key] + if has_been_initialized: + self.eval_loop_results[dl_idx].update(callback_metrics) + else: + self.eval_loop_results.append(callback_metrics) - # get the final loop results - eval_loop_results = self._get_evaluate_epoch_results(test_mode) - return eval_loop_results + def prepare_eval_loop_results(self): + num_dataloaders = self.trainer.evaluation_loop.num_dataloaders + has_been_initialized = len(self.eval_loop_results) == num_dataloaders + for dl_idx in range(self.trainer.evaluation_loop.num_dataloaders): + self.add_to_eval_loop_results(dl_idx, has_been_initialized) + + def get_evaluate_epoch_results(self, test_mode): + + self.prepare_eval_loop_results() - def _get_evaluate_epoch_results(self, test_mode): # log results of test if test_mode and self.trainer.is_global_zero and self.trainer.verbose_test: print('-' * 80) @@ -253,106 +275,6 @@ def _get_evaluate_epoch_results(self, test_mode): self.eval_loop_results = [] return results - def _log_on_evaluation_epoch_end_metrics(self, epoch_logs): - step_metrics = self.trainer.evaluation_loop.step_metrics - - num_loaders = len(step_metrics) - - # clear mem - self.trainer.evaluation_loop.step_metrics = [] - - if self.trainer.running_sanity_check: - return - - # track all metrics we want to log - metrics_to_log = [] - - # --------------------------- - # UPDATE EPOCH LOGGED METRICS - # --------------------------- - # (ie: in methods at the val_epoch_end level) - # union the epoch logs with whatever was returned from loaders and reduced - epoch_logger_metrics = epoch_logs.get_epoch_log_metrics() - epoch_pbar_metrics = epoch_logs.get_epoch_pbar_metrics() - - self.logged_metrics.update(epoch_logger_metrics) - self.add_progress_bar_metrics(epoch_pbar_metrics) - - # enable the metrics to be monitored - self.callback_metrics.update(epoch_logger_metrics) - self.callback_metrics.update(epoch_pbar_metrics) - - if len(epoch_logger_metrics) > 0: - metrics_to_log.append(epoch_logger_metrics) - - # -------------------------------- - # UPDATE METRICS PER DATALOADER - # -------------------------------- - # each dataloader aggregated metrics - # now we log all of them - for dl_idx, dl_metrics in enumerate(step_metrics): - if len(dl_metrics) == 0: - # Ensure custom logged metrics are included if not included with step metrics - if len(epoch_logger_metrics) > 0: - self.eval_loop_results.append(epoch_logger_metrics) - continue - - reduced_epoch_metrics = dl_metrics[0].__class__.reduce_on_epoch_end(dl_metrics) - # track the metrics - logger_metrics = reduced_epoch_metrics.get_epoch_log_metrics() - pbar_metrics = reduced_epoch_metrics.get_epoch_pbar_metrics() - forked_metrics = reduced_epoch_metrics.get_forked_metrics() - - # make the keys 'k/dl' - logger_metrics = self.__rename_keys_by_dataloader_idx(logger_metrics, dl_idx, num_loaders) - pbar_metrics = self.__rename_keys_by_dataloader_idx(pbar_metrics, dl_idx, num_loaders) - forked_metrics = self.__rename_keys_by_dataloader_idx(forked_metrics, dl_idx, num_loaders) - - self.logged_metrics.update(logger_metrics) - self.add_progress_bar_metrics(pbar_metrics) - - # enable the metrics to be monitored - self.callback_metrics.update(logger_metrics) - self.callback_metrics.update(pbar_metrics) - - # forked metrics were dropped, enable them for callbacks - self.callback_metrics.update(forked_metrics) - - # track the final results for the dataloader - self.add_to_eval_loop_results(dl_idx, num_loaders) - - # actually log - if len(logger_metrics) > 0: - metrics_to_log.append(logger_metrics) - - # log all the metrics as a s single dict - metrics_to_log = dict(ChainMap(*metrics_to_log)) - if len(metrics_to_log) > 0: - self.log_metrics(metrics_to_log, {}) - - def add_to_eval_loop_results(self, dl_idx, num_loaders): - callback_metrics = deepcopy(self.callback_metrics) - if num_loaders == 1: - if len(self.eval_loop_results) > 0: - self.eval_loop_results[0].update(callback_metrics) - else: - self.eval_loop_results.append(callback_metrics) - return - - for key in list(callback_metrics.keys()): - if "dataloader_idx" in key: - if f"dataloader_idx_{dl_idx}" not in key: - # remove dl_idx from self.callback_metrics not belonging to this dataset. - del callback_metrics[key] - self.eval_loop_results.append(callback_metrics) - - def __rename_keys_by_dataloader_idx(self, metrics, dataloader_idx, num_loaders): - if num_loaders == 1: - return metrics - - result = {f'{k}/dataloader_idx_{dataloader_idx}': v for k, v in metrics.items()} - return result - def _track_callback_metrics(self, eval_results, using_eval_result): if ( len(eval_results) > 0 and @@ -364,8 +286,10 @@ def _track_callback_metrics(self, eval_results, using_eval_result): if isinstance(eval_results, list): for eval_result in eval_results: self.trainer.logger_connector.callback_metrics.update(eval_result.callback_metrics) + self.trainer.logger_connector.evaluation_callback_metrics.update(eval_result.callback_metrics) else: self.trainer.logger_connector.callback_metrics.update(eval_results.callback_metrics) + self.trainer.logger_connector.evaluation_callback_metrics.update(eval_results.callback_metrics) else: flat = {} if isinstance(eval_results, list): @@ -381,6 +305,7 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['checkpoint_on'] = flat['val_loss'] flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) + self.trainer.logger_connector.evaluation_callback_metrics.update(flat) else: # with a scalar return, auto set it to "val_loss" for callbacks if isinstance(eval_results, torch.Tensor): @@ -393,6 +318,7 @@ def _track_callback_metrics(self, eval_results, using_eval_result): flat['checkpoint_on'] = flat['val_loss'] flat['early_stop_on'] = flat['val_loss'] self.trainer.logger_connector.callback_metrics.update(flat) + self.trainer.logger_connector.evaluation_callback_metrics.update(flat) def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metrics, log_metrics, callback_metrics): # eval loop returns all metrics @@ -406,9 +332,10 @@ def __process_eval_epoch_end_results_and_log_legacy_update(self, prog_bar_metric self.trainer.logger_connector.log_metrics(log_metrics, {}) # track metrics for callbacks (all prog bar, logged and callback metrics) + callback_metrics.update(log_metrics) + callback_metrics.update(prog_bar_metrics) self.trainer.logger_connector.callback_metrics.update(callback_metrics) - self.trainer.logger_connector.callback_metrics.update(log_metrics) - self.trainer.logger_connector.callback_metrics.update(prog_bar_metrics) + self.trainer.logger_connector.evaluation_callback_metrics.update(callback_metrics) if len(dataloader_result_metrics) > 0: self.eval_loop_results.append(dataloader_result_metrics) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 6ebab1ade0f1d3..e3a0f1108f1f96 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -109,9 +109,6 @@ def on_evaluation_end(self, *args, **kwargs): else: self.trainer.call_hook('on_validation_end', *args, **kwargs) - # reset stage to train - self.trainer.logger_connector.set_stage("train") - def reload_evaluation_dataloaders(self): model = self.trainer.get_model() if self.testing: @@ -169,12 +166,17 @@ def evaluation_step(self, test_mode, batch, batch_idx, dataloader_idx): # configure args args = self.build_args(test_mode, batch, batch_idx, dataloader_idx) + model_ref = self.trainer.get_model() # run actual test step if self.testing: + model_ref._current_fx_name = "test_step" output = self.trainer.accelerator_backend.test_step(args) else: + model_ref._current_fx_name = "validation_step" output = self.trainer.accelerator_backend.validation_step(args) + # capture any logged information + self.trainer.logger_connector.cache_logged_metrics() # track batch size for weighted average is_result_obj = isinstance(output, Result) if is_result_obj: @@ -194,38 +196,30 @@ def evaluation_step_end(self, *args, **kwargs): output = self.trainer.call_hook('validation_step_end', *args, **kwargs) return output - def evaluation_epoch_end(self, num_dataloaders): + def evaluation_epoch_end(self): + # unset dataloder_idx in model + self.trainer.logger_connector.evaluation_epoch_end(self.testing) + using_eval_result = self.is_using_eval_results() # call the model epoch end - deprecated_results = self.__run_eval_epoch_end(num_dataloaders, using_eval_result) - - # 1.0 - epoch_logs = self.trainer.get_model()._results + deprecated_results = self.__run_eval_epoch_end(self.num_dataloaders, using_eval_result) # enable returning anything for i, r in enumerate(deprecated_results): if not isinstance(r, (dict, Result, torch.Tensor)): deprecated_results[i] = [] - return deprecated_results, epoch_logs + return deprecated_results - def log_epoch_metrics(self, deprecated_eval_results, epoch_logs, test_mode): - using_eval_result = self.is_using_eval_results() - eval_loop_results = self.trainer.logger_connector.on_evaluation_epoch_end( - deprecated_eval_results, - epoch_logs, - using_eval_result, - test_mode - ) + def log_epoch_metrics_on_evaluation_end(self): + # get the final loop results + eval_loop_results = self.trainer.logger_connector.get_evaluate_epoch_results(self.testing) return eval_loop_results def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): model = self.trainer.get_model() - # reset results - model._results = Result() - # with a single dataloader don't pass an array outputs = self.outputs eval_results = outputs @@ -236,22 +230,22 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): if self.testing: if is_overridden('test_epoch_end', model=model): - model._current_fx_name = 'test_epoch_end' if using_eval_result: eval_results = self.__gather_epoch_end_eval_results(outputs) - + model._current_fx_name = 'test_epoch_end' eval_results = model.test_epoch_end(eval_results) user_reduced = True else: if is_overridden('validation_epoch_end', model=model): - model._current_fx_name = 'validation_epoch_end' if using_eval_result: eval_results = self.__gather_epoch_end_eval_results(outputs) - + model._current_fx_name = 'validation_epoch_end' eval_results = model.validation_epoch_end(eval_results) user_reduced = True + # capture logging + self.trainer.logger_connector.cache_logged_metrics() # depre warning if eval_results is not None and user_reduced: step = 'testing_epoch_end' if self.testing else 'validation_epoch_end' @@ -266,6 +260,9 @@ def __run_eval_epoch_end(self, num_dataloaders, using_eval_result): if not isinstance(eval_results, list): eval_results = [eval_results] + # track depreceated metrics + self.trainer.logger_connector.track_metrics_deprecated(eval_results, using_eval_result, self.testing) + return eval_results def __gather_epoch_end_eval_results(self, outputs): @@ -299,12 +296,7 @@ def __auto_reduce_result_objs(self, outputs): return eval_results def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx): - # reset the result of the PL module - model = self.trainer.get_model() - model._results = Result() - model._current_fx_name = 'evaluation_step' - - # set dataloader_idx and track batch_size + # set dataloader_idx to model and track batch_size self.trainer.logger_connector.on_evaluation_batch_start( self.testing, batch, dataloader_idx, self.num_dataloaders) @@ -313,13 +305,16 @@ def on_evaluation_batch_start(self, batch, batch_idx, dataloader_idx): else: self.trainer.call_hook('on_validation_batch_start', batch, batch_idx, dataloader_idx) - def on_evaluation_batch_end(self, *args, **kwargs): + def on_evaluation_batch_end(self, output, batch, batch_idx, dataloader_idx): if self.testing: - self.trainer.call_hook('on_test_batch_end', *args, **kwargs) + self.trainer.call_hook('on_test_batch_end', output, batch, batch_idx, dataloader_idx) else: - self.trainer.call_hook('on_validation_batch_end', *args, **kwargs) + self.trainer.call_hook('on_validation_batch_end', output, batch, batch_idx, dataloader_idx) - def evaluation_batch_end_cleanup(self, output, batch_idx, dataloader_idx): + # store predicitons if do_write_predictions and track eval loss history + self.store_predictions(output, batch_idx, dataloader_idx) + + def store_predictions(self, output, batch_idx, dataloader_idx): # Add step predictions to prediction collection to write later if output is not None: do_write_predictions = isinstance(output, Result) and self.testing @@ -336,30 +331,26 @@ def on_evaluation_epoch_end(self, *args, **kwargs): else: self.trainer.call_hook('on_validation_epoch_end', *args, **kwargs) - def log_evaluation_step_metrics(self, batch, batch_idx): - results = self.trainer.get_model()._results - if len(results) == 1: - return None - - results.track_batch_size(batch) - self.__log_result_step_metrics(results, batch_idx) - - return results - - # TODO: deprecate at 1.0 - def log_evaluation_step_metrics_legacy(self, output, batch_idx): + def log_evaluation_step_metrics(self, output, batch_idx): if self.trainer.running_sanity_check: return + step_log_metrics = {} + step_pbar_metrics = {} if isinstance(output, EvalResult): - self.__log_result_step_metrics(output, batch_idx) + step_log_metrics = output.get_batch_log_metrics(include_forked_originals=False) + step_pbar_metrics = output.get_batch_pbar_metrics(include_forked_originals=False) - def __log_result_step_metrics(self, output, batch_idx): - step_log_metrics = output.get_batch_log_metrics(include_forked_originals=False) - step_pbar_metrics = output.get_batch_pbar_metrics(include_forked_originals=False) + self.__log_result_step_metrics(step_log_metrics, step_pbar_metrics, batch_idx) + def __log_result_step_metrics(self, step_log_metrics, step_pbar_metrics, batch_idx): cached_batch_log_metrics = \ self.trainer.logger_connector.cached_results.get_latest_batch_log_metrics() + cached_batch_pbar_metrics = \ + self.trainer.logger_connector.cached_results.get_latest_batch_pbar_metrics() + + step_log_metrics.update(cached_batch_log_metrics) + step_pbar_metrics.update(cached_batch_pbar_metrics) if len(step_log_metrics) > 0: # make the metrics appear as a different line in the same graph diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4ef83dc7de544c..ad7a20e500687e 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -239,6 +239,8 @@ def __init__( num_nodes: number of GPU nodes for distributed training. + num_processes: number of processes for distributed training with distributed_backend="ddp_cpu" + num_sanity_val_steps: Sanity check runs n validation batches before starting the training routine. Set it to `-1` to run all batches in all validation dataloaders. Default: 2 @@ -575,14 +577,12 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): self.evaluation_loop.setup(model, max_batches, dataloaders) # hook - # TODO: should this be insider the dataloader loop? self.evaluation_loop.on_evaluation_epoch_start() # run validation/testing for dataloader_idx, dataloader in enumerate(dataloaders): # bookkeeping dl_outputs = [] - dl_step_metrics = [] dataloader = self.accelerator_backend.process_dataloader(dataloader) dl_max_batches = self.evaluation_loop.max_batches[dataloader_idx] @@ -601,46 +601,37 @@ def run_evaluation(self, test_mode: bool = False, max_batches=None): output = self.evaluation_loop.evaluation_step(test_mode, batch, batch_idx, dataloader_idx) output = self.evaluation_loop.evaluation_step_end(output) - # hook + # hook + store predictions self.evaluation_loop.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx) - # clean up - self.evaluation_loop.evaluation_batch_end_cleanup(output, batch_idx, dataloader_idx) - - # TODO: deprecate 1.0 - self.evaluation_loop.log_evaluation_step_metrics_legacy(output, batch_idx) - - # log step metrics - step_metrics = self.evaluation_loop.log_evaluation_step_metrics(batch, batch_idx) - - # track epoch level outputs - dl_step_metrics = self.track_output_for_epoch_end(dl_step_metrics, step_metrics) + # log batch metrics + self.evaluation_loop.log_evaluation_step_metrics(output, batch_idx) # track epoch level outputs dl_outputs = self.track_output_for_epoch_end(dl_outputs, output) + # store batch level output per dataloader self.evaluation_loop.outputs.append(dl_outputs) - self.evaluation_loop.step_metrics.append(dl_step_metrics) # lightning module method - deprecated_eval_results, epoch_logs = self.evaluation_loop.evaluation_epoch_end( - num_dataloaders=len(dataloaders) - ) - - # bookkeeping - eval_loop_results = self.evaluation_loop.log_epoch_metrics(deprecated_eval_results, epoch_logs, test_mode) - self.evaluation_loop.predictions.to_disk() + deprecated_eval_results = self.evaluation_loop.evaluation_epoch_end() # hook self.evaluation_loop.on_evaluation_epoch_end() + # hook + self.evaluation_loop.on_evaluation_end() + + # log epoch metrics + eval_loop_results = self.evaluation_loop.log_epoch_metrics_on_evaluation_end() + + # save predictions to disk + self.evaluation_loop.predictions.to_disk() + # enable train mode again self.evaluation_loop.on_evaluation_model_train() torch.set_grad_enabled(True) - # hook - self.evaluation_loop.on_evaluation_end() - return eval_loop_results, deprecated_eval_results def track_output_for_epoch_end(self, outputs, output): @@ -874,10 +865,8 @@ def _cache_logged_metrics(self): self.logger_connector.cache_logged_metrics() def call_hook(self, hook_name, *args, **kwargs): - # temporary. Don't modify evaluation behaviour - if self.logger_connector._current_stage == "train": - # set hook_name to model + reset Result obj - self._reset_result_and_set_hook_fx_name(hook_name) + # set hook_name to model + reset Result obj + self._reset_result_and_set_hook_fx_name(hook_name) # always profile hooks with self.profiler.profile(hook_name): @@ -900,8 +889,6 @@ def call_hook(self, hook_name, *args, **kwargs): accelerator_hook = getattr(self.accelerator_backend, hook_name) output = accelerator_hook(*args, **kwargs) - # temporary. Don't modify evaluation behaviour - if self.logger_connector._current_stage == "train": - # capture logging - self._cache_logged_metrics() + # capture logging + self._cache_logged_metrics() return output diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index bccc5262a5bda9..f3af5b745a3804 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -333,8 +333,8 @@ def on_test_model_train(self): 'on_validation_batch_start', 'on_validation_batch_end', 'on_validation_epoch_end', - 'on_validation_model_train', 'on_save_checkpoint', + 'on_validation_model_train', 'on_epoch_end', 'on_train_epoch_end', 'on_train_end', diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py index 6329480e10a113..8168f09c68e003 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_eval_loop_dict_return.py @@ -44,7 +44,7 @@ def backward(self, loss, optimizer, optimizer_idx): # out are the results of the full loop # eval_results are output of _evaluate out, eval_results = trainer.run_evaluation(test_mode=False) - assert len(out) == 0 + assert len(out) == 1 assert len(eval_results) == 0 # make sure correct steps were called @@ -75,7 +75,7 @@ def test_validation_step_scalar_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate out, eval_results = trainer.run_evaluation(test_mode=False) - assert len(out) == 0 + assert len(out) == 1 assert len(eval_results) == 2 assert eval_results[0] == 171 and eval_results[1] == 171 @@ -190,7 +190,7 @@ def test_val_step_step_end_no_return(tmpdir): # out are the results of the full loop # eval_results are output of _evaluate callback_metrics, eval_results = trainer.run_evaluation(test_mode=False) - assert len(callback_metrics) == 0 + assert len(callback_metrics) == 1 assert len(eval_results) == 0 # make sure correct steps were called diff --git a/tests/trainer/logging/test_logger_connector.py b/tests/trainer/logging/test_logger_connector.py index 08936f89eb9f85..38a7dee896a8cb 100644 --- a/tests/trainer/logging/test_logger_connector.py +++ b/tests/trainer/logging/test_logger_connector.py @@ -240,6 +240,14 @@ def test_step(self, batch, batch_idx, dl_idx=0): self.log("test_loss", loss, on_step=True, on_epoch=True) return {"test_loss": loss} + def on_test_batch_end(self, *args, **kwargs): + # save objects as it will be reset at the end of epoch. + self.batch_results = deepcopy(self.trainer.logger_connector.cached_results) + + def on_test_epoch_end(self): + # save objects as it will be reset at the end of epoch. + self.reduce_results = deepcopy(self.trainer.logger_connector.cached_results) + def test_dataloader(self): return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] @@ -260,7 +268,7 @@ def test_dataloader(self): ) trainer.test(model) - test_results = trainer.logger_connector._cached_results["test"] + test_results = model.batch_results generated = test_results(fx_name="test_step") assert len(generated) == num_dataloaders @@ -269,7 +277,7 @@ def test_dataloader(self): generated = len(test_results(fx_name="test_step", dl_idx=str(dl_idx))) assert generated == limit_test_batches - test_results.has_batch_loop_finished = True + test_results = model.reduce_results for dl_idx in range(num_dataloaders): expected = torch.stack(model.test_losses[str(dl_idx)]).mean() diff --git a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py index 0f3217b3f004cb..12f53328ec98ad 100644 --- a/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_eval_loop_logging_1_0.py @@ -20,6 +20,9 @@ from tests.base.deterministic_model import DeterministicModel from tests.base import SimpleModule, BoringModel, RandomDataset import os +import numpy as np +import itertools +import collections import torch import pytest @@ -299,20 +302,16 @@ def validation_epoch_end(self, outputs) -> None: # make sure correct values were logged logged_val = trainer.dev_debugger.logged_metrics - # sanity check - assert logged_val[0]['global_step'] == 0 - assert logged_val[1]['global_step'] == 0 - # 3 val batches - assert logged_val[2]['val_loss_step/epoch_0'] == model.seen_vals[0] - assert logged_val[3]['val_loss_step/epoch_0'] == model.seen_vals[1] - assert logged_val[4]['val_loss_step/epoch_0'] == model.seen_vals[2] + assert logged_val[0]['val_loss_step/epoch_0'] == model.seen_vals[0] + assert logged_val[1]['val_loss_step/epoch_0'] == model.seen_vals[1] + assert logged_val[2]['val_loss_step/epoch_0'] == model.seen_vals[2] # epoch mean - assert logged_val[5]['val_loss_epoch'] == model.manual_epoch_end_mean + assert logged_val[3]['val_loss_epoch'] == model.manual_epoch_end_mean # only those logged - assert len(logged_val) == 6 + assert len(logged_val) == 4 @pytest.mark.parametrize(['batches', 'log_interval', 'max_epochs'], [(1, 1, 1), (64, 32, 2)]) @@ -320,7 +319,7 @@ def test_eval_epoch_only_logging(tmpdir, batches, log_interval, max_epochs): """ Tests that only test_epoch_end can be used to log, and we return them in the results. """ - os.environ['PL_DEV_DEBUG'] = '1' + os.environ['PL_DEV_DEBUG'] = '0' class TestModel(BoringModel): def test_epoch_end(self, outputs): @@ -386,7 +385,6 @@ def test_dataloader(self): weights_summary=None, ) results = trainer.test(model) - assert len(results[0]) == len(results[1]) assert "test_loss_epoch/dataloader_idx_0" in results[0] assert "test_loss_epoch/dataloader_idx_1" in results[1] @@ -419,3 +417,366 @@ def test_dataloader(self): assert len(results) == 1 # error : It is wrong there. `y` should equal test_loss_epoch assert results[0]['test_loss'] == results[0]['y'] + + +def test_log_works_in_val_callback(tmpdir): + """ + Tests that log can be called within callback + """ + os.environ['PL_DEV_DEBUG'] = '1' + + class TestCallback(callbacks.Callback): + + # helpers + count = 1 + choices = [False, True] + # used to compute expected values + callback_funcs_called = collections.defaultdict(list) + funcs_called_count = collections.defaultdict(int) + funcs_attr = {} + + def make_logging(self, pl_module, func_name, + func_idx, on_steps=[], on_epochs=[], prob_bars=[]): + self.funcs_called_count[func_name] += 1 + product = [on_steps, on_epochs, prob_bars] + for idx, (on_step, on_epoch, prog_bar) in enumerate(list(itertools.product(*product))): + # run logging + custom_func_name = f"{func_idx}_{idx}_{func_name}" + pl_module.log(custom_func_name, self.count * func_idx, + on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar) + # catch information for verification + self.callback_funcs_called[func_name].append([self.count * func_idx]) + self.funcs_attr[custom_func_name] = { + "on_step": on_step, + "on_epoch": on_epoch, + "prog_bar": prog_bar, + "forked": on_step and on_epoch, + "func_name": func_name} + + if on_step and on_epoch: + self.funcs_attr[f"{custom_func_name}_step"] = { + "on_step": True, + "on_epoch": False, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + self.funcs_attr[f"{custom_func_name}_epoch"] = { + "on_step": False, + "on_epoch": True, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + def on_validation_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_validation_start', 1, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_start', 2, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_validation_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_validation_epoch_start', 3, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_batch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_batch_start', 4, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_validation_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_validation_batch_start', 5, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_batch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_batch_end', 6, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_validation_batch_end', 7, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + # used to make sure aggregation works fine. + # we should obtain func[value * c for c in range(1, max_epochs * limit_validation_batches)]) + # with func = np.mean if on_epoch else func = np.max + self.count += 1 + + def on_epoch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_end', 8, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + def on_validation_epoch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_validation_epoch_end', 9, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + class TestModel(BoringModel): + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('val_loss', loss) + + max_epochs = 1 + model = TestModel() + model.validation_epoch_end = None + test_callback = TestCallback() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=4, + limit_test_batches=0, + val_check_interval=0., + num_sanity_val_steps=0, + max_epochs=max_epochs, + callbacks=[test_callback], + ) + trainer.fit(model) + trainer.test() + + assert test_callback.funcs_called_count["on_epoch_start"] == 1 + assert test_callback.funcs_called_count["on_batch_start"] == 1 + assert test_callback.funcs_called_count["on_batch_end"] == 1 + assert test_callback.funcs_called_count["on_validation_start"] == 1 + assert test_callback.funcs_called_count["on_validation_epoch_start"] == 1 + assert test_callback.funcs_called_count["on_validation_batch_start"] == 4 + assert test_callback.funcs_called_count["on_validation_batch_end"] == 4 + assert test_callback.funcs_called_count["on_validation_epoch_end"] == 1 + assert test_callback.funcs_called_count["on_epoch_end"] == 1 + + # Make sure the func_name exists within callback_metrics. If not, we missed some + callback_metrics_keys = [*trainer.callback_metrics.keys()] + for func_name in test_callback.callback_funcs_called.keys(): + is_in = False + for callback_metrics_key in callback_metrics_keys: + if func_name in callback_metrics_key: + is_in = True + assert is_in, (func_name, callback_metrics_keys) + + # function used to describe expected return logic + def get_expected_output(func_attr, original_values): + + if func_attr["on_epoch"] and not func_attr["on_step"]: + # Apply mean on values + expected_output = np.mean(original_values) + else: + # Keep the latest value + expected_output = np.max(original_values) + return expected_output + + # Make sure the func_name output equals the average from all logged values when on_epoch true + # pop extra keys + trainer.callback_metrics.pop("debug_epoch") + trainer.callback_metrics.pop("val_loss") + for func_name, output_value in trainer.callback_metrics.items(): + # not sure how to handle this now + if "epoch_0" in func_name: + func_name = '/'.join(func_name.split('/')[:-1]) + continue + + if torch.is_tensor(output_value): + output_value = output_value.item() + # get creation attr + func_attr = test_callback.funcs_attr[func_name] + + # retrived orginal logged values + original_values = test_callback.callback_funcs_called[func_attr["func_name"]] + + # compute expected output and compare to actual one + expected_output = get_expected_output(func_attr, original_values) + assert float(output_value) == float(expected_output) + + for func_name, func_attr in test_callback.funcs_attr.items(): + if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]: + assert func_name in trainer.logger_connector.progress_bar_metrics + else: + assert func_name not in trainer.logger_connector.progress_bar_metrics + + +def test_log_works_in_test_callback(tmpdir): + """ + Tests that log can be called within callback + """ + os.environ['PL_DEV_DEBUG'] = '1' + + class TestCallback(callbacks.Callback): + + # helpers + count = 1 + choices = [False, True] + + # used to compute expected values + callback_funcs_called = collections.defaultdict(list) + funcs_called_count = collections.defaultdict(int) + funcs_attr = {} + + def make_logging(self, pl_module, func_name, + func_idx, on_steps=[], on_epochs=[], prob_bars=[]): + original_func_name = func_name[:] + self.funcs_called_count[original_func_name] += 1 + product = [on_steps, on_epochs, prob_bars] + for idx, t in enumerate(list(itertools.product(*product))): + # run logging + func_name = original_func_name[:] + on_step, on_epoch, prog_bar = t + custom_func_name = f"{func_idx}_{idx}_{func_name}" + + pl_module.log(custom_func_name, self.count * func_idx, + on_step=on_step, on_epoch=on_epoch, prog_bar=prog_bar) + + num_dl_ext = '' + if pl_module._current_dataloader_idx is not None: + dl_idx = pl_module._current_dataloader_idx + num_dl_ext = f"/dataloader_idx_{dl_idx}" + func_name += num_dl_ext + + # catch information for verification + self.callback_funcs_called[func_name].append([self.count * func_idx]) + self.funcs_attr[custom_func_name + num_dl_ext] = { + "on_step": on_step, + "on_epoch": on_epoch, + "prog_bar": prog_bar, + "forked": on_step and on_epoch, + "func_name": func_name} + if on_step and on_epoch: + self.funcs_attr[f"{custom_func_name}_step" + num_dl_ext] = { + "on_step": True, + "on_epoch": False, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + self.funcs_attr[f"{custom_func_name}_epoch" + num_dl_ext] = { + "on_step": False, + "on_epoch": True, + "prog_bar": prog_bar, + "forked": False, + "func_name": func_name} + + def on_test_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_test_start', 1, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_start', 2, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_test_epoch_start(self, trainer, pl_module): + self.make_logging(pl_module, 'on_test_epoch_start', 3, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_test_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_test_batch_start', 4, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + def on_test_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): + self.make_logging(pl_module, 'on_test_batch_end', 5, on_steps=self.choices, + on_epochs=self.choices, prob_bars=self.choices) + + # used to make sure aggregation works fine. + # we should obtain func[value * c for c in range(1, max_epochs * limit_test_batches)]) + # with func = np.mean if on_epoch else func = np.max + self.count += 1 + + def on_epoch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_epoch_end', 6, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + def on_test_epoch_end(self, trainer, pl_module): + self.make_logging(pl_module, 'on_test_epoch_end', 7, on_steps=[False], + on_epochs=self.choices, prob_bars=self.choices) + + max_epochs = 2 + num_dataloaders = 2 + + class TestModel(BoringModel): + + manual_mean = collections.defaultdict(list) + + def test_step(self, batch, batch_idx, dataloader_idx=None): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('test_loss', loss) + self.manual_mean[str(dataloader_idx)].append(loss) + + def test_dataloader(self): + return [torch.utils.data.DataLoader(RandomDataset(32, 64)) for _ in range(num_dataloaders)] + + model = TestModel() + model.test_epoch_end = None + test_callback = TestCallback() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=2, + limit_val_batches=0, + limit_test_batches=2, + val_check_interval=0., + num_sanity_val_steps=0, + max_epochs=max_epochs, + callbacks=[test_callback], + ) + trainer.fit(model) + trainer.test() + + assert test_callback.funcs_called_count["on_test_start"] == 1 + assert test_callback.funcs_called_count["on_epoch_start"] == 2 + assert test_callback.funcs_called_count["on_test_epoch_start"] == 1 + assert test_callback.funcs_called_count["on_test_batch_start"] == 4 + assert test_callback.funcs_called_count["on_test_batch_end"] == 4 + assert test_callback.funcs_called_count["on_epoch_end"] == 2 + assert test_callback.funcs_called_count["on_test_epoch_end"] == 1 + + # Make sure the func_name exists within callback_metrics. If not, we missed some + callback_metrics_keys = [*trainer.callback_metrics.keys()] + + for func_name in test_callback.callback_funcs_called.keys(): + is_in = False + for callback_metrics_key in callback_metrics_keys: + if func_name in callback_metrics_key: + is_in = True + assert is_in, (func_name, callback_metrics_keys) + + # function used to describe expected return logic + def get_expected_output(func_attr, original_values): + # Apply mean on values + if func_attr["on_epoch"] and not func_attr["on_step"]: + expected_output = np.mean(original_values) + else: + expected_output = np.max(original_values) + return expected_output + + # Make sure the func_name output equals the average from all logged values when on_epoch true + # pop extra keys + assert "debug_epoch" in trainer.callback_metrics + trainer.callback_metrics.pop("debug_epoch") + + for dl_idx in range(num_dataloaders): + key = f"test_loss/dataloader_idx_{dl_idx}" + assert key in trainer.callback_metrics + assert torch.stack(model.manual_mean[str(dl_idx)]).mean() == trainer.callback_metrics[key] + trainer.callback_metrics.pop(key) + + for func_name, output_value in trainer.callback_metrics.items(): + # not sure how to handle this now + if "epoch_1" in func_name: + func_name = '/'.join(func_name.split('/')[:-1]) + continue + + if torch.is_tensor(output_value): + output_value = output_value.item() + + # get func attr + func_attr = test_callback.funcs_attr[func_name] + + # retrived orginal logged values + original_values = test_callback.callback_funcs_called[func_attr["func_name"]] + + # compute expected output and compare to actual one + expected_output = get_expected_output(func_attr, original_values) + assert float(output_value) == float(expected_output) + + for func_name, func_attr in test_callback.funcs_attr.items(): + if func_attr["prog_bar"] and (func_attr["on_step"] or func_attr["on_epoch"]) and not func_attr["forked"]: + assert func_name in trainer.logger_connector.progress_bar_metrics + else: + assert func_name not in trainer.logger_connector.progress_bar_metrics From 33470ba605201644254138b4d5288e86ae38ac14 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Wed, 11 Nov 2020 22:04:05 +0000 Subject: [PATCH 04/11] Prevent crash if sync_dist=True on CPU (#4626) * Added test/fix for sync_dist raising NotImplementedError * Fixed comments/formatting * Revert base class change, enforce sync tensors across accelerators, added GPU test --- .../accelerators/cpu_accelerator.py | 10 ++- .../accelerators/gpu_accelerator.py | 9 ++- .../accelerators/tpu_accelerator.py | 10 ++- .../test_train_loop_logging_1_0.py | 66 +++++++++++++++++++ 4 files changed, 91 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerators/cpu_accelerator.py b/pytorch_lightning/accelerators/cpu_accelerator.py index 083b5193ff8f30..66f9e4f0201b27 100644 --- a/pytorch_lightning/accelerators/cpu_accelerator.py +++ b/pytorch_lightning/accelerators/cpu_accelerator.py @@ -11,9 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional, Union, Any + import torch -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.utilities import AMPType, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -80,3 +82,9 @@ def test_step(self, args): else: output = self.trainer.model.test_step(*args) return output + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return tensor diff --git a/pytorch_lightning/accelerators/gpu_accelerator.py b/pytorch_lightning/accelerators/gpu_accelerator.py index e66f5bcb8b48c1..1a52c4037c8d32 100644 --- a/pytorch_lightning/accelerators/gpu_accelerator.py +++ b/pytorch_lightning/accelerators/gpu_accelerator.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Union, Optional, Any import torch -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.utilities import AMPType from pytorch_lightning.distributed.dist import LightningDistributed @@ -120,3 +121,9 @@ def to_device(self, batch): # be referenced from and if there are multiple optimizers the batch will # wind up copying it to the same device repeatedly. return self.batch_to_device(batch, gpu_id) + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return tensor diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 5f4e6cc22cacd9..15386b133f8bd9 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -14,13 +14,13 @@ import io import os import re -from typing import Optional +from typing import Optional, Union, Any import torch import torch.multiprocessing as mp from pytorch_lightning import _logger as log -from pytorch_lightning.accelerators.accelerator import Accelerator +from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save @@ -337,3 +337,9 @@ def broadcast(self, obj, src=0): buffer = io.BytesIO(data.cpu().byte().numpy()) obj = torch.load(buffer) return obj + + def sync_tensor(self, + tensor: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: + return tensor diff --git a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py index 60ff33b402e4b3..cd8afd268cba81 100644 --- a/tests/trainer/logging_tests/test_train_loop_logging_1_0.py +++ b/tests/trainer/logging_tests/test_train_loop_logging_1_0.py @@ -682,3 +682,69 @@ def get_expected_output(func_attr, original_values): assert func_name in trainer.logger_connector.progress_bar_metrics else: assert func_name not in trainer.logger_connector.progress_bar_metrics + + +def test_logging_sync_dist_true_cpu(tmpdir): + """ + Tests to ensure that the sync_dist flag works with CPU (should just return the original value) + """ + fake_result = 1 + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + return acc + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + return {"x": loss} + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + weights_summary=None, + ) + trainer.fit(model) + + assert trainer.logged_metrics['foo'] == fake_result + assert trainer.logged_metrics['bar'] == fake_result + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_logging_sync_dist_true_gpu(tmpdir): + """ + Tests to ensure that the sync_dist flag works with GPU (should just return the original value) + """ + fake_result = 1 + + class TestModel(BoringModel): + def training_step(self, batch, batch_idx): + acc = self.step(batch[0]) + self.log('foo', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + return acc + + def validation_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + self.log('bar', torch.tensor(fake_result), on_step=False, on_epoch=True, sync_dist=True, sync_dist_op='sum') + return {"x": loss} + + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=2, + gpus=1, + weights_summary=None, + ) + trainer.fit(model) + + assert trainer.logged_metrics['foo'] == fake_result + assert trainer.logged_metrics['bar'] == fake_result From bff99ee159c207e1e388966b3b0440f6779b90a8 Mon Sep 17 00:00:00 2001 From: Marc Ferradou Date: Thu, 12 Nov 2020 02:59:33 -0500 Subject: [PATCH 05/11] Small typo correction on CONTRIBUTING.md (#4625) * Update CONTRIBUTING.md Small typo correction. * Update .github/CONTRIBUTING.md Co-authored-by: Rohit Gupta Co-authored-by: Rohit Gupta Co-authored-by: Roger Shieh --- .github/CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index bed729e02baeb1..868507df405205 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -112,8 +112,8 @@ In case you adding new dependencies, make sure that they are compatible with the ### Coding Style -1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!`, name); -2. Black code formatter is used using `pre-commit` hook. +1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!", name)`. +2. Black code formatter is used using a `pre-commit` hook. ### Documentation From 396a18eb78568c8676947de25e79525ef9a56512 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 12 Nov 2020 09:21:57 +0100 Subject: [PATCH 06/11] update changelog after 1.0.6 (#4624) * update changelog after 1.0.6 * fix formatting --- CHANGELOG.md | 61 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7da839b74e3432..fbf7a074a0ed53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [unreleased] - YYYY-MM-DD +## [unreleased.Features] - YYYY-MM-DD ### Added @@ -27,26 +27,39 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added option to log momentum ([#4384](https://github.com/PyTorchLightning/pytorch-lightning/pull/4384)) -- Added `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) +- Added logging using `self.log` in train and evaluation for most callbacks and model hooks ( + [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552), + [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495), + [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) -- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) +### Changed +- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903)) -- Added logging using `self.log` in train and evaluation for most callbacks and model hooks ( - [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552), - [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495), - [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439) -) -- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) +### Deprecated + + + +### Removed + + + +### Fixed + + + + +## [unreleased.BugFix] - YYYY-MM-DD + +### Added -- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) ### Changed -- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903)) + ### Deprecated @@ -58,13 +71,35 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed feature-lack in hpc load ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526)) -- Fixed metrics states being overridden in ddp mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) +## [1.0.6] - 2020-11-11 +### Added + +- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) +- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) +- Added congratulations at the end of our notebooks ([#4555](https://github.com/PyTorchLightning/pytorch-lightning/pull/4555)) + +### Changed + +- Changed `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) +- Unify SLURM/TorchElastic under backend plugin ([#4578](https://github.com/PyTorchLightning/pytorch-lightning/pull/4578), + [#4580](https://github.com/PyTorchLightning/pytorch-lightning/pull/4580), + [#4581](https://github.com/PyTorchLightning/pytorch-lightning/pull/4581), + [#4582](https://github.com/PyTorchLightning/pytorch-lightning/pull/4582), + [#4583](https://github.com/PyTorchLightning/pytorch-lightning/pull/4583)) + +### Fixed +- Fixed feature-lack in `hpc_load` ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526)) +- Fixed metrics states being overridden in DDP mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) - Fixed `lightning_getattr`, `lightning_hasattr` not finding the correct attributes in datamodule ([#4347](https://github.com/PyTorchLightning/pytorch-lightning/pull/4347)) +- Fixed automatic optimization AMP by `manual_optimization_step` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) +- Replace `MisconfigurationException` with warning in `ModelCheckpoint` Callback ([#4560](https://github.com/PyTorchLightning/pytorch-lightning/pull/4560)) +- Fixed logged keys in mlflow logger ([#4412](https://github.com/PyTorchLightning/pytorch-lightning/pull/4412)) +- Fixed `is_picklable` by catching `AttributeError` ([#4508](https://github.com/PyTorchLightning/pytorch-lightning/pull/4508)) ## [1.0.5] - 2020-11-03 From 79fc92647c91b6ac222c0709bf168a6a8d304529 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Thu, 12 Nov 2020 15:55:31 +0630 Subject: [PATCH 07/11] [make] Create Makefile (#4620) * [make] Create Makefile * exclude makefile * contributing info * rm .run_local_test.sh --- .github/CONTRIBUTING.md | 4 ++-- .run_local_tests.sh | 14 -------------- MANIFEST.in | 3 +++ Makefile | 15 +++++++++++++++ 4 files changed, 20 insertions(+), 16 deletions(-) delete mode 100644 .run_local_tests.sh create mode 100644 Makefile diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 868507df405205..c37890e9bfda2d 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -182,10 +182,10 @@ python -m pip install ".[dev, examples]" python -m pip install pre-commit ``` -You can run the full test-case in your terminal via this bash script: +You can run the full test-case in your terminal via this make script: ```bash -bash .run_local_tests.sh +make test ``` Note: if your computer does not have multi-GPU nor TPU these tests are skipped. diff --git a/.run_local_tests.sh b/.run_local_tests.sh deleted file mode 100644 index 0b4723828888ef..00000000000000 --- a/.run_local_tests.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -# install APEX, see https://github.com/NVIDIA/apex#linux -# to imitate SLURM set only single node -export SLURM_LOCALID=0 - -# use this to run tests -rm -rf _ckpt_* -rm -rf ./lightning_logs -python -m coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --flake8 -python -m coverage report -m - -# specific file -# python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k diff --git a/MANIFEST.in b/MANIFEST.in index ac8ba7cc625aa7..4cedbd0f44e9e0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -57,6 +57,9 @@ exclude *.yaml # Exclude pyright config exclude .pyrightconfig.json +# Exclude Makefile +exclude Makefile + prune .git prune .github prune .circleci diff --git a/Makefile b/Makefile new file mode 100644 index 00000000000000..76e8bac4e3748a --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +.PHONY: test + +test: + # install APEX, see https://github.com/NVIDIA/apex#linux + # to imitate SLURM set only single node + export SLURM_LOCALID=0 + + # use this to run tests + rm -rf _ckpt_* + rm -rf ./lightning_logs + python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests pl_examples -v --flake8 + python -m coverage report -m + + # specific file + # python -m coverage run --source pytorch_lightning -m py.test --flake8 --durations=0 -v -k From 35f00df176f1e345e49924f55e019cb33403aa0b Mon Sep 17 00:00:00 2001 From: chaton Date: Thu, 12 Nov 2020 11:48:54 +0000 Subject: [PATCH 08/11] [FEAT] Add pytest section to Contribution how to ? (#4633) * update contributing * formatting --- .github/CONTRIBUTING.md | 51 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index c37890e9bfda2d..b202af9068280d 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -276,6 +276,57 @@ We welcome any useful contribution! For your convenience here's a recommended wo git push -f ``` +4. **How to add new tests** + +We are using [pytest](https://docs.pytest.org/en/stable/) in Pytorch Lightning. + +Here are tutorials: +* (recommended) [Visual Testing with pytest](https://www.youtube.com/playlist?list=PLCTHcU1KoD99Rim2tzg-IhYY2iu9FFvNo) from JetBrains on YouTube +* [Effective Python Testing With Pytest](https://realpython.com/pytest-python-testing/) article on realpython.com + +Here is the process to create a new test + +* 0. Optional: Follow tutorials ! +* 1. Find a file in tests/ which match what you want to test. If none, create one. +* 2. Use this template to get started ! +* 3. Use `BoringModel and derivates to test out your code`. + +```python +# TEST SHOULD BE IN YOUR FILE: tests/..../...py + +# RUN OUR TEST WITH: pytest tests/..../...py::test_explain_what_is_being_tested --verbose --color=yes --capture=no + +# TEST CODE TEMPLATE + +# pytest decorator +# @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +def test_explain_what_is_being_tested(tmpdir): + """ + Test description about text reason to be + """ + + # os.environ["PL_DEV_DEBUG"] = '1' optional. When activated, you can use internal trainer.dev_debugger + + class ExtendedModel(BoringModel): + ... + + model = ExtendedModel() + + # BoringModel is a functional model. You might want to set methods to None to test your behaviour + # Example: model.training_step_end = None + + trainer = Trainer( + default_root_dir=tmpdir, # will save everything within a tmpdir generated for this test + ... + ) + trainer.fit(model) + result = trainer.test() + + # assert the behaviour is correct. + assert ... + assert ... +``` + ### Bonus Workflow Tip If you don't want to remember all the commands above every time you want to push some code/setup a Lightning Dev environment on a new VM, you can set up bash aliases for some common commands. You can add these to one of your `~/.bashrc`, `~/.zshrc`, or `~/.bash_aliases` files. From bd6c413829a8bfedfa1c4237d77687ad7a6d4dba Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 12 Nov 2020 15:03:43 +0100 Subject: [PATCH 09/11] Conda: PT 1.8 (#3833) * PT 1.8 * unfreeze PT * drop nightly from full * add PT 1.8 to workflow * readme table * cuda * skip cuda * test 1.8 * unfreeze torch vision Co-authored-by: ydcjeff Co-authored-by: chaton Co-authored-by: Nicki Skafte --- .github/workflows/ci_dockers.yml | 8 +++++--- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/nightly.yml | 6 ++++-- README.md | 16 ++++++++-------- dockers/base-conda/Dockerfile | 8 ++++---- dockers/base-cuda/Dockerfile | 7 ++++++- requirements.txt | 2 +- requirements/examples.txt | 2 +- 8 files changed, 30 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index ef410a9afd9a85..0a7f326dfce8e3 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -66,6 +66,9 @@ jobs: fail-fast: false matrix: include: + # todo: see notes in Dockerfile + #- python_version: 3.7 + # pytorch_version: 1.8 - python_version: 3.8 pytorch_version: 1.7 - python_version: 3.7 @@ -110,9 +113,8 @@ jobs: pytorch_version: 1.4 - python_version: 3.7 pytorch_version: 1.7 - # TODO - # - python_version: 3.7 - # pytorch_version: 1.8 + - python_version: 3.7 + pytorch_version: 1.8 steps: - name: Checkout uses: actions/checkout@v2 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index c86806785880bd..f5e5e6470e5508 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -16,7 +16,7 @@ jobs: matrix: # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] + pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7, 1.8] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 1395b7ede4b1df..7d4490f435ee85 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -70,13 +70,13 @@ jobs: tags: pytorchlightning/pytorch_lightning:base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }} timeout-minutes: 55 - docker-cuda: + docker-cuda-conda: runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7, 1.8] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 @@ -104,6 +104,8 @@ jobs: id: extend - name: Publish CUDA to Docker Hub + # ToDo: extend also building for Nightly from pip + if: matrix.pytorch_version < 1.8 # publish master/release uses: docker/build-push-action@v2 with: diff --git a/README.md b/README.md index 30079df9317590..e7ecfc1a45126c 100644 --- a/README.md +++ b/README.md @@ -89,14 +89,14 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## Continuous Integration
-| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | -| :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | -| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | -| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | -| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | -| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | 1.8 (nightly) | +| :---: | :---: | :---: | :---: | :---: | :---: | :---: | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | +| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | - | +| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | +| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | +| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_ - _\** tests run on two NVIDIA K80_ diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index ea8c6bc5d001da..09c675de3068e2 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -99,15 +99,15 @@ COPY ./requirements/test.txt requirements-test.txt RUN \ # Disable cache pip config set global.cache-dir false && \ - #echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ - #source ~/.bashrc && \ # Install remaining requirements pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \ pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \ - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \ rm requirements* +RUN \ + # install DALI, needed for examples + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 + RUN \ # install NVIDIA AMP git clone https://github.com/NVIDIA/apex && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index f886ccc30be7a9..2557e855cf3eed 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -93,10 +93,15 @@ RUN \ python -c "import re ; fname = 'requirements.txt' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch==${PYTORCH_VERSION}.*', open(fname).read()) ; open(fname, 'w').write(req)" && \ # Install all requirements + # todo: find a way how to install nightly PT version + # --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu${cuda_ver[0]}${cuda_ver[1]}/torch_nightly.html pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \ - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 && \ rm -rf requirements* +RUN \ + # install DALI, needed for examples + pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda${CUDA_VERSION%%.*}0 + RUN \ # install NVIDIA AMP git clone https://github.com/NVIDIA/apex && \ diff --git a/requirements.txt b/requirements.txt index d270e2bc5d8543..0f8423e0860f02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.16.4 -torch>=1.3,<1.8 +torch>=1.3 future>=0.17.1 # required for builtins in setup.py # pyyaml>=3.13 PyYAML>=5.1 # OmegaConf requirement >=5.1 diff --git a/requirements/examples.txt b/requirements/examples.txt index 0afa62f9ffa95e..6e48778cb222a0 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -1,2 +1,2 @@ -torchvision>=0.4.1,<0.9.0 +torchvision>=0.4.1 gym>=0.17.0 From 4a01fd048cebb65405021d3f24ae4dc07cb735e6 Mon Sep 17 00:00:00 2001 From: chaton Date: Thu, 12 Nov 2020 15:59:01 +0000 Subject: [PATCH 10/11] [FIX] Average Pbar Metrics (#4534) * wip * update * normalize loss * update test * resolve bug * update test and add TODO * make sure it can be sync * add TODO * update sol --- pytorch_lightning/core/step_result.py | 3 ++ .../test_trainer_steps_scalar_return.py | 50 ++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 8f8a517d544f01..12f1b57f836f27 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -134,6 +134,9 @@ def log( # sync across workers when using distributed training sync_fn = sync_fn or sync_ddp_if_available if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): + is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized() + # TODO: Find a way to make the reduction only once, so we don't need to clone. + value = value.clone() if is_dist_initialized else value value = sync_fn(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: diff --git a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py index 2a66f743a49ef8..b85646e1c290f0 100644 --- a/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py +++ b/tests/trainer/legacy_deprecate_flow_log_tests/test_trainer_steps_scalar_return.py @@ -14,11 +14,13 @@ """ Tests to ensure that the training loop works with a scalar """ -import torch import os +import torch +import pytest from pytorch_lightning import Trainer from tests.base.deterministic_model import DeterministicModel +from tests.base import BoringModel def test_training_step_scalar(tmpdir): @@ -190,3 +192,49 @@ def test_train_step_epoch_end_scalar(tmpdir): opt_closure_result = trainer.train_loop.training_step_and_backward( batch, batch_idx, 0, trainer.optimizers[0], trainer.hiddens) assert opt_closure_result['loss'].item() == 171 + + +class DPPReduceMeanPbarModel(BoringModel): + + logged = [] + + def training_step(self, batch, batch_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + loss /= loss.clone().detach() + self.log('self_log', loss, prog_bar=True, sync_dist=True) + return {"loss": loss, "progress_bar":{"loss_2": loss}} + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +def test_dpp_reduce_mean_pbar(tmpdir): + os.environ['PL_DEV_DEBUG'] = '1' + + model = DPPReduceMeanPbarModel() + model.training_step_end = None + model.training_epoch_end = None + + distributed_backend = "ddp_spawn" + + trainer = Trainer( + max_epochs=1, + default_root_dir=os.getcwd(), + limit_train_batches=10, + limit_test_batches=2, + limit_val_batches=2, + distributed_backend=distributed_backend, + gpus=2, + precision=32) + + trainer.fit(model) + + # TODO: Move this test to DDP. pbar_added_metrics is empty with ddp_spawn for some reasons + + pbar_added_metrics = trainer.dev_debugger.pbar_added_metrics + is_in = False + for pbar_metrics in pbar_added_metrics: + if 'loss_2' in pbar_metrics: + is_in = True + assert pbar_metrics["loss_2"].item() == 1 + if distributed_backend == "ddp": + assert is_in is True From bacabaebaf16b0492cf9090b75238215c2c19de5 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Thu, 12 Nov 2020 17:18:09 +0000 Subject: [PATCH 11/11] Sharded Accelerator 1/n: Expose clip gradients to plugins via abstract class (#4639) * Added abstract precision plugin to expose clip_gradients function, use within accelerator to clip gradients * Exclude model from override, keep optimizer (needed for sharded clip gradients), add override for O2 support apex * Fix doc * Applied codereview changes * Refactored clip function to encapsulate tpu changes with tpu accelerator. Default to standard clip function for vanilla torch * Pass correct grad clip val * Moved var to property * Apply code review suggestions --- pytorch_lightning/accelerators/accelerator.py | 52 ++++--------------- .../accelerators/tpu_accelerator.py | 31 +++++++++-- pytorch_lightning/plugins/apex.py | 39 +++++++++++++- pytorch_lightning/plugins/native_amp.py | 10 +++- pytorch_lightning/plugins/precision_plugin.py | 38 ++++++++++++++ 5 files changed, 120 insertions(+), 50 deletions(-) create mode 100644 pytorch_lightning/plugins/precision_plugin.py diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3b762e08ed5e6d..a0d8f6f21a2f7a 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -12,33 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. import os -import math from enum import Enum from typing import Any, Optional, Union import torch +from torch.optim import Optimizer -from pytorch_lightning.utilities import AMPType, rank_zero_warn +from pytorch_lightning.utilities import AMPType from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.parsing import AttributeDict import torch.distributed as torch_distrib from pytorch_lightning import _logger as log -try: - from apex import amp -except ImportError: - amp = None - if torch.distributed.is_available(): from torch.distributed import ReduceOp else: class ReduceOp: SUM = None -EPSILON = 1e-6 -EPSILON_FP16 = 1e-5 - class Accelerator(object): @@ -139,48 +131,22 @@ def optimizer_zero_grad(self, batch_idx, optimizer, opt_idx): model_ref.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) def clip_gradients(self, optimizer, clip_val=None): - # TODO: separate TPU case from here - self._clip_gradients(optimizer, clip_val) - - def _clip_gradients(self, optimizer, clip_val=None): # use the trainer's clip val if none passed grad_clip_val = self.trainer.gradient_clip_val if clip_val is not None: grad_clip_val = clip_val grad_clip_val = float(grad_clip_val) - # this code is a modification of torch.nn.utils.clip_grad_norm_ - # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md if grad_clip_val <= 0: return + self._clip_gradients(optimizer, grad_clip_val) - model = self.trainer.get_model() - if self.trainer.amp_backend == AMPType.APEX: - parameters = amp.master_params(optimizer) + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + if self.trainer.amp_backend: + self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) else: - parameters = model.parameters() - - max_norm = grad_clip_val - norm_type = float(2.0) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - if norm_type == math.inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - else: - device = parameters[0].device - out = torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - eps = EPSILON_FP16 if self.trainer.precision == 16 else EPSILON - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + eps) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + model = self.trainer.get_model() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) def on_train_epoch_end(self, outputs): pass @@ -201,7 +167,7 @@ def setup_optimizers(self, model): self.trainer.optimizer_frequencies = optimizer_frequencies def init_ddp_connection( - self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True + self, global_rank: int, world_size: int, is_slurm_managing_tasks: bool = True ) -> None: os.environ["MASTER_ADDR"] = str(self.cluster_environment.master_address()) os.environ["MASTER_PORT"] = str(self.cluster_environment.master_port()) diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 15386b133f8bd9..54ee57b74a16a8 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -12,12 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. import io +import math import os import re from typing import Optional, Union, Any import torch import torch.multiprocessing as mp +from torch.optim import Optimizer from pytorch_lightning import _logger as log from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp @@ -261,10 +263,27 @@ def optimizer_step(self, optimizer, batch_idx, opt_idx, lambda_closure): using_lbfgs=is_lbfgs ) - def clip_gradients(self, optimizer, clip_val=None): - # apply clip gradients - # TODO: separate TPU case from here - self._clip_gradients(optimizer, clip_val) + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + # this code is a modification of torch.nn.utils.clip_grad_norm_ + # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md + model = self.trainer.get_model() + parameters = model.parameters() + max_norm = grad_clip_val + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + self.norm_clipping_epsilon) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) def barrier(self, name: Optional[str] = None): torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") @@ -343,3 +362,7 @@ def sync_tensor(self, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None) -> torch.Tensor: return tensor + + @property + def norm_clipping_epsilon(self): + return 1e-6 diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index 0c8665e3719f35..654f7202fb9d1b 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -11,11 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +import math +from typing import List, Tuple, Union +import torch from torch.optim.optimizer import Optimizer from pytorch_lightning.core.lightning import LightningModule +from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities.distributed import rank_zero_warn from pytorch_lightning.utilities import AMPType @@ -25,7 +28,7 @@ amp = None -class ApexPlugin: +class ApexPlugin(PrecisionPlugin): def __init__(self, trainer=None): self.trainer = trainer @@ -98,3 +101,35 @@ def configure_apex(self, amp, model, optimizers, amp_level): """ model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) return model, optimizers + + def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + """ + This code is a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights. + This is important when setting amp_level to O2, and the master weights are in fp16. + Args: + grad_clip_val: Maximum norm of gradients. + optimizer: Optimizer with gradients that will be clipped. + norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + """ + model = self.trainer.get_model() + parameters = model.parameters() + max_norm = float(grad_clip_val) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + + if len(parameters) == 0: + return torch.tensor(0.) + device = parameters[0].grad.device + total_norm = torch.norm( + torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon) + if clip_coef < 1: + for p in parameters: + p.grad.detach().mul_(clip_coef.to(p.grad.device)) + + @property + def norm_clipping_epsilon(self): + return 1e-5 diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 98bc8dfc87d25a..1a6649986132cb 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Union import torch +from torch.optim import Optimizer +from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin -class NativeAMPPlugin: + +class NativeAMPPlugin(PrecisionPlugin): def __init__(self, trainer=None): """ @@ -51,3 +55,7 @@ def training_step(self, fx, args): with torch.cuda.amp.autocast(): output = fx(*args) return output + + def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + model = self.trainer.get_model() + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py new file mode 100644 index 00000000000000..0102f677391ff9 --- /dev/null +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -0,0 +1,38 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import abc +from typing import Union + +from torch.optim import Optimizer + + +class PrecisionPlugin(abc.ABC): + """ + Abstract class to extend for precision support (32/16 etc). + + This is extended to cover any specific logic required for precision support such as AMP/APEX or sharded + training. + """ + + def connect(self, model, optimizers): + raise NotImplementedError + + def training_step(self, fx, args): + raise NotImplementedError + + def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): + raise NotImplementedError + + def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + raise NotImplementedError