From a6da0286dc85a39a6b7d794772101f9c9e5a2c4a Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 23 Feb 2021 15:37:06 +0900 Subject: [PATCH 01/33] Call zero_grad inside closure --- pytorch_lightning/trainer/training_loop.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 9d10a1f67c5dc..9e20bc0379d37 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -734,6 +734,9 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, wrap the forward step in a closure so second order methods work """ with self.trainer.profiler.profile("training_step_and_backward"): + if self.automatic_optimization and isinstance(optimizer, torch.optim.LBFGS): + optimizer.zero_grad() + # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) self._curr_step_result = result From 57840f9555b9ecbf7703aef7c0e9765bf65c0b99 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 23 Feb 2021 17:43:28 +0900 Subject: [PATCH 02/33] Call zero_grad inside closure independently of optim --- pytorch_lightning/trainer/training_loop.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 9e20bc0379d37..29e023115f84b 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -733,8 +733,10 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, """ wrap the forward step in a closure so second order methods work """ + accumulate_grad_batches_enabled = self.trainer.accumulate_grad_batches > 1 + with self.trainer.profiler.profile("training_step_and_backward"): - if self.automatic_optimization and isinstance(optimizer, torch.optim.LBFGS): + if not accumulate_grad_batches_enabled: optimizer.zero_grad() # lightning module hook @@ -763,6 +765,10 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, # revert back to previous state self.trainer.lightning_module.untoggle_optimizer(opt_idx) + # when the last batch of accumulation + if accumulate_grad_batches_enabled and not self.should_accumulate(): + optimizer.zero_grad() + return result def backward(self, result, optimizer, opt_idx, *args, **kwargs): From 8d1253d3604a152d9a68051755066b5920030a49 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 24 Feb 2021 17:24:58 +0900 Subject: [PATCH 03/33] Remove optimizer.zero_grad from optimizer.step --- pytorch_lightning/core/optimizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index b6aca3b457a1f..c95890849cb28 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -136,7 +136,6 @@ def __optimizer_step(self, closure: Optional[Callable] = None, profiler_name: st if self._trainer.train_loop.automatic_optimization: trainer.train_loop.on_before_zero_grad(optimizer) - model.optimizer_zero_grad(trainer.current_epoch, trainer.batch_idx, optimizer, self._optimizer_idx) def step(self, *args, closure: Optional[Callable] = None, **kwargs): """ From eaf42fdf747946dae0e6a472cc57b9dfe6193fa3 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 24 Feb 2021 17:25:32 +0900 Subject: [PATCH 04/33] Use accelerator's zero_grad --- pytorch_lightning/trainer/training_loop.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 29e023115f84b..a28284899ba97 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -737,7 +737,7 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, with self.trainer.profiler.profile("training_step_and_backward"): if not accumulate_grad_batches_enabled: - optimizer.zero_grad() + self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) @@ -767,7 +767,7 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, # when the last batch of accumulation if accumulate_grad_batches_enabled and not self.should_accumulate(): - optimizer.zero_grad() + self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) return result From 00376fc65036c57c911a025907f9a5d3280c87e4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 24 Feb 2021 21:02:38 +0900 Subject: [PATCH 05/33] Update manual optimization docs --- docs/source/common/optimizers.rst | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 3f7cd7f224a97..bde051321d035 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -23,17 +23,16 @@ to manually manage the optimization process. To do so, do the following: * Override your LightningModule ``automatic_optimization`` property to return ``False`` * Drop or ignore the optimizer_idx argument -* Use `self.manual_backward(loss)` instead of `loss.backward()`. +* Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. -.. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with zero_grad, accumulated_grad_batches, model toggling, etc.. +.. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. -.. warning:: Before 1.2, ``optimzer.step`` was calling ``zero_grad`` internally. From 1.2, it is left to the users expertize. +.. warning:: Before 1.2, ``optimzer.step`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertize. .. tip:: To perform ``accumulate_grad_batches`` with one optimizer, you can do as such. .. tip:: ``self.optimizers()`` will return ``LightningOptimizer`` objects. You can access your own optimizer with ``optimizer.optimizer``. However, if you use your own optimizer to perform a step, Lightning won't be able to support accelerators and precision for you. - .. code-block:: python def training_step(batch, batch_idx, optimizer_idx): @@ -41,14 +40,14 @@ to manually manage the optimization process. To do so, do the following: loss = self.compute_loss(batch) self.manual_backward(loss) - opt.step() # accumulate gradient batches if batch_idx % 2 == 0: + opt.step() opt.zero_grad() -.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward`` and ``backward`` pass of your model. 
It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. +.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward`` and ``backward`` pass of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. See also `the PyTorch docs`_. Here is the same example as above using a ``closure``. @@ -71,7 +70,6 @@ Here is the same example as above using a ``closure``. .. code-block:: python # Scenario for a GAN. - def training_step(...): opt_gen, opt_dis = self.optimizers() From be4307779e67ff2e641bfef811cd9653dd1c198b Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 24 Feb 2021 22:17:56 +0900 Subject: [PATCH 06/33] Update automatic optimization docs --- docs/source/common/optimizers.rst | 61 ++++++++++++++++++------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index bde051321d035..26971aae0d12e 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -47,7 +47,7 @@ to manually manage the optimization process. To do so, do the following: opt.zero_grad() -.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward`` and ``backward`` pass of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. See also `the PyTorch docs`_. +.. tip:: It is a good practice to provide the optimizer with a ``closure`` function that performs a ``forward`` and ``backward`` pass of your model. It is optional for most optimizers, but makes your code compatible if you switch to an optimizer which requires a closure. See also `the PyTorch docs `_. Here is the same example as above using a ``closure``. @@ -135,8 +135,12 @@ Here is an example on how to use it: Automatic optimization ====================== -With Lightning most users don't have to think about when to call .backward(), .step(), .zero_grad(), since -Lightning automates that for you. +With Lightning most users don't have to think about when to call ``.zero_grad()``, ``.backward()`` and ``.step()`` +since Lightning automates that for you. + +.. warning:: + Before 1.2.1, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. + From 1.2.1, Lightning calls ``.zero_grad()`` before ``.backward()``. Under the hood Lightning does the following: @@ -145,33 +149,33 @@ Under the hood Lightning does the following: for epoch in epochs: for batch in data: loss = model.training_step(batch, batch_idx, ...) + optimizer.zero_grad() loss.backward() optimizer.step() - optimizer.zero_grad() - for scheduler in schedulers: - scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() In the case of multiple optimizers, Lightning does the following: .. code-block:: python for epoch in epochs: - for batch in data: - for opt in optimizers: - disable_grads_for_other_optimizers() - train_step(opt) - opt.step() + for batch in data: + for opt in optimizers: + loss = model.training_step(batch, batch_idx, optimizer_idx) + opt.zero_grad() + loss.backward() + opt.step() - for scheduler in schedulers: - scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() Learning rate scheduling ------------------------ -Every optimizer you use can be paired with any `LearningRateScheduler `_. 
-In the basic use-case, the scheduler (or multiple schedulers) should be returned as the second output from the ``.configure_optimizers`` -method: +Every optimizer you use can be paired with any `Learning Rate Scheduler `_. +In the basic use-case, the scheduler (or multiple schedulers) should be returned as the second output from the ``.configure_optimizers`` method: .. testcode:: @@ -260,7 +264,7 @@ returned as a dict which can contain the following keywords: Use multiple optimizers (like GANs) ----------------------------------- -To use multiple optimizers return > 1 optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers` +To use multiple optimizers return two or more optimizers from :meth:`pytorch_lightning.core.LightningModule.configure_optimizers` .. testcode:: @@ -281,13 +285,15 @@ Lightning will call each optimizer sequentially: .. code-block:: python for epoch in epochs: - for batch in data: - for opt in optimizers: - train_step(opt) - opt.step() + for batch in data: + for opt in optimizers: + loss = train_step(batch, batch_idx, optimizer_idx) + opt.zero_grad() + loss.backward() + opt.step() - for scheduler in schedulers: - scheduler.step() + for lr_scheduler in lr_schedulers: + lr_scheduler.step() ---------- @@ -298,7 +304,7 @@ override the :meth:`optimizer_step` function. For example, here step optimizer A every 2 batches and optimizer B every 4 batches -.. note:: When using Trainer(enable_pl_optimizer=True), there is no need to call `.zero_grad()`. +.. note:: ``.zero_grad()`` is called inside the default closure, so there is no need to call ``.zero_grad()`` manually. .. testcode:: @@ -332,7 +338,7 @@ Here we add a learning-rate warm up # update params optimizer.step(closure=closure) -.. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches, zero_grad, and much more ... +.. note:: The default ``optimizer_step`` is relying on the internal ``LightningOptimizer`` to properly perform a step. It handles TPUs, AMP, accumulate_grad_batches and much more ... .. testcode:: @@ -362,6 +368,11 @@ Using the closure functions for optimization When using optimization schemes such as LBFGS, the `second_order_closure` needs to be enabled. By default, this function is defined by wrapping the `training_step` and the backward steps as follows +.. warning:: + Before 1.2.1, ``.zero_grad()`` was called outside the closure internally. + From 1.2.1, the closure calls ``.zero_grad()`` inside, so there is no need to define your own closure + when using similar optimizers to :class:`torch.optim.LBFGS` which requires reevaluation of the loss with the closure in ``optimizer.step()``. + .. testcode:: def second_order_closure(pl_module, split_batch, batch_idx, opt_idx, optimizer, hidden): From 3f6e0861f51ad1e462e080516ab231b9ddf855ea Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 24 Feb 2021 23:32:52 +0900 Subject: [PATCH 07/33] Update new-project docs --- docs/source/starter/new-project.rst | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/source/starter/new-project.rst b/docs/source/starter/new-project.rst index 0a07e9903b348..156db1e060023 100644 --- a/docs/source/starter/new-project.rst +++ b/docs/source/starter/new-project.rst @@ -248,7 +248,7 @@ as long as you return a loss with an attached graph from the `training_step`, Li .. 
code-block:: python def training_step(self, batch, batch_idx): - loss = self.encoder(batch[0]) + loss = self.encoder(batch) return loss .. _manual_opt: @@ -267,19 +267,18 @@ Turn off automatic optimization and you control the train loop! def training_step(self, batch, batch_idx, optimizer_idx): # access your optimizers with use_pl_optimizer=False. Default is True - (opt_a, opt_b, opt_c) = self.optimizers(use_pl_optimizer=True) + opt_a, opt_b = self.optimizers(use_pl_optimizer=True) - loss_a = self.generator(batch[0]) - - # use this instead of loss.backward so we can automate half precision, etc... - self.manual_backward(loss_a, opt_a, retain_graph=True) - self.manual_backward(loss_a, opt_a) - opt_a.step() + loss_a = self.generator(batch) opt_a.zero_grad() + # use `manual_backward()` instead of `loss.backward` to automate half precision, etc... + self.manual_backward(loss_a) + opt_a.step() - loss_b = self.discriminator(batch[0]) - self.manual_backward(loss_b, opt_b) - ... + loss_b = self.discriminator(batch) + opt_b.zero_grad() + self.manual_backward(loss_b) + opt_b.step() Predict or Deploy From c422ffa4563fb8f918bce9639de4dc8ff2ee1c4e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 12:13:19 +0900 Subject: [PATCH 08/33] Move on_before_zero_grad to trainloop --- pytorch_lightning/core/optimizer.py | 3 --- pytorch_lightning/trainer/training_loop.py | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index c95890849cb28..53f7cb23804aa 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -134,9 +134,6 @@ def __optimizer_step(self, closure: Optional[Callable] = None, profiler_name: st with trainer.profiler.profile(profiler_name): trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs) - if self._trainer.train_loop.automatic_optimization: - trainer.train_loop.on_before_zero_grad(optimizer) - def step(self, *args, closure: Optional[Callable] = None, **kwargs): """ Call this directly from your training_step when doing optimizations manually. 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index a28284899ba97..fd73483ebb1e6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -737,6 +737,7 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, with self.trainer.profiler.profile("training_step_and_backward"): if not accumulate_grad_batches_enabled: + self.trainer.train_loop.on_before_zero_grad(optimizer) self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) # lightning module hook @@ -744,6 +745,7 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, self._curr_step_result = result if not self._skip_backward and self.trainer.train_loop.automatic_optimization: + # backward pass if result is not None: with self.trainer.profiler.profile("model_backward"): From 95e0a0bb0fe1a96c06d16cb0773bf9b35a298dfc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 12:16:11 +0900 Subject: [PATCH 09/33] Use trainerloop methods --- pytorch_lightning/trainer/training_loop.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index fd73483ebb1e6..fbbb0f209cdcb 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -736,15 +736,14 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, accumulate_grad_batches_enabled = self.trainer.accumulate_grad_batches > 1 with self.trainer.profiler.profile("training_step_and_backward"): - if not accumulate_grad_batches_enabled: - self.trainer.train_loop.on_before_zero_grad(optimizer) - self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) - # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) self._curr_step_result = result if not self._skip_backward and self.trainer.train_loop.automatic_optimization: + if not accumulate_grad_batches_enabled: + self.on_before_zero_grad(optimizer) + self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) # backward pass if result is not None: @@ -767,9 +766,9 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, # revert back to previous state self.trainer.lightning_module.untoggle_optimizer(opt_idx) - # when the last batch of accumulation - if accumulate_grad_batches_enabled and not self.should_accumulate(): - self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + # when the last batch of accumulation + if accumulate_grad_batches_enabled and not self.should_accumulate(): + self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) return result From 8243b80f4069f5b7357638e6d4c786b7adcc17c6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 13:15:45 +0900 Subject: [PATCH 10/33] Remove zero_grad after backward --- pytorch_lightning/trainer/training_loop.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index e807b4668e3f3..417a109a27be6 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -768,10 +768,6 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, # revert back to previous state 
self.trainer.lightning_module.untoggle_optimizer(opt_idx) - # when the last batch of accumulation - if accumulate_grad_batches_enabled and not self.should_accumulate(): - self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) - return result def backward(self, result, optimizer, opt_idx, *args, **kwargs): From 8269f454a349e44afca8e8160fa805332f1f3cbc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 13:48:27 +0900 Subject: [PATCH 11/33] Split tests to step and zero_grad --- tests/core/test_lightning_module.py | 65 ++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 17449b96d7cab..6984e3834db2b 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -95,12 +95,10 @@ def optimizer_step(self, *_, **__): trainer.fit(model) -def test_automatic_optimization_num_calls(tmpdir): +def test_automatic_optimization_num_step_calls(tmpdir): with patch("torch.optim.SGD.step") as sgd_step, \ - patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ - patch("torch.optim.Adam.step") as adam_step, \ - patch("torch.optim.Adam.zero_grad") as adam_zero_grad: + patch("torch.optim.Adam.step") as adam_step: class TestModel(BoringModel): @@ -154,8 +152,65 @@ def optimizer_step( trainer.fit(model) assert sgd_step.call_count == 4 - assert sgd_zero_grad.call_count == 4 assert adam_step.call_count == 2 + + +def test_automatic_optimization_num_zero_grad_calls(tmpdir): + + with patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ + patch("torch.optim.Adam.zero_grad") as adam_zero_grad: + + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx, optimizer_idx): + output = self.layer(batch) + loss = self.loss(batch, output) + return {"loss": loss} + + def configure_optimizers(self): + optimizer = SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = Adam(self.layer.parameters(), lr=0.1) + return [optimizer, optimizer_2] + + def optimizer_step( + self, + epoch, + batch_idx, + optimizer, + optimizer_idx, + optimizer_closure, + on_tpu, + using_native_amp, + using_lbfgs, + ): + assert optimizer_closure.__name__ == "train_step_and_backward_closure" + + # update generator opt every 2 steps + if optimizer_idx == 0: + if batch_idx % 2 == 0: + assert isinstance(optimizer, SGD) + optimizer.step(closure=optimizer_closure) + + # update discriminator opt every 4 steps + if optimizer_idx == 1: + if batch_idx % 4 == 0: + assert isinstance(optimizer, Adam) + optimizer.step(closure=optimizer_closure) + + model = TestModel() + model.training_epoch_end = None + + trainer = Trainer( + max_epochs=1, + default_root_dir=tmpdir, + limit_train_batches=8, + limit_val_batches=1, + accumulate_grad_batches=1, + ) + + trainer.fit(model) + + assert sgd_zero_grad.call_count == 4 assert adam_zero_grad.call_count == 2 From 4694e3ea01d624d95f0574722c5e02e792b174cc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 18:33:21 +0900 Subject: [PATCH 12/33] Call zero_grad before backward in tests --- tests/models/test_hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py index 416a858537245..62a252eaa3128 100644 --- a/tests/models/test_hooks.py +++ b/tests/models/test_hooks.py @@ -446,12 +446,12 @@ def teardown(self, stage: str): 'on_epoch_start', 'on_train_epoch_start', 'on_train_batch_start', - 'on_after_backward', 'on_before_zero_grad', + 'on_after_backward', 
'on_train_batch_end', 'on_train_batch_start', - 'on_after_backward', 'on_before_zero_grad', + 'on_after_backward', 'on_train_batch_end', 'on_train_epoch_end', 'on_epoch_end', From d658920933e8b135e08e64bec4131957ebbd8436 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 18:56:44 +0900 Subject: [PATCH 13/33] Call zero_grad before backward in tests --- tests/callbacks/test_callbacks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py index 379bc79263a6e..845048e602c9d 100644 --- a/tests/callbacks/test_callbacks.py +++ b/tests/callbacks/test_callbacks.py @@ -73,20 +73,20 @@ def test_trainer_callback_system(torch_save, tmpdir): call.on_train_epoch_start(trainer, model), call.on_batch_start(trainer, model), call.on_train_batch_start(trainer, model, ANY, 0, 0), - call.on_after_backward(trainer, model), call.on_before_zero_grad(trainer, model, trainer.optimizers[0]), + call.on_after_backward(trainer, model), call.on_train_batch_end(trainer, model, ANY, ANY, 0, 0), call.on_batch_end(trainer, model), call.on_batch_start(trainer, model), call.on_train_batch_start(trainer, model, ANY, 1, 0), - call.on_after_backward(trainer, model), call.on_before_zero_grad(trainer, model, trainer.optimizers[0]), + call.on_after_backward(trainer, model), call.on_train_batch_end(trainer, model, ANY, ANY, 1, 0), call.on_batch_end(trainer, model), call.on_batch_start(trainer, model), call.on_train_batch_start(trainer, model, ANY, 2, 0), - call.on_after_backward(trainer, model), call.on_before_zero_grad(trainer, model, trainer.optimizers[0]), + call.on_after_backward(trainer, model), call.on_train_batch_end(trainer, model, ANY, ANY, 2, 0), call.on_batch_end(trainer, model), call.on_train_epoch_end(trainer, model, ANY), From 2dd615488ffbd676b172f3c83e712980dff52ffc Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 21:34:02 +0900 Subject: [PATCH 14/33] Add a test for optimization with lbfgs --- tests/core/test_lightning_optimizer.py | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index fab6ceccbfd88..1a16c1a1be0d6 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -18,7 +18,8 @@ from pytorch_lightning import Trainer from pytorch_lightning.core.optimizer import LightningOptimizer -from tests.helpers.boring_model import BoringModel +from tests.base import EvalModelTemplate + def test_lightning_optimizer(tmpdir): @@ -327,3 +328,27 @@ def configure_optimizers(self): assert adam_zero_grad.call_count == 4 assert sgd_zero_grad.call_count == 10 + + +def test_lightning_optimizer_automatic_optimization_lbfgs_zero_grad(tmpdir): + """ + Test zero_grad is called the same number of times as LBFGS requires + for reevaluation of the loss in automatic_optimization. 
+ """ + + with patch("torch.optim.LBFGS.zero_grad") as zero_grad: + + model = EvalModelTemplate() + model.configure_optimizers = model.configure_optimizers__lbfgs + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=1, + weights_summary=None, + ) + trainer.fit(model) + + lbfgs = model.optimizers() + max_iter = lbfgs.param_groups[0]["max_iter"] + assert zero_grad.call_count == max_iter From 2fe4d28e3d61809450f8ae9b802a6559d1f7f892 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 21:38:03 +0900 Subject: [PATCH 15/33] Remove unused model --- pytorch_lightning/core/optimizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index 53f7cb23804aa..8b6548f438756 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -129,7 +129,6 @@ def toggle_model(self, sync_grad: bool = True): def __optimizer_step(self, closure: Optional[Callable] = None, profiler_name: str = None, **kwargs): trainer = self._trainer optimizer = self._optimizer - model = trainer.lightning_module with trainer.profiler.profile(profiler_name): trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs) From 1845a671ef4a323ec6b1fe578a180daf83b60913 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 21:38:41 +0900 Subject: [PATCH 16/33] Add back BoringModel --- tests/core/test_lightning_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index 1a16c1a1be0d6..706f876e96644 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -19,7 +19,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.core.optimizer import LightningOptimizer from tests.base import EvalModelTemplate - +from tests.helpers.boring_model import BoringModel def test_lightning_optimizer(tmpdir): From eb875b4b586c321afcfcd8c58d8bbdb1679a893c Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Thu, 25 Feb 2021 21:59:40 +0900 Subject: [PATCH 17/33] Update CHANGELOG --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 98332ee496fca..43465a8f90723 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Changed the order of `zero_grad` and `backward` calls in automatic optimization. Lightning now calls `zero_grad` before `backward` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147)) + ### Deprecated @@ -54,6 +56,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed epoch level schedulers not being called when `val_check_interval < 1.0` ([#6075](https://github.com/PyTorchLightning/pytorch-lightning/pull/6075)) +- Fixed LBFGS optimizer support which didn't converge in automatic optimization ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147)) + + ## [1.2.1] - 2021-02-23 ### Fixed From d898e7b5c2e220fbd0ada98e7ec0c37839de0670 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 26 Feb 2021 00:06:20 +0900 Subject: [PATCH 18/33] zero_grad when the first batch of accumulation --- pytorch_lightning/trainer/training_loop.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 417a109a27be6..3b9cd6544a234 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -735,15 +735,15 @@ def training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, """ wrap the forward step in a closure so second order methods work """ - accumulate_grad_batches_enabled = self.trainer.accumulate_grad_batches > 1 - with self.trainer.profiler.profile("training_step_and_backward"): # lightning module hook result = self.training_step(split_batch, batch_idx, opt_idx, hiddens) self._curr_step_result = result - if not self._skip_backward and self.trainer.train_loop.automatic_optimization: - if not accumulate_grad_batches_enabled: + if not self._skip_backward and self.automatic_optimization: + is_first_batch_to_accumulate = batch_idx % self.trainer.accumulate_grad_batches == 0 + + if is_first_batch_to_accumulate: self.on_before_zero_grad(optimizer) self.optimizer_zero_grad(batch_idx, optimizer, opt_idx) From 435a3c8b4a4e9385d41ba59f69b32623718ba99d Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 26 Feb 2021 01:24:39 +0100 Subject: [PATCH 19/33] Refactor tests. Remove duplicates --- docs/source/starter/introduction_guide.rst | 4 +- tests/core/test_lightning_module.py | 123 +---------- tests/core/test_lightning_optimizer.py | 244 ++++++++------------- tests/trainer/test_trainer.py | 152 +++---------- 4 files changed, 127 insertions(+), 396 deletions(-) diff --git a/docs/source/starter/introduction_guide.rst b/docs/source/starter/introduction_guide.rst index 9a78310d17400..2ee31304299e0 100644 --- a/docs/source/starter/introduction_guide.rst +++ b/docs/source/starter/introduction_guide.rst @@ -361,9 +361,9 @@ The training step is what happens inside the training loop. # TRAINING STEP # .... # TRAINING STEP + optimizer.zero_grad() loss.backward() optimizer.step() - optimizer.zero_grad() In the case of MNIST, we do the following @@ -377,9 +377,9 @@ In the case of MNIST, we do the following loss = F.nll_loss(logits, y) # ------ TRAINING STEP END ------ + optimizer.zero_grad() loss.backward() optimizer.step() - optimizer.zero_grad() In Lightning, everything that is in the training step gets organized under the :func:`~pytorch_lightning.core.LightningModule.training_step` function in the LightningModule. diff --git a/tests/core/test_lightning_module.py b/tests/core/test_lightning_module.py index 6984e3834db2b..8270867ae863c 100644 --- a/tests/core/test_lightning_module.py +++ b/tests/core/test_lightning_module.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from unittest.mock import Mock, patch +from unittest.mock import Mock import pytest from torch import nn @@ -74,7 +74,7 @@ def test_property_logger(tmpdir): assert model.logger == logger -def test_automatic_optimization(tmpdir): +def test_automatic_optimization_raises(tmpdir): class TestModel(BoringModel): @@ -95,125 +95,6 @@ def optimizer_step(self, *_, **__): trainer.fit(model) -def test_automatic_optimization_num_step_calls(tmpdir): - - with patch("torch.optim.SGD.step") as sgd_step, \ - patch("torch.optim.Adam.step") as adam_step: - - class TestModel(BoringModel): - - def training_step(self, batch, batch_idx, optimizer_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} - - def configure_optimizers(self): - optimizer = SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = Adam(self.layer.parameters(), lr=0.1) - return [optimizer, optimizer_2] - - def optimizer_step( - self, - epoch, - batch_idx, - optimizer, - optimizer_idx, - optimizer_closure, - on_tpu, - using_native_amp, - using_lbfgs, - ): - - assert optimizer_closure.__name__ == "train_step_and_backward_closure" - - # update generator opt every 2 steps - if optimizer_idx == 0: - if batch_idx % 2 == 0: - assert isinstance(optimizer, SGD) - optimizer.step(closure=optimizer_closure) - - # update discriminator opt every 4 steps - if optimizer_idx == 1: - if batch_idx % 4 == 0: - assert isinstance(optimizer, Adam) - optimizer.step(closure=optimizer_closure) - - model = TestModel() - model.training_epoch_end = None - - trainer = Trainer( - max_epochs=1, - default_root_dir=tmpdir, - limit_train_batches=8, - limit_val_batches=1, - accumulate_grad_batches=1, - ) - - trainer.fit(model) - - assert sgd_step.call_count == 4 - assert adam_step.call_count == 2 - - -def test_automatic_optimization_num_zero_grad_calls(tmpdir): - - with patch("torch.optim.SGD.zero_grad") as sgd_zero_grad, \ - patch("torch.optim.Adam.zero_grad") as adam_zero_grad: - - class TestModel(BoringModel): - - def training_step(self, batch, batch_idx, optimizer_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} - - def configure_optimizers(self): - optimizer = SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = Adam(self.layer.parameters(), lr=0.1) - return [optimizer, optimizer_2] - - def optimizer_step( - self, - epoch, - batch_idx, - optimizer, - optimizer_idx, - optimizer_closure, - on_tpu, - using_native_amp, - using_lbfgs, - ): - assert optimizer_closure.__name__ == "train_step_and_backward_closure" - - # update generator opt every 2 steps - if optimizer_idx == 0: - if batch_idx % 2 == 0: - assert isinstance(optimizer, SGD) - optimizer.step(closure=optimizer_closure) - - # update discriminator opt every 4 steps - if optimizer_idx == 1: - if batch_idx % 4 == 0: - assert isinstance(optimizer, Adam) - optimizer.step(closure=optimizer_closure) - - model = TestModel() - model.training_epoch_end = None - - trainer = Trainer( - max_epochs=1, - default_root_dir=tmpdir, - limit_train_batches=8, - limit_val_batches=1, - accumulate_grad_batches=1, - ) - - trainer.fit(model) - - assert sgd_zero_grad.call_count == 4 - assert adam_zero_grad.call_count == 2 - - def test_params_groups_and_state_are_accessible(tmpdir): class TestModel(BoringModel): diff --git a/tests/core/test_lightning_optimizer.py b/tests/core/test_lightning_optimizer.py index 706f876e96644..fd37f812c2337 100644 --- a/tests/core/test_lightning_optimizer.py +++ b/tests/core/test_lightning_optimizer.py @@ -11,14 +11,13 @@ # WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest.mock import patch +from unittest.mock import patch, DEFAULT import torch -from torch.optim import Adam, Optimizer +from torch.optim import Adam, Optimizer, SGD from pytorch_lightning import Trainer from pytorch_lightning.core.optimizer import LightningOptimizer -from tests.base import EvalModelTemplate from tests.helpers.boring_model import BoringModel @@ -77,11 +76,9 @@ def configure_optimizers(self): assert trainer._lightning_optimizers[0].__repr__() == expected -@patch("torch.optim.Adam.step", autospec=True) -@patch("torch.optim.SGD.step", autospec=True) -def test_lightning_optimizer_manual_optimization(mock_sgd_step, mock_adam_step, tmpdir): +def test_lightning_optimizer_manual_optimization_and_accumulated_gradients(tmpdir): """ - Test that the user can use our LightningOptimizer. Not recommended for now. + Test that the user can use our LightningOptimizer. Not recommended. """ class TestModel(BoringModel): @@ -91,29 +88,26 @@ def __init__(self): self.automatic_optimization = False def training_step(self, batch, batch_idx, optimizer_idx=None): - (opt_1, opt_2) = self.optimizers() + opt_1, opt_2 = self.optimizers() assert isinstance(opt_1, LightningOptimizer) assert isinstance(opt_2, LightningOptimizer) - output = self.layer(batch) - loss_1 = self.loss(batch, output) - self.manual_backward(loss_1) - opt_1.step() - opt_1.zero_grad() - - output = self.layer(batch) - loss_2 = self.loss(batch, output) - self.manual_backward(loss_2) + def closure(opt): + output = self.layer(batch) + loss = self.loss(batch, output) + opt.zero_grad() + self.manual_backward(loss) if batch_idx % 2 == 0: - opt_2.step() - opt_2.zero_grad() + closure(opt_1) + opt_1.step() + + closure(opt_2) + opt_2.step() def configure_optimizers(self): optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1) optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizer_1 = LightningOptimizer(optimizer_1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) return [optimizer_1, optimizer_2], [lr_scheduler] @@ -126,66 +120,18 @@ def configure_optimizers(self): limit_val_batches=1, max_epochs=1, weights_summary=None, + accumulate_grad_batches=999, # does not do anything if manual optimization ) - trainer.fit(model) - - assert len(mock_sgd_step.mock_calls) == 8 - assert len(mock_adam_step.mock_calls) == 4 - - -@patch("torch.optim.Adam.step", autospec=True) -@patch("torch.optim.SGD.step", autospec=True) -def test_lightning_optimizer_manual_optimization_and_accumulated_gradients(mock_sgd_step, mock_adam_step, tmpdir): - """ - Test that the user can use our LightningOptimizer. Not recommended. 
- """ - class TestModel(BoringModel): - - def __init__(self): - super().__init__() - self.automatic_optimization = False - - def training_step(self, batch, batch_idx, optimizer_idx=None): - (opt_1, opt_2) = self.optimizers() - assert isinstance(opt_1, LightningOptimizer) - assert isinstance(opt_2, LightningOptimizer) - - output = self.layer(batch) - loss_1 = self.loss(batch, output) - self.manual_backward(loss_1) - opt_1.step() - - def closure(): - output = self.layer(batch) - loss_2 = self.loss(batch, output) - self.manual_backward(loss_2) - - opt_2.step(closure=closure) - - def configure_optimizers(self): - optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizer_1 = LightningOptimizer(optimizer_1) - - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) - return [optimizer_1, optimizer_2], [lr_scheduler] + with patch.multiple(torch.optim.SGD, zero_grad=DEFAULT, step=DEFAULT) as sgd, \ + patch.multiple(torch.optim.Adam, zero_grad=DEFAULT, step=DEFAULT) as adam: + trainer.fit(model) - model = TestModel() - model.training_step_end = None - model.training_epoch_end = None - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=8, - limit_val_batches=1, - max_epochs=1, - weights_summary=None, - accumulate_grad_batches=2, - ) - trainer.fit(model) + assert sgd["step"].call_count == 4 + assert adam["step"].call_count == 8 - assert len(mock_sgd_step.mock_calls) == 8 - assert len(mock_adam_step.mock_calls) == 8 + assert sgd["zero_grad"].call_count == 4 + assert adam["zero_grad"].call_count == 8 def test_state(tmpdir): @@ -238,96 +184,96 @@ def test_state(tmpdir): assert optimizer.state == lightning_optimizer.state -def test_lightning_optimizer_automatic_optimization(tmpdir): +def test_lightning_optimizer_automatic_optimization_optimizer_zero_grad(tmpdir): """ - Test lightning optimize works with in automatic_optimization + Test overriding zero_grad works in automatic_optimization """ - class TestModel(BoringModel): def training_step(self, batch, batch_idx, optimizer_idx=None): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} + return super().training_step(batch, batch_idx) def training_epoch_end(self, outputs): - outputs = sum(outputs, []) - torch.stack([x["loss"] for x in outputs]).mean() + ... 
- def optimizer_step( - self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs - ): - assert optimizer_closure.__name__ == "train_step_and_backward_closure" - optimizer_closure() - if batch_idx % 2 == 0: - optimizer.step() + def optimizer_zero_grad(self, epoch, batch_idx, optimizer, optimizer_idx): + if isinstance(optimizer, SGD) and batch_idx % 2 == 0: + optimizer.zero_grad() + if isinstance(optimizer, Adam) and batch_idx % 5 == 0: optimizer.zero_grad() def configure_optimizers(self): optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1) optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizer_1 = LightningOptimizer(optimizer_1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) return [optimizer_1, optimizer_2], [lr_scheduler] model = TestModel() trainer = Trainer( default_root_dir=tmpdir, - limit_train_batches=10, + limit_train_batches=20, limit_val_batches=1, max_epochs=1, weights_summary=None, ) - trainer.fit(model) + with patch("torch.optim.Adam.zero_grad") as adam_zero_grad, \ + patch("torch.optim.SGD.zero_grad") as sgd_zero_grad: + trainer.fit(model) + + assert adam_zero_grad.call_count == 4 + assert sgd_zero_grad.call_count == 10 -def test_lightning_optimizer_automatic_optimization_optimizer_zero_grad(tmpdir): + +def test_lightning_optimizer_automatic_optimization_optimizer_step(tmpdir): """ - Test lightning optimize works with optimizer_zero_grad overrides in automatic_optimization + Test overriding step works in automatic_optimization """ - with patch("torch.optim.Adam.zero_grad") as adam_zero_grad, \ - patch("torch.optim.SGD.zero_grad") as sgd_zero_grad: + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx, optimizer_idx=None): + return super().training_step(batch, batch_idx) - class TestModel(BoringModel): + def training_epoch_end(self, outputs): + ... 
- def training_step(self, batch, batch_idx, optimizer_idx=None): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} - - def training_epoch_end(self, outputs): - outputs = sum(outputs, []) - torch.stack([x["loss"] for x in outputs]).mean() - - def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int): - if optimizer_idx == 0: - if batch_idx % 2 == 0: - optimizer.zero_grad() - - if optimizer_idx == 1: - if batch_idx % 5 == 0: - optimizer.zero_grad() - - def configure_optimizers(self): - optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) - return [optimizer_1, optimizer_2], [lr_scheduler] - - model = TestModel() - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=20, - limit_val_batches=1, - max_epochs=1, - weights_summary=None, - ) + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, **_): + assert optimizer_closure.__name__ == "train_step_and_backward_closure" + # not passing the closure to the optimizer because step is mocked + # zero_grad is called inside the closure + if isinstance(optimizer, SGD) and batch_idx % 2 == 0: + optimizer_closure() + optimizer.step() + if isinstance(optimizer, Adam) and batch_idx % 4 == 0: + optimizer_closure() + optimizer.step() # not passing the closure here because it's a mock + + def configure_optimizers(self): + optimizer_1 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) + return [optimizer_1, optimizer_2], [lr_scheduler] + + model = TestModel() + + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=8, + limit_val_batches=1, + max_epochs=1, + weights_summary=None, + ) + + with patch.multiple(torch.optim.SGD, zero_grad=DEFAULT, step=DEFAULT) as sgd, \ + patch.multiple(torch.optim.Adam, zero_grad=DEFAULT, step=DEFAULT) as adam: trainer.fit(model) - assert adam_zero_grad.call_count == 4 - assert sgd_zero_grad.call_count == 10 + assert sgd["step"].call_count == 4 + assert adam["step"].call_count == 2 + + assert sgd["zero_grad"].call_count == 4 + assert adam["zero_grad"].call_count == 2 def test_lightning_optimizer_automatic_optimization_lbfgs_zero_grad(tmpdir): @@ -335,20 +281,22 @@ def test_lightning_optimizer_automatic_optimization_lbfgs_zero_grad(tmpdir): Test zero_grad is called the same number of times as LBFGS requires for reevaluation of the loss in automatic_optimization. 
""" + class TestModel(BoringModel): + def configure_optimizers(self): + return torch.optim.LBFGS(self.parameters()) - with patch("torch.optim.LBFGS.zero_grad") as zero_grad: + model = TestModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=1, + max_epochs=1, + weights_summary=None, + ) - model = EvalModelTemplate() - model.configure_optimizers = model.configure_optimizers__lbfgs - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=1, - limit_val_batches=1, - max_epochs=1, - weights_summary=None, - ) + with patch("torch.optim.LBFGS.zero_grad") as zero_grad: trainer.fit(model) - lbfgs = model.optimizers() - max_iter = lbfgs.param_groups[0]["max_iter"] - assert zero_grad.call_count == max_iter + lbfgs = model.optimizers() + max_iter = lbfgs.param_groups[0]["max_iter"] + assert zero_grad.call_count == max_iter diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f14ce984ffb67..a4765c9503b96 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -199,129 +199,32 @@ def test_strict_model_load(monkeypatch, tmpdir, tmpdir_server, url_ckpt): assert not failed, "Model should be loaded due to strict=False." -@pytest.mark.parametrize( - ["schedule", "expected"], - [ - pytest.param({ - 1: 2, - 3: 4 - }, [1, 2, 4]), - pytest.param(3, [3, 3, 3]), - pytest.param(4, [4, 4, 4]), - ], -) -def test_gradient_accumulation_scheduling(tmpdir, schedule, expected): - """ - Test grad accumulation by the freq of optimizer updates - """ - - # test incorrect configs - with pytest.raises(IndexError): - assert Trainer(accumulate_grad_batches={-1: 3, 1: 4, 4: 6}) - with pytest.raises(IndexError): - assert Trainer(accumulate_grad_batches={-2: 3}) - - with pytest.raises(TypeError): - assert Trainer(accumulate_grad_batches={}) - with pytest.raises(TypeError): - assert Trainer(accumulate_grad_batches=[[2, 3], [4, 6]]) - with pytest.raises(TypeError): - assert Trainer(accumulate_grad_batches={1: 2, 3.0: 4}) - with pytest.raises(TypeError): - assert Trainer(accumulate_grad_batches={1: 2.5, 3: 5}) - - model = EvalModelTemplate() - - trainer = Trainer( - accumulate_grad_batches=schedule, - limit_train_batches=0.7, # not to be divisible by accumulate_grad_batches on purpose - limit_val_batches=0.8, - max_epochs=4, - default_root_dir=tmpdir, - ) - - model.old_optimizer_step = model.optimizer_step - - # test optimizer call freq matches scheduler - def _optimizer_step( - epoch, - batch_idx, - optimizer, - optimizer_idx, - second_order_closure=None, - on_tpu=False, - using_native_amp=False, - using_lbfgs=False, - ): - # only test the first 12 batches in epoch - if batch_idx < 12: - if epoch == 0: - # reset counter when starting epoch - if batch_idx == expected[0] - 1: - model.prev_called_batch_idx = expected[0] - 1 - - # use this opportunity to test once - assert trainer.accumulate_grad_batches == expected[0] - - # separate check for last batch with accumulate 1 step - if expected[0] == 1 and (batch_idx + 1) == trainer.num_training_batches: - assert batch_idx == model.prev_called_batch_idx - elif (batch_idx + 1) == trainer.num_training_batches: - # prev_called_batch_idx - schedule + modulus remainder - assert batch_idx == (model.prev_called_batch_idx - expected[0] + (batch_idx + 1) % expected[0]) - else: - assert batch_idx == model.prev_called_batch_idx - model.prev_called_batch_idx += expected[0] - - elif 1 <= epoch <= 2: - # reset counter when starting epoch - if batch_idx == expected[1] - 1: - 
model.prev_called_batch_idx = expected[1] - 1 - - # use this opportunity to test once - assert trainer.accumulate_grad_batches == expected[1] - - if trainer.num_training_batches == batch_idx + 1: - # prev_called_batch_idx - schedule + modulus remainder - assert batch_idx == (model.prev_called_batch_idx - expected[1] + (batch_idx + 1) % expected[1]) - else: - assert batch_idx == model.prev_called_batch_idx - model.prev_called_batch_idx += expected[1] - - else: - if batch_idx == expected[2] - 1: - model.prev_called_batch_idx = expected[2] - 1 - - # use this opportunity to test once - assert trainer.accumulate_grad_batches == expected[2] - - if (batch_idx + 1) == trainer.num_training_batches: - # prev_called_batch_idx - schedule + modulus remainder - assert batch_idx == (model.prev_called_batch_idx - expected[2] + (batch_idx + 1) % expected[2]) - else: - assert batch_idx == model.prev_called_batch_idx - model.prev_called_batch_idx += expected[2] - - model.old_optimizer_step( - epoch, batch_idx, optimizer, optimizer_idx, second_order_closure, on_tpu, using_native_amp, using_lbfgs +@pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3)) +def test_trainer_accumulate_grad_batches_zero_grad(tmpdir, accumulate_grad_batches): + with patch("torch.optim.SGD.zero_grad") as sgd_zero_grad: + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=20, + limit_val_batches=1, + max_epochs=1, + weights_summary=None, + accumulate_grad_batches=accumulate_grad_batches, ) + trainer.fit(model) + + assert sgd_zero_grad.call_count == math.ceil(trainer.limit_train_batches / accumulate_grad_batches) @pytest.mark.parametrize( ["accumulate_grad_batches", "limit_train_batches"], [ - pytest.param({ - 1: 2, - 3: 4 - }, 1.0), - pytest.param({ - 1: 2, - 3: 4 - }, 0.5), # not to be divisible by accumulate_grad_batches on purpose - pytest.param(3, 1.0), - pytest.param(3, 0.8), # not to be divisible by accumulate_grad_batches on purpose - pytest.param(4, 1.0), - pytest.param(4, 0.7), # not to be divisible by accumulate_grad_batches on purpose + ({1: 2, 3: 4}, 1.0), + ({1: 2, 3: 4}, 0.5), # not to be divisible by accumulate_grad_batches on purpose + (3, 1.0), + (3, 0.8), # not to be divisible by accumulate_grad_batches on purpose + (4, 1.0), + (4, 0.7), # not to be divisible by accumulate_grad_batches on purpose ], ) def test_gradient_accumulation_scheduling_last_batch(tmpdir, accumulate_grad_batches, limit_train_batches): @@ -329,20 +232,19 @@ def test_gradient_accumulation_scheduling_last_batch(tmpdir, accumulate_grad_bat class CurrentModel(BoringModel): - def on_batch_start(self, batch, batch_idx, dataloader_idx): + def on_batch_start(self, *_): self.on_train_batch_start_state_dict = self.state_dict() - def on_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + def on_batch_end(self, outputs, batch, batch_idx, *_): self.on_train_batch_start_end_dict = self.state_dict() for key in self.on_train_batch_start_end_dict.keys(): + equal = torch.equal( + self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key] + ) if (batch_idx + 1) == self.trainer.num_training_batches: - assert torch.equal( - self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key] - ) + assert equal else: - assert not torch.equal( - self.on_train_batch_start_state_dict[key], self.on_train_batch_start_end_dict[key] - ) + assert not equal model = CurrentModel() From 22fd39957646ce0367a2665e4a5d3ad1eeb03422 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 
27 Feb 2021 02:07:17 +0900 Subject: [PATCH 20/33] Add a test to check zero_grad call order --- tests/trainer/test_trainer.py | 77 +++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index a4765c9503b96..709f136f74b12 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import inspect import math import os import pickle @@ -26,6 +27,7 @@ import pytest import torch from omegaconf import OmegaConf +from torch.optim import SGD from torch.utils.data import DataLoader import tests.helpers.utils as tutils @@ -1691,3 +1693,78 @@ def training_step(self, batch, batch_idx): model = TestModel() trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, gpus=1) trainer.fit(model, train_data) + + +def test_train_loop_system(tmpdir): + """ + Test the following methods are called in the order. + 1. training_step + 2. zero_grad (skip when accumulate gradients) + 3. backward + """ + called_methods = [] + + trainer_options = dict( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=3, + limit_val_batches=1, + limit_test_batches=1, + progress_bar_refresh_rate=0, + ) + + class TestOptimizer(SGD): + def zero_grad(self, set_to_none=False): + called_methods.append(inspect.currentframe().f_code.co_name) + return super().zero_grad(set_to_none) + + + class TestModel(BoringModel): + def configure_optimizers(self): + return TestOptimizer(self.parameters(), lr=0.1) + + def training_step(self, *args, **kwargs): + called_methods.append(inspect.currentframe().f_code.co_name) + return super().training_step(*args, **kwargs) + + def backward(self, *args, **kwargs): + called_methods.append(inspect.currentframe().f_code.co_name) + return super().backward(*args, **kwargs) + + + model = TestModel() + trainer = Trainer(**trainer_options) + + # No methods are called yet. + assert called_methods == [] + + trainer.fit(model) + assert called_methods == [ + "training_step", + "zero_grad", + "backward", + "training_step", + "zero_grad", + "backward", + "training_step", + "zero_grad", + "backward", + ] + + called_methods.clear() + trainer = Trainer(**trainer_options, accumulate_grad_batches=2) + + # No methods are called yet. 
+ assert called_methods == [] + + trainer.fit(model) + assert called_methods == [ + "training_step", + "zero_grad", + "backward", + "training_step", + "backward", + "training_step", + "zero_grad", + "backward", + ] From 02ec3aa56a8026a6048738aea30ee9146de7dc70 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 27 Feb 2021 02:12:47 +0900 Subject: [PATCH 21/33] flake8 --- tests/trainer/test_trainer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 709f136f74b12..13ac8b6fcf8b0 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1718,7 +1718,6 @@ def zero_grad(self, set_to_none=False): called_methods.append(inspect.currentframe().f_code.co_name) return super().zero_grad(set_to_none) - class TestModel(BoringModel): def configure_optimizers(self): return TestOptimizer(self.parameters(), lr=0.1) @@ -1731,7 +1730,6 @@ def backward(self, *args, **kwargs): called_methods.append(inspect.currentframe().f_code.co_name) return super().backward(*args, **kwargs) - model = TestModel() trainer = Trainer(**trainer_options) From f92f7082bed38d049f0629662928911596bd9313 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 27 Feb 2021 02:15:56 +0900 Subject: [PATCH 22/33] Update test comment --- tests/trainer/test_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 13ac8b6fcf8b0..0450aedb89ff6 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1697,7 +1697,7 @@ def training_step(self, batch, batch_idx): def test_train_loop_system(tmpdir): """ - Test the following methods are called in the order. + Test the following methods are called in the order in automatic optimization. 1. training_step 2. zero_grad (skip when accumulate gradients) 3. 
backward

From 85431ffaa4618dbeeb0fe8fd0216948f19c8f118 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Sat, 27 Feb 2021 02:41:57 +0900
Subject: [PATCH 23/33] Make test compatible with PT1.4

---
 tests/trainer/test_trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 0450aedb89ff6..fabcedee878ba 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1714,9 +1714,9 @@ def test_train_loop_system(tmpdir):
     )

     class TestOptimizer(SGD):
-        def zero_grad(self, set_to_none=False):
+        def zero_grad(self, *args, **kwargs):
             called_methods.append(inspect.currentframe().f_code.co_name)
-            return super().zero_grad(set_to_none)
+            return super().zero_grad(*args, **kwargs)

     class TestModel(BoringModel):
         def configure_optimizers(self):

From 1be883e3aa60e0ee2ffc2bd42678a2cc1db87b32 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Sat, 27 Feb 2021 03:03:22 +0900
Subject: [PATCH 24/33] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index fabcedee878ba..701ede9965447 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1715,7 +1715,7 @@ def test_train_loop_system(tmpdir):

     class TestOptimizer(SGD):
         def zero_grad(self, *args, **kwargs):
-            called_methods.append(inspect.currentframe().f_code.co_name)
+            called_methods.append("zero_grad")
             return super().zero_grad(*args, **kwargs)

     class TestModel(BoringModel):

From 337d85b803603d2417a6b325bb5dd1a5b188430e Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Sat, 27 Feb 2021 03:06:17 +0900
Subject: [PATCH 25/33] Use simple string for logging called methods

---
 tests/trainer/test_trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 701ede9965447..bf1d523d7b030 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1723,11 +1723,11 @@ def configure_optimizers(self):
             return TestOptimizer(self.parameters(), lr=0.1)

         def training_step(self, *args, **kwargs):
-            called_methods.append(inspect.currentframe().f_code.co_name)
+            called_methods.append("training_step")
             return super().training_step(*args, **kwargs)

         def backward(self, *args, **kwargs):
-            called_methods.append(inspect.currentframe().f_code.co_name)
+            called_methods.append("backward")
             return super().backward(*args, **kwargs)

     model = TestModel()

From 3194bb3d8de17993da1df6ccf3cd358a67144be8 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Sat, 27 Feb 2021 16:46:29 +0900
Subject: [PATCH 26/33] Add optimizer.step to test

---
 tests/trainer/test_trainer.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index bf1d523d7b030..a19a7dd9e138d 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1714,6 +1714,10 @@ def test_train_loop_system(tmpdir):
     )

     class TestOptimizer(SGD):
+        def step(self, *args, **kwargs):
+            called_methods.append("step")
+            return super().step(*args, **kwargs)
+
         def zero_grad(self, *args, **kwargs):
             called_methods.append("zero_grad")
             return super().zero_grad(*args, **kwargs)
@@ -1738,12 +1742,15 @@ def backward(self, *args, **kwargs):

     trainer.fit(model)
     assert
called_methods == [ + "step", "training_step", "zero_grad", "backward", + "step", "training_step", "zero_grad", "backward", + "step", "training_step", "zero_grad", "backward", @@ -1757,11 +1764,14 @@ def backward(self, *args, **kwargs): trainer.fit(model) assert called_methods == [ + "step", "training_step", "zero_grad", "backward", + "step", "training_step", "backward", + "step", "training_step", "zero_grad", "backward", From 91e7ae1d062e0b2ea62ab5c30566e318c7c67f12 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 27 Feb 2021 18:01:57 +0900 Subject: [PATCH 27/33] Update the test comment --- tests/trainer/test_trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index a19a7dd9e138d..1ab068f861292 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1698,9 +1698,10 @@ def training_step(self, batch, batch_idx): def test_train_loop_system(tmpdir): """ Test the following methods are called in the order in automatic optimization. - 1. training_step - 2. zero_grad (skip when accumulate gradients) - 3. backward + 1. optimizer.step + 2. model.training_step + 3. optimizer.zero_grad (skip when accumulate gradients) + 4. model.backward """ called_methods = [] From 96a550483b814d668f02416473a9e8e311729cd7 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 27 Feb 2021 22:13:58 +0900 Subject: [PATCH 28/33] Update the test --- tests/trainer/test_trainer.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 1ab068f861292..2ea41e17d750a 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1708,7 +1708,7 @@ def test_train_loop_system(tmpdir): trainer_options = dict( default_root_dir=tmpdir, max_epochs=1, - limit_train_batches=3, + limit_train_batches=5, limit_val_batches=1, limit_test_batches=1, progress_bar_refresh_rate=0, @@ -1747,33 +1747,33 @@ def backward(self, *args, **kwargs): "training_step", "zero_grad", "backward", - "step", - "training_step", - "zero_grad", - "backward", - "step", - "training_step", - "zero_grad", - "backward", - ] + ] * trainer.limit_train_batches called_methods.clear() - trainer = Trainer(**trainer_options, accumulate_grad_batches=2) + trainer = Trainer(**trainer_options, accumulate_grad_batches=3) # No methods are called yet. assert called_methods == [] trainer.fit(model) assert called_methods == [ - "step", + # 0 "training_step", "zero_grad", "backward", - "step", + # 1 "training_step", "backward", + # 2 "step", "training_step", + "backward", + # 3 + "training_step", "zero_grad", "backward", + # 4 + "step", + "training_step", + "backward", ] From 87fdfb80dccf4f9dc692b33c02cfe09498c53523 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 27 Feb 2021 22:16:51 +0900 Subject: [PATCH 29/33] Update the test --- tests/trainer/test_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 2ea41e17d750a..0b8807247a9ca 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1698,9 +1698,9 @@ def training_step(self, batch, batch_idx): def test_train_loop_system(tmpdir): """ Test the following methods are called in the order in automatic optimization. - 1. optimizer.step + 1. optimizer.step (skip when gradient accumulation) 2. model.training_step - 3. 
optimizer.zero_grad (skip when accumulate gradients) + 3. optimizer.zero_grad (run when the first batch of gradient accumulation) 4. model.backward """ called_methods = [] From 9d11f34673b4b7c6216579b46fb202e6055281ea Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sat, 27 Feb 2021 22:21:07 +0900 Subject: [PATCH 30/33] Remove unused import --- tests/trainer/test_trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 0b8807247a9ca..54721c50cf2ed 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import inspect import math import os import pickle From 93d3f57553e660f3ce7640c2f8de671230e034bf Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 28 Feb 2021 00:59:47 +0900 Subject: [PATCH 31/33] Update test comment --- tests/trainer/test_trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 54721c50cf2ed..47900fd33e783 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1701,6 +1701,10 @@ def test_train_loop_system(tmpdir): 2. model.training_step 3. optimizer.zero_grad (run when the first batch of gradient accumulation) 4. model.backward + + Note that the order is NOT `training_step`->`zero_grad`->`backward`->`step`. + This is because `optimizer.step(closure)` calls `closure()` which then calls + the three remaining methods `training_step`, `zero_grad` and `backward` inside. """ called_methods = [] From 05c28d52ef83d08cce2ff5c0e8590eab2b6c3308 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 28 Feb 2021 13:46:41 +0900 Subject: [PATCH 32/33] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad694f86a01b7..d96e248d1f225 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- Changed the order of `zero_grad` and `backward` calls in automatic optimization. Lightning now calls `zero_grad` before `backward` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147)) +- Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147)) ### Deprecated From 9836bb7e2907a3cb8506d4841b191a1eb1492eec Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Sun, 28 Feb 2021 13:53:29 +0900 Subject: [PATCH 33/33] Update docs --- docs/source/common/optimizers.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 3ad01856eea28..22898e4f1a1b2 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -139,8 +139,8 @@ With Lightning most users don't have to think about when to call ``.zero_grad()` since Lightning automates that for you. .. warning:: - Before 1.2.1, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. - From 1.2.1, Lightning calls ``.zero_grad()`` before ``.backward()``. + Before 1.2.2, ``.zero_grad()`` was called after ``.backward()`` and ``.step()`` internally. + From 1.2.2, Lightning calls ``.zero_grad()`` before ``.backward()``. 
Under the hood Lightning does the following: @@ -367,8 +367,8 @@ Using the closure functions for optimization When using optimization schemes such as LBFGS, the `second_order_closure` needs to be enabled. By default, this function is defined by wrapping the `training_step` and the backward steps as follows .. warning:: - Before 1.2.1, ``.zero_grad()`` was called outside the closure internally. - From 1.2.1, the closure calls ``.zero_grad()`` inside, so there is no need to define your own closure + Before 1.2.2, ``.zero_grad()`` was called outside the closure internally. + From 1.2.2, the closure calls ``.zero_grad()`` inside, so there is no need to define your own closure when using similar optimizers to :class:`torch.optim.LBFGS` which requires reevaluation of the loss with the closure in ``optimizer.step()``. .. testcode::
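
    # A minimal sketch, with assumed names (``LitLBFGSModel``), of relying on the
    # built-in closure under automatic optimization: Lightning wraps ``training_step``,
    # ``zero_grad`` and ``backward`` into the closure passed to ``optimizer.step``,
    # so an LBFGS-style optimizer can simply be returned from ``configure_optimizers``.
    import torch
    import pytorch_lightning as pl


    class LitLBFGSModel(pl.LightningModule):
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def training_step(self, batch, batch_idx):
            x, y = batch
            return torch.nn.functional.mse_loss(self.layer(x), y)

        def configure_optimizers(self):
            # LBFGS re-evaluates the loss by calling the closure inside ``optimizer.step``
            return torch.optim.LBFGS(self.parameters(), lr=0.1)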