From 833c850a421e7749765348f06e12f3b658541ceb Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 2 Mar 2021 10:34:51 +0000 Subject: [PATCH 01/12] give a more complete GAN example --- docs/source/common/optimizers.rst | 100 ++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 18 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 22898e4f1a1b2..8a2e749bb8a91 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -67,27 +67,91 @@ Here is the same example as above using a ``closure``. opt.zero_grad() -.. code-block:: python - - # Scenario for a GAN. - def training_step(...): - opt_gen, opt_dis = self.optimizers() +.. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. - # compute generator loss - loss_gen = self.compute_generator_loss(...) - - # zero_grad needs to be called before backward - opt_gen.zero_grad() - self.manual_backward(loss_gen) - opt_gen.step() - # compute discriminator loss - loss_dis = self.compute_discriminator_loss(...) +.. code-block:: python - # zero_grad needs to be called before backward - opt_dis.zero_grad() - self.manual_backward(loss_dis) - opt_dis.step() + import torch + from pytorch_lightning import LightningModule + from torch.utils.data import Dataset + + class SimpleGAN(LightningModule): + + def __init__(self): + super().__init__() + latent_dim = 64 + self._Z = MultivariateNormal(tr.zeros(latent_dim, device=self.device), + tr.eye(latent_dim, device=self.device)) + self.G = MnistDenseGenerator(latent_dim) + self.D = MnistDenseDiscriminator() + self.num_workers = 0 + + @property + def automatic_optimization(self): + # Important: This property activate ``manual optimization`` for this model + return False + + def train_dataloader(self) -> DataLoader: + return tr.utils.data.DataLoader(data, batch_size=64, shuffle=True, + num_workers=self.num_workers) + + def forward(self, x): + return self.G(x) + + def generator_loss(self, d_z: Tensor) -> Tensor: + # the closer ``d_z`` is from 1, + # the better the generator is able to fool the discriminator + return -1 * tr.log(d_z).mean() + + def discriminator_loss(self, d_x: Tensor, d_z: Tensor) -> Tensor: + # the closer is ``d_x`` from 1 and ``dz`` from 0, + # the better the discriminator is able to distinguish + # true data from generated ones + return -1 * (tr.log(d_x).mean() + tr.log(1 - d_z).mean()) + + def sample_z(self, n) -> Tensor: + sample = self._Z.sample((n,)) + return sample + + def sample_G(self, n) -> Tensor: + z = self.sample_z(n) + return self.G(z) + + def training_step(self, batch, batch_idx, optimizer_idx, *args): + # Get optimizers + g_opt, d_opt = self.optimizers() + + # Train generator + X, _ = batch + batch_size = X.shape[0] + g_X = self.sample_G(batch_size) + d_z = self.D(g_X) + g_loss = self.generator_loss(d_z) + + # zero_grad should be called before manual_backward + g_opt.zero_grad() + self.manual_backward(g_loss) + + g_opt.step() + + # Train discriminator + d_x = self.D(X) + d_z = self.D(g_X.detach()) + + d_loss = self.discriminator_loss(d_x, d_z) + + # zero_grad should be called before manual_backward + d_opt.zero_grad() + self.manual_backward(d_loss) + d_opt.step() + + self.log_dict({'g_loss': g_loss, 'd_loss': d_loss}, prog_bar=True, logger=True) + + def configure_optimizers(self): + g_opt = torch.optim.RMSprop(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.RMSprop(self.D.parameters(), lr=1e-5) + return g_opt, d_opt .. note:: ``LightningOptimizer`` provides a ``toggle_model`` function as a ``@context_manager`` for advanced users. It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. From c495318978e0c5c1b123c3bf9fa61afd061dd1ce Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 2 Mar 2021 18:06:00 +0000 Subject: [PATCH 02/12] i --- docs/source/common/optimizers.rst | 144 ++++++++++++++++++------------ 1 file changed, 88 insertions(+), 56 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 8a2e749bb8a91..6451626e3b2a2 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -80,79 +80,67 @@ Here is the same example as above using a ``closure``. def __init__(self): super().__init__() - latent_dim = 64 - self._Z = MultivariateNormal(tr.zeros(latent_dim, device=self.device), - tr.eye(latent_dim, device=self.device)) - self.G = MnistDenseGenerator(latent_dim) - self.D = MnistDenseDiscriminator() - self.num_workers = 0 + self.G = Generator(...) + self.D = Discriminator(...) @property def automatic_optimization(self): # Important: This property activate ``manual optimization`` for this model return False - def train_dataloader(self) -> DataLoader: - return tr.utils.data.DataLoader(data, batch_size=64, shuffle=True, - num_workers=self.num_workers) + def generator_loss(self, d_z: Tensor) -> Tensor: + # the closer ``d_z`` is from 1, + # the better the generator is able to fool the discriminator + return -1 * tr.log(d_z).mean() - def forward(self, x): - return self.G(x) + def discriminator_loss(self, d_x: Tensor, d_z: Tensor) -> Tensor: + # the closer is ``d_x`` from 1 and ``dz`` from 0, + # the better the discriminator is able to distinguish + # true data from generated ones + return -1 * (tr.log(d_x).mean() + tr.log(1 - d_z).mean()) - def generator_loss(self, d_z: Tensor) -> Tensor: - # the closer ``d_z`` is from 1, - # the better the generator is able to fool the discriminator - return -1 * tr.log(d_z).mean() + def sample_z(self, n) -> Tensor: + sample = self._Z.sample((n,)) + return sample - def discriminator_loss(self, d_x: Tensor, d_z: Tensor) -> Tensor: - # the closer is ``d_x`` from 1 and ``dz`` from 0, - # the better the discriminator is able to distinguish - # true data from generated ones - return -1 * (tr.log(d_x).mean() + tr.log(1 - d_z).mean()) + def sample_G(self, n) -> Tensor: + z = self.sample_z(n) + return self.G(z) - def sample_z(self, n) -> Tensor: - sample = self._Z.sample((n,)) - return sample + def training_step(self, batch, batch_idx, optimizer_idx, *args): + # Get optimizers + g_opt, d_opt = self.optimizers() - def sample_G(self, n) -> Tensor: - z = self.sample_z(n) - return self.G(z) + # Train generator + X, _ = batch + batch_size = X.shape[0] + g_X = self.sample_G(batch_size) + d_z = self.D(g_X) + g_loss = self.generator_loss(d_z) - def training_step(self, batch, batch_idx, optimizer_idx, *args): - # Get optimizers - g_opt, d_opt = self.optimizers() + # zero_grad should be called before manual_backward + g_opt.zero_grad() + self.manual_backward(g_loss) - # Train generator - X, _ = batch - batch_size = X.shape[0] - g_X = self.sample_G(batch_size) - d_z = self.D(g_X) - g_loss = self.generator_loss(d_z) + g_opt.step() - # zero_grad should be called before manual_backward - g_opt.zero_grad() - self.manual_backward(g_loss) + # Train discriminator + d_x = self.D(X) + d_z = self.D(g_X.detach()) - g_opt.step() + d_loss = self.discriminator_loss(d_x, d_z) - # Train discriminator - d_x = self.D(X) - d_z = self.D(g_X.detach()) + # zero_grad should be called before manual_backward + d_opt.zero_grad() + self.manual_backward(d_loss) + d_opt.step() - d_loss = self.discriminator_loss(d_x, d_z) - - # zero_grad should be called before manual_backward - d_opt.zero_grad() - self.manual_backward(d_loss) - d_opt.step() - - self.log_dict({'g_loss': g_loss, 'd_loss': d_loss}, prog_bar=True, logger=True) - - def configure_optimizers(self): - g_opt = torch.optim.RMSprop(self.G.parameters(), lr=1e-5) - d_opt = torch.optim.RMSprop(self.D.parameters(), lr=1e-5) - return g_opt, d_opt + self.log_dict({'g_loss': g_loss, 'd_loss': d_loss}, prog_bar=True) + def configure_optimizers(self): + g_opt = torch.optim.RMSprop(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.RMSprop(self.D.parameters(), lr=1e-5) + return g_opt, d_opt .. note:: ``LightningOptimizer`` provides a ``toggle_model`` function as a ``@context_manager`` for advanced users. It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. @@ -164,7 +152,51 @@ Toggling means that all parameters from B exclusive to A will have their ``requi When performing gradient accumulation, there is no need to perform grad synchronization during the accumulation phase. Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. -Here is an example on how to use it: + +Here is the same example as before with this helper. + +.. code-block:: python + + import torch + from pytorch_lightning import LightningModule + from torch.utils.data import Dataset + + class SimpleGAN(LightningModule): + + ... + + def training_step(self, batch, batch_idx, optimizer_idx, *args): + g_opt, d_opt = self.optimizers() + X, _ = batch + batch_size = X.shape[0] + + def gen_closure(): + g_X = self.sample_G(batch_size) + d_x = self.D(X) + d_z = self.D(g_X) + g_loss = self.generator_loss(d_z) + g_opt.zero_grad() + self.manual_backward(g_loss) + self.log('g_loss', g_loss, prog_bar=True) + + with g_opt.toggle_model(): + g_opt.step(closure=gen_closure) + + def dis_closure(): + g_X = self.sample_G(batch_size) + d_x = self.D(X) + d_z = self.D(g_X) + d_loss = self.discriminator_loss(d_x, d_z) + d_opt.zero_grad() + self.manual_backward(d_loss) + self.log('d_loss', d_loss, prog_bar=True) + + with d_opt.toggle_model(): + d_opt.step(closure=dis_closure) + + +Here is an example for advanced use-case. + .. code-block:: python From 5b3aa0e0e94dc2c806975def58e70e58f17c6eb1 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 2 Mar 2021 20:37:50 +0000 Subject: [PATCH 03/12] update example --- docs/source/common/optimizers.rst | 125 ++++++++++++++---------------- 1 file changed, 60 insertions(+), 65 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 6451626e3b2a2..9842ff8372158 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -108,38 +108,49 @@ Here is the same example as above using a ``closure``. return self.G(z) def training_step(self, batch, batch_idx, optimizer_idx, *args): - # Get optimizers + # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html g_opt, d_opt = self.optimizers() - # Train generator X, _ = batch batch_size = X.shape[0] - g_X = self.sample_G(batch_size) - d_z = self.D(g_X) - g_loss = self.generator_loss(d_z) - # zero_grad should be called before manual_backward - g_opt.zero_grad() - self.manual_backward(g_loss) + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - g_opt.step() + g_X = self.sample_G(batch_size) + + ########################### + # Optimize Discriminator # + ########################### + d_opt.zero_grad() - # Train discriminator d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - d_loss = self.discriminator_loss(d_x, d_z) + errD = (errD_real + errD_fake) - # zero_grad should be called before manual_backward - d_opt.zero_grad() - self.manual_backward(d_loss) + self.manual_backward(errD) d_opt.step() - self.log_dict({'g_loss': g_loss, 'd_loss': d_loss}, prog_bar=True) + ####################### + # Optimize Generator # + ####################### + g_opt.zero_grad() + + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) + + self.manual_backward(errG) + g_opt.step() + + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) def configure_optimizers(self): - g_opt = torch.optim.RMSprop(self.G.parameters(), lr=1e-5) - d_opt = torch.optim.RMSprop(self.D.parameters(), lr=1e-5) + g_opt = torch.optim.Adam(self.G.parameters(), lr=1e-5) + d_opt = torch.optim.Adam(self.D.parameters(), lr=1e-5) return g_opt, d_opt .. note:: ``LightningOptimizer`` provides a ``toggle_model`` function as a ``@context_manager`` for advanced users. It can be useful when performing gradient accumulation with several optimizers or training in a distributed setting. @@ -153,79 +164,63 @@ When performing gradient accumulation, there is no need to perform grad synchron Setting ``sync_grad`` to ``False`` will block this synchronization and improve your training speed. -Here is the same example as before with this helper. +Here is an example for advanced use-case. + .. code-block:: python - import torch - from pytorch_lightning import LightningModule - from torch.utils.data import Dataset + + # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. class SimpleGAN(LightningModule): ... def training_step(self, batch, batch_idx, optimizer_idx, *args): + # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html g_opt, d_opt = self.optimizers() + X, _ = batch + X.requires_grad = True batch_size = X.shape[0] - def gen_closure(): - g_X = self.sample_G(batch_size) - d_x = self.D(X) - d_z = self.D(g_X) - g_loss = self.generator_loss(d_z) - g_opt.zero_grad() - self.manual_backward(g_loss) - self.log('g_loss', g_loss, prog_bar=True) + real_label = torch.ones((batch_size, 1), device=self.device) + fake_label = torch.zeros((batch_size, 1), device=self.device) - with g_opt.toggle_model(): - g_opt.step(closure=gen_closure) + accumulated_grad_batches = batch_idx % 2 == 0 - def dis_closure(): g_X = self.sample_G(batch_size) - d_x = self.D(X) - d_z = self.D(g_X) - d_loss = self.discriminator_loss(d_x, d_z) - d_opt.zero_grad() - self.manual_backward(d_loss) - self.log('d_loss', d_loss, prog_bar=True) - - with d_opt.toggle_model(): - d_opt.step(closure=dis_closure) - -Here is an example for advanced use-case. - - -.. code-block:: python - - - # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. + ########################### + # Optimize Discriminator # + ########################### + with d_opt.toggle_model(sync_grad=accumulated_grad_batches): + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - def training_step(self, batch, batch_idx, ...): - opt_gen, opt_dis = self.optimizers() + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - accumulated_grad_batches = batch_idx % 2 == 0 + errD = (errD_real + errD_fake) - # compute generator loss - def closure_gen(): - loss_gen = self.compute_generator_loss(...) - self.manual_backward(loss_gen) + self.manual_backward(errD) if accumulated_grad_batches: - opt_gen.zero_grad() + d_opt.step() + d_opt.zero_grad() - with opt_gen.toggle_model(sync_grad=accumulated_grad_batches): - opt_gen.step(closure=closure_gen) + ####################### + # Optimize Generator # + ####################### + with g_opt.toggle_model(sync_grad=accumulated_grad_batches): + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - def closure_dis(): - loss_dis = self.compute_discriminator_loss(...) - self.manual_backward(loss_dis) + self.manual_backward(errG) if accumulated_grad_batches: - opt_dis.zero_grad() + g_opt.step() + g_opt.zero_grad() - with opt_dis.toggle_model(sync_grad=accumulated_grad_batches): - opt_dis.step(closure=closure_dis) + self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) ------ From 4baa3713f0705add5e5498f697aac04d156b6200 Mon Sep 17 00:00:00 2001 From: tchaton Date: Tue, 2 Mar 2021 20:38:23 +0000 Subject: [PATCH 04/12] update --- docs/source/common/optimizers.rst | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 9842ff8372158..d16ffb9a8206a 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -195,30 +195,30 @@ Here is an example for advanced use-case. # Optimize Discriminator # ########################### with d_opt.toggle_model(sync_grad=accumulated_grad_batches): - d_x = self.D(X) - errD_real = self.criterion(d_x, real_label) + d_x = self.D(X) + errD_real = self.criterion(d_x, real_label) - d_z = self.D(g_X.detach()) - errD_fake = self.criterion(d_z, fake_label) + d_z = self.D(g_X.detach()) + errD_fake = self.criterion(d_z, fake_label) - errD = (errD_real + errD_fake) + errD = (errD_real + errD_fake) - self.manual_backward(errD) - if accumulated_grad_batches: - d_opt.step() - d_opt.zero_grad() + self.manual_backward(errD) + if accumulated_grad_batches: + d_opt.step() + d_opt.zero_grad() ####################### # Optimize Generator # ####################### with g_opt.toggle_model(sync_grad=accumulated_grad_batches): - d_z = self.D(g_X) - errG = self.criterion(d_z, real_label) + d_z = self.D(g_X) + errG = self.criterion(d_z, real_label) - self.manual_backward(errG) - if accumulated_grad_batches: - g_opt.step() - g_opt.zero_grad() + self.manual_backward(errG) + if accumulated_grad_batches: + g_opt.step() + g_opt.zero_grad() self.log_dict({'g_loss': errG, 'd_loss': errD}, prog_bar=True) From 0186d0958f8946b3a5b8bbd68d509020ba696297 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 3 Mar 2021 13:17:32 +0000 Subject: [PATCH 05/12] Update docs/source/common/optimizers.rst Co-authored-by: Akihiro Nitta --- docs/source/common/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index d16ffb9a8206a..43a96965a10de 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -94,7 +94,7 @@ Here is the same example as above using a ``closure``. return -1 * tr.log(d_z).mean() def discriminator_loss(self, d_x: Tensor, d_z: Tensor) -> Tensor: - # the closer is ``d_x`` from 1 and ``dz`` from 0, + # the closer is ``d_x`` from 1 and ``d_z`` from 0, # the better the discriminator is able to distinguish # true data from generated ones return -1 * (tr.log(d_x).mean() + tr.log(1 - d_z).mean()) From d26ee8ba10390e5ebef4eac6ef7110710e6a9405 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 13:23:17 +0000 Subject: [PATCH 06/12] update --- docs/source/common/optimizers.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index d16ffb9a8206a..ada2708f20a75 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -51,7 +51,7 @@ to manually manage the optimization process. To do so, do the following: Here is the same example as above using a ``closure``. -.. code-block:: python +.. testcode:: python def training_step(batch, batch_idx, optimizer_idx): opt = self.optimizers() @@ -70,7 +70,7 @@ Here is the same example as above using a ``closure``. .. tip:: Be careful where you call ``zero_grad`` or your model won't converge. It is good pratice to call ``zero_grad`` before ``manual_backward``. -.. code-block:: python +.. testcode:: python import torch from pytorch_lightning import LightningModule @@ -80,8 +80,8 @@ Here is the same example as above using a ``closure``. def __init__(self): super().__init__() - self.G = Generator(...) - self.D = Discriminator(...) + self.G = Generator() + self.D = Discriminator() @property def automatic_optimization(self): @@ -91,13 +91,13 @@ Here is the same example as above using a ``closure``. def generator_loss(self, d_z: Tensor) -> Tensor: # the closer ``d_z`` is from 1, # the better the generator is able to fool the discriminator - return -1 * tr.log(d_z).mean() + return -1 * torch.log(d_z).mean() def discriminator_loss(self, d_x: Tensor, d_z: Tensor) -> Tensor: # the closer is ``d_x`` from 1 and ``dz`` from 0, # the better the discriminator is able to distinguish # true data from generated ones - return -1 * (tr.log(d_x).mean() + tr.log(1 - d_z).mean()) + return -1 * (torch.log(d_x).mean() + torch.log(1 - d_z).mean()) def sample_z(self, n) -> Tensor: sample = self._Z.sample((n,)) @@ -167,7 +167,7 @@ Setting ``sync_grad`` to ``False`` will block this synchronization and improve y Here is an example for advanced use-case. -.. code-block:: python +.. testcode:: python # Scenario for a GAN with gradient accumulation every 2 batches and optimized for multiple gpus. From 20e93ac4015d1321d63d07ee2ee7440fc2f7f1ac Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 13:37:30 +0000 Subject: [PATCH 07/12] update --- docs/source/common/optimizers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index af62bf71781f9..952b774cf3e81 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -73,8 +73,8 @@ Here is the same example as above using a ``closure``. .. testcode:: python import torch + from torch import Tensor from pytorch_lightning import LightningModule - from torch.utils.data import Dataset class SimpleGAN(LightningModule): From dcaa4bc82ec00e1b503a1adcd7d517af04615612 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 20:35:45 +0000 Subject: [PATCH 08/12] update doc --- docs/source/common/optimizers.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 952b774cf3e81..1142ddd93ad3a 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -83,10 +83,8 @@ Here is the same example as above using a ``closure``. self.G = Generator() self.D = Discriminator() - @property - def automatic_optimization(self): # Important: This property activate ``manual optimization`` for this model - return False + self.automatic_optimization = False def generator_loss(self, d_z: Tensor) -> Tensor: # the closer ``d_z`` is from 1, From f2c1b24fd5a64d0dfc63c33055bbcfce50d5d561 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 20:38:10 +0000 Subject: [PATCH 09/12] update --- docs/source/common/optimizers.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 1142ddd93ad3a..13bfb8978e156 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -21,10 +21,21 @@ Manual optimization For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to manually manage the optimization process. To do so, do the following: -* Override your LightningModule ``automatic_optimization`` property to return ``False`` +* Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function * Drop or ignore the optimizer_idx argument * Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. +.. testcode:: python + + from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + + def __init__(self): + super().__init__() + # Important: This property activate ``manual optimization`` for your model + self.automatic_optimization = False + .. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. .. warning:: Before 1.2, ``optimzer.step`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertize. From df4b06eddd1c9ec3c83147d05768f3e5ee7aef83 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 22:33:14 +0000 Subject: [PATCH 10/12] update do --- docs/source/common/optimizers.rst | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 13bfb8978e156..22262d9b00d2b 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -22,7 +22,6 @@ For advanced research topics like reinforcement learning, sparse coding, or GAN to manually manage the optimization process. To do so, do the following: * Set the ``automatic_optimization`` property to ``False`` in your ``LightningModule`` ``__init__`` function -* Drop or ignore the optimizer_idx argument * Use ``self.manual_backward(loss)`` instead of ``loss.backward()``. .. testcode:: python @@ -36,6 +35,18 @@ to manually manage the optimization process. To do so, do the following: # Important: This property activate ``manual optimization`` for your model self.automatic_optimization = False +.. testcode:: python + + from pytorch_lightning import LightningModule + + class MyModel(LightningModule): + + def training_step(batch, batch_idx): + opt = self.optimizers() + loss = self.compute_loss(batch) + self.manual_backward(loss) + + .. note:: This is only recommended for experts who need ultimate flexibility. Lightning will handle only precision and accelerators logic. The users are left with ``optimizer.zero_grad()``, gradient accumulation, model toggling, etc.. .. warning:: Before 1.2, ``optimzer.step`` was calling ``optimizer.zero_grad()`` internally. From 1.2, it is left to the users expertize. @@ -46,7 +57,7 @@ to manually manage the optimization process. To do so, do the following: .. code-block:: python - def training_step(batch, batch_idx, optimizer_idx): + def training_step(batch, batch_idx): opt = self.optimizers() loss = self.compute_loss(batch) @@ -64,7 +75,7 @@ Here is the same example as above using a ``closure``. .. testcode:: python - def training_step(batch, batch_idx, optimizer_idx): + def training_step(batch, batch_idx): opt = self.optimizers() def forward_and_backward(): @@ -116,7 +127,7 @@ Here is the same example as above using a ``closure``. z = self.sample_z(n) return self.G(z) - def training_step(self, batch, batch_idx, optimizer_idx, *args): + def training_step(self, batch, batch_idx): # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html g_opt, d_opt = self.optimizers() @@ -185,7 +196,7 @@ Here is an example for advanced use-case. ... - def training_step(self, batch, batch_idx, optimizer_idx, *args): + def training_step(self, batch, batch_idx): # Implementation follows https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html g_opt, d_opt = self.optimizers() From 65eb3dce4e4d3e4427ae9fde4a308fea9aa3d1c0 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 5 Mar 2021 12:44:04 +0000 Subject: [PATCH 11/12] Update docs/source/common/optimizers.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- docs/source/common/optimizers.rst | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 22262d9b00d2b..563c89ab365f7 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -108,17 +108,6 @@ Here is the same example as above using a ``closure``. # Important: This property activate ``manual optimization`` for this model self.automatic_optimization = False - def generator_loss(self, d_z: Tensor) -> Tensor: - # the closer ``d_z`` is from 1, - # the better the generator is able to fool the discriminator - return -1 * torch.log(d_z).mean() - - def discriminator_loss(self, d_x: Tensor, d_z: Tensor) -> Tensor: - # the closer is ``d_x`` from 1 and ``d_z`` from 0, - # the better the discriminator is able to distinguish - # true data from generated ones - return -1 * (torch.log(d_x).mean() + torch.log(1 - d_z).mean()) - def sample_z(self, n) -> Tensor: sample = self._Z.sample((n,)) return sample From 988ab0395b350b3e81f17985de40437c90816168 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 5 Mar 2021 12:44:15 +0000 Subject: [PATCH 12/12] Update docs/source/common/optimizers.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- docs/source/common/optimizers.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/source/common/optimizers.rst b/docs/source/common/optimizers.rst index 563c89ab365f7..3b29fd4c08f13 100644 --- a/docs/source/common/optimizers.rst +++ b/docs/source/common/optimizers.rst @@ -35,12 +35,6 @@ to manually manage the optimization process. To do so, do the following: # Important: This property activate ``manual optimization`` for your model self.automatic_optimization = False -.. testcode:: python - - from pytorch_lightning import LightningModule - - class MyModel(LightningModule): - def training_step(batch, batch_idx): opt = self.optimizers() loss = self.compute_loss(batch)