diff --git a/docs/source/child_modules.rst b/docs/source/child_modules.rst index afa1e0afbcd7c7..c50075e9d4b50a 100644 --- a/docs/source/child_modules.rst +++ b/docs/source/child_modules.rst @@ -66,7 +66,9 @@ that change in the `Autoencoder` model are the init, forward, training, validati x_hat = self(representation) loss = F.nll_loss(logits, y) - return {f'{prefix}_loss': loss} + result = pl.EvalResult() + result.log(f'{prefix}_loss', loss) + return result and we can train this using the same trainer diff --git a/docs/source/hyperparameters.rst b/docs/source/hyperparameters.rst index 26d38679fd847d..4af29f35859625 100644 --- a/docs/source/hyperparameters.rst +++ b/docs/source/hyperparameters.rst @@ -42,6 +42,8 @@ It is best practice to layer your arguments in three sections. 2. Model specific arguments (layer_dim, num_layers, learning_rate, etc...) 3. Program arguments (data_path, cluster_email, etc...) +| + We can do this as follows. First, in your LightningModule, define the arguments specific to that module. Remember that data splits or data paths may also be specific to a module (ie: if your project has a model that trains on Imagenet and another on CIFAR-10). diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index abca6333cba119..440fe2536f2d88 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -320,6 +320,8 @@ When your models need to know about the data, it's best to process the data befo 1. use `prepare_data` to download and process the dataset. 2. use `setup` to do splits, and build your model internals +| + .. testcode:: class LitMNIST(LightningModule): @@ -391,11 +393,11 @@ In the case of MNIST we do the following for epoch in epochs: for batch in data: - # TRAINING STEP START + # ------ TRAINING STEP START ------ x, y = batch logits = model(x) loss = F.nll_loss(logits, y) - # TRAINING STEP END + # ------ TRAINING STEP END ------ loss.backward() optimizer.step() @@ -419,12 +421,13 @@ This code is not restricted which means it can be as complicated as a full seq-2 TrainResult ^^^^^^^^^^^ -Whenever you'd like more control over the outputs of the `training_step` use a `TrainResult` object which can: +Whenever you'd like to log, or sync values across GPUs use `TrainResult`. - log to Tensorboard or the other logger of your choice. - log to the progress-bar. - log on every step. - log aggregate epoch metrics. +- average values across GPUs/TPU cores .. code-block:: python @@ -441,6 +444,13 @@ Whenever you'd like more control over the outputs of the `training_step` use a ` # equivalent result.log('train_loss', loss, on_step=True, on_epoch=False, prog_bar=False, logger=True, reduce_fx=torch.mean) +When training across accelerators (GPUs/TPUs) you can sync a metric if needed. + +.. code-block:: python + + # sync across GPUs / TPUs, etc... + result.log('train_loss', loss, sync_dist=True) + If you are only using a training_loop (`training_step`) without a validation or test loop (`validation_step`, `test_step`), you can still use EarlyStopping or automatic checkpointing @@ -460,6 +470,8 @@ So far we defined 4 key ingredients in pure PyTorch but organized the code with 3. Optimizer. 4. What happens in the training loop. +| + For clarity, we'll recall that the full LightningModule now looks like this. .. code-block:: python @@ -533,6 +545,9 @@ Which will generate automatic tensorboard logs. .. 
figure:: /_images/mnist_imgs/mnist_tb.png :alt: mnist CPU bar + :width: 500 + +| But you can also use any of the `number of other loggers `_ we support. @@ -585,13 +600,20 @@ First, change the runtime to TPU (and reinstall lightning). .. figure:: /_images/mnist_imgs/runtime_tpu.png :alt: mnist GPU bar + :width: 400 .. figure:: /_images/mnist_imgs/restart_runtime.png :alt: mnist GPU bar + :width: 400 + +| Next, install the required xla library (adds support for PyTorch on TPUs) +.. code-block:: shell + !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py + !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev In distributed training (multiple GPUs and multiple TPU cores) each GPU or TPU core will run a copy @@ -607,6 +629,10 @@ In this method we do all the preparation we need to do once (instead of on every .. code-block:: python class MNISTDataModule(LightningDataModule): + def __init__(self, batch_size=64): + super().__init__() + self.batch_size = batch_size + def prepare_data(self): # download only MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor()) @@ -614,7 +640,7 @@ In this method we do all the preparation we need to do once (instead of on every def setup(self, stage): # transform - transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + transform=transforms.Compose([transforms.ToTensor()]) MNIST(os.getcwd(), train=True, download=False, transform=transform) MNIST(os.getcwd(), train=False, download=False, transform=transform) @@ -627,13 +653,13 @@ In this method we do all the preparation we need to do once (instead of on every self.test_dataset = mnist_test def train_dataloader(self): - return DataLoader(self.train_dataset, batch_size=64) + return DataLoader(self.train_dataset, batch_size=self.batch_size) def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=64) + return DataLoader(self.val_dataset, batch_size=self.batch_size) def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=64) + return DataLoader(self.test_dataset, batch_size=self.batch_size) The `prepare_data` method is also a good place to do any data processing that needs to be done only once (ie: download or tokenize, etc...). @@ -653,11 +679,13 @@ You'll now see the TPU cores booting up. .. figure:: /_images/mnist_imgs/tpu_start.png :alt: TPU start + :width: 400 Notice the epoch is MUCH faster! .. figure:: /_images/mnist_imgs/tpu_fast.png :alt: TPU speed + :width: 600 ---------------- @@ -737,12 +765,13 @@ If you still need even more fine-grain control, define the other optional method .. code-block:: python def validation_step(self, batch, batch_idx): - val_step_output = {'step_output': x} - return val_step_output + result = pl.EvalResult() + result.prediction = some_prediction + return result def validation_epoch_end(self, val_step_outputs): - for val_step_output in val_step_outputs: - # each object here is what you passed back at each validation_step + # do something with all the predictions from each validation_step + all_predictions = val_step_outputs.prediction ---------------- diff --git a/docs/source/lightning-module.rst b/docs/source/lightning-module.rst index 3e329bec3a4e82..bf202129a65c7c 100644 --- a/docs/source/lightning-module.rst +++ b/docs/source/lightning-module.rst @@ -3,10 +3,1233 @@ LightningModule =============== +A :class:`~LightningModule` organizes your PyTorch code into 5 sections -.. 
automodule:: pytorch_lightning.core - :noindex: - :exclude-members: - _abc_impl, - summarize, +- Computations (init). +- Train loop (training_step) +- Validation loop (validation_step) +- Test loop (test_step) +- Optimizers (configure_optimizers) +| + +.. figure:: https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_mod_small.gif + :alt: Convert from PyTorch to Lightning + +| + +Notice a few things. + +1. It's the SAME code. +2. The PyTorch code IS NOT abstracted - just organized. +3. All the other code that's not in the :class:`~LightningModule` + has been automated for you by the trainer. + +| + + .. code-block:: python + + net = Net() + trainer = Trainer() + trainer.fit(net) + +4. There are no .cuda() or .to() calls... Lightning does these for you. + +| + + .. code-block:: python + + # don't do in lightning + x = torch.Tensor(2, 3) + x = x.cuda() + x = x.to(device) + + # do this instead + x = x # leave it alone! + + # or to init a new tensor + new_x = torch.Tensor(2, 3) + new_x = new_x.type_as(x.type()) + +5. There are no samplers for distributed, Lightning also does this for you. + +| + + .. code-block:: python + + # Don't do in Lightning... + data = MNIST(...) + sampler = DistributedSampler(data) + DataLoader(data, sampler=sampler) + + # do this instead + data = MNIST(...) + DataLoader(data) + +6. A :class:`~LightningModule` is a :class:`torch.nn.Module` but with added functionality. Use it as such! + +| + + .. code-block:: python + + net = Net.load_from_checkpoint(PATH) + net.freeze() + out = net(x) + +Thus, to use Lightning, you just need to organize your code which takes about 30 minutes, +(and let's be real, you probably should do anyhow). + +------------ + +Minimal Example +--------------- + +Here are the only required methods. + +.. code-block:: python + + >>> import pytorch_lightning as pl + >>> class LitModel(pl.LightningModule): + ... + ... def __init__(self): + ... super().__init__() + ... self.l1 = torch.nn.Linear(28 * 28, 10) + ... + ... def forward(self, x): + ... return torch.relu(self.l1(x.view(x.size(0), -1))) + ... + ... def training_step(self, batch, batch_idx): + ... x, y = batch + ... y_hat = self(x) + ... loss = F.cross_entropy(y_hat, y) + ... return pl.TrainResult(loss) + ... + ... def configure_optimizers(self): + ... return torch.optim.Adam(self.parameters(), lr=0.02) + +Which you can train by doing: + +.. code-block:: python + + train_loader = DataLoader(MNIST(os.getcwd(), download=True, transform=transforms.ToTensor())) + trainer = pl.Trainer() + model = LitModel() + + trainer.fit(model, train_loader) + +---------- + +LightningModule for research +---------------------------- +For research, LightningModules are best structured as systems. + +A model (colloquially) refers to something like a resnet or RNN. A system, may be a collection of models. Here +are examples of systems: + +- GAN (generator, discriminator) +- RL (policy, actor, critic) +- Autoencoders (encoder, decoder) +- Seq2Seq (encoder, attention, decoder) +- etc... + +A LightningModule is best used to define a complex system: + +.. 
code-block:: python + + import pytorch_lightning as pl + import torch + from torch import nn + + class Autoencoder(pl.LightningModule): + + def __init__(self, latent_dim=2): + super().__init__() + self.encoder = nn.Sequential(nn.Linear(28 * 28, 256), nn.ReLU(), nn.Linear(256, latent_dim)) + self.decoder = nn.Sequential(nn.Linear(latent_dim, 256), nn.ReLU(), nn.Linear(256, 28 * 28)) + + def training_step(self, batch, batch_idx): + x, _ = batch + + # encode + x = x.view(x.size(0), -1) + z = self.encoder(x) + + # decode + recons = self.decoder(z) + + # reconstruction + reconstruction_loss = nn.functional.mse_loss(recons, x) + return pl.TrainResult(reconstruction_loss) + + def validation_step(self, batch, batch_idx): + x, _ = batch + x = x.view(x.size(0), -1) + z = self.encoder(x) + recons = self.decoder(z) + reconstruction_loss = nn.functional.mse_loss(recons, x) + + result = pl.EvalResult(checkpoint_on=reconstruction_loss) + return result + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.0002) + +Which can be trained like this: + +.. code-block:: python + + autoencoder = Autoencoder() + trainer = pl.Trainer(gpus=1) + trainer.fit(autoencoder, train_dataloader, val_dataloader) + +This simple model generates examples that look like this (the encoders and decoders are too weak) + +.. figure:: https://pl-bolts-doc-images.s3.us-east-2.amazonaws.com/pl_docs/ae_docs.png + :width: 300 + +The methods above are part of the lightning interface: + +- training_step +- validation_step +- test_step +- configure_optimizers + +Note that in this case, the train loop and val loop are exactly the same. We can of course reuse this code. + +.. code-block:: python + + class Autoencoder(pl.LightningModule): + + def __init__(self, latent_dim=2): + super().__init__() + self.encoder = nn.Sequential(nn.Linear(28 * 28, 256), nn.ReLU(), nn.Linear(256, latent_dim)) + self.decoder = nn.Sequential(nn.Linear(latent_dim, 256), nn.ReLU(), nn.Linear(256, 28 * 28)) + + def training_step(self, batch, batch_idx): + loss = self.shared_step(batch) + return pl.TrainResult(loss) + + def validation_step(self, batch, batch_idx): + loss = self.shared_step(batch) + result = pl.EvalResult(checkpoint_on=loss) + return result + + def shared_step(self, batch): + x, _ = batch + + # encode + x = x.view(x.size(0), -1) + z = self.encoder(x) + + # decode + recons = self.decoder(z) + + # loss + return nn.functional.mse_loss(recons, x) + + def configure_optimizers(self): + return torch.optim.Adam(self.parameters(), lr=0.0002) + +We create a new method called `shared_step` that all loops can use. This method name is arbitrary and NOT reserved. + +Inference in Research +^^^^^^^^^^^^^^^^^^^^^ +In the case where we want to perform inference with the system we can add a `forward` method to the LightningModule. + +.. code-block:: python + + class Autoencoder(pl.LightningModule): + def forward(self, x): + return self.decoder(x) + +The advantage of adding a forward is that in complex systems, you can do a much more involved inference procedure, +such as text generation: + +.. code-block:: python + + class Seq2Seq(pl.LightningModule): + + def forward(self, x): + embeddings = self(x) + hidden_states = self.encoder(embeddings) + for h in hidden_states: + # decode + ... + return decoded + +--------------------- + +LightningModule for production +------------------------------ +For cases like production, you might want to iterate different models inside a LightningModule. + +.. 
code-block:: python + + import pytorch_lightning as pl + from pytorch_lightning.metrics import functional as FM + + class ClassificationTask(pl.LightningModule): + + def __init__(self, model): + super().__init__() + self.model = model + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + return pl.TrainResult(loss) + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + acc = FM.accuracy(y_hat, y) + result = pl.EvalResult(checkpoint_on=loss) + result.log_dict({'val_acc': acc, 'val_loss': loss}) + return result + + def test_step(self, batch, batch_idx): + result = self.validation_step(batch, batch_idx) + result.rename_keys({'val_acc': 'test_acc', 'val_loss': 'test_loss'}) + return result + + def configure_optimizers(self): + return torch.optim.Adam(self.model.parameters(), lr=0.02) + +Then pass in any arbitrary model to be fit with this task + +.. code-block:: python + + for model in [resnet50(), vgg16(), BidirectionalRNN()]: + task = ClassificationTask(model) + + trainer = Trainer(gpus=2) + trainer.fit(task, train_dataloader, val_dataloader) + +Tasks can be arbitrarily complex such as implementing GAN training, self-supervised or even RL. + +.. code-block:: python + + class GANTask(pl.LightningModule): + + def __init__(self, generator, discriminator): + super().__init__() + self.generator = generator + self.discriminator = discriminator + ... + +Inference in production +^^^^^^^^^^^^^^^^^^^^^^^ +When used like this, the model can be separated from the Task and thus used in production without needing to keep it in +a `LightningModule`. + +- You can export to onnx. +- Or trace using Jit. +- or run in the python runtime. + +.. code-block:: python + + task = ClassificationTask(model) + + trainer = Trainer(gpus=2) + trainer.fit(task, train_dataloader, val_dataloader) + + # use model after training or load weights and drop into the production system + model.eval() + y_hat = model(x) + + +Training loop +------------- +To add a training loop use the `training_step` method + +.. code-block:: python + + class LitClassifier(pl.LightningModule): + + def __init__(self, model): + super().__init__() + self.model = model + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + return pl.TrainResult(loss) + +Under the hood, Lightning does the following (pseudocode): + +.. code-block:: python + + # put model in train mode + model.train() + torch.set_grad_enabled(True) + + outs = [] + for batch in train_dataloader: + # forward + out = training_step(val_batch) + + # backward + loss.backward() + + # apply and clear grads + optimizer.step() + optimizer.zero_grad() + +Training epoch-level metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If you want to calculate epoch-level metrics and log them, use the `TrainResult.log` method + +.. code-block:: python + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + result = pl.TrainResult(loss) + + # logs metrics for each training_step, and the average across the epoch, to the progress bar and logger + result.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + return result + +The `TrainResult.log` object automatically reduces the requested metrics across the full epoch. +Here's the pseudocode of what it does under the hood: + +.. 
code-block:: python + + outs = [] + for batch in train_dataloader: + # forward + out = training_step(val_batch) + + # backward + loss.backward() + + # apply and clear grads + optimizer.step() + optimizer.zero_grad() + + epoch_metric = torch.mean(torch.stack([x['train_loss'] for x in outs])) + +Train epoch-level operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If you need to do something with all the outputs of each `training_step`, override `training_epoch_end` yourself. + +.. code-block:: python + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + result = pl.TrainResult(loss) + result.prediction = some_prediction + + def training_epoch_end(self, training_step_outputs): + all_predictions = training_step_outputs.prediction + ... + return result + +The matching pseudocode is: + +.. code-block:: python + + outs = [] + for batch in train_dataloader: + # forward + out = training_step(val_batch) + + # backward + loss.backward() + + # apply and clear grads + optimizer.step() + optimizer.zero_grad() + + epoch_out = training_epoch_end(outs) + +Training with DataParallel +^^^^^^^^^^^^^^^^^^^^^^^^^^ +When training using a `distributed_backend` that splits data from each batch across GPUs, sometimes you might +need to aggregate them on the master GPU for processing (dp, or ddp2). + +In this case, implement the `training_step_end` method + +.. code-block:: python + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + result = pl.TrainResult(loss) + result.prediction = some_prediction + + def training_step_end(self, batch_parts): + gpu_0_prediction = batch_parts.prediction[0] + gpu_1_prediction = batch_parts.prediction[1] + + # do something with both outputs + return result + + def training_epoch_end(self, training_step_outputs): + all_predictions = training_step_outputs.prediction + ... + return result + +The full pseudocode that lighting does under the hood is: + +.. code-block:: python + + outs = [] + for train_batch in train_dataloader: + batches = split_batch(train_batch) + dp_outs = [] + for sub_batch in batches: + # 1 + dp_out = training_step(sub_batch) + dp_outs.append(dp_out) + + # 2 + out = training_step_end(dp_outs) + outs.append(out) + + # do something with the outputs for all batches + # 3 + training_epoch_end(outs) + +------------------ + +Validation loop +--------------- +To add a validation loop, override the `validation_step` method of the :class:`~LightningModule`: + +.. code-block:: python + + class LitModel(pl.LightningModule): + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(checkpoint_on=loss) + return result + +Under the hood, Lightning does the following: + +.. code-block:: python + + # ... + for batch in train_dataloader: + loss = model.training_step() + loss.backward() + # ... + + if validate_at_some_point: + # disable grads + batchnorm + dropout + torch.set_grad_enabled(False) + model.eval() + + # ----------------- VAL LOOP --------------- + for val_batch in model.val_dataloader: + val_out = model.validation_step(val_batch) + # ----------------- VAL LOOP --------------- + + # enable grads + batchnorm + dropout + torch.set_grad_enabled(True) + model.train() + +Validation epoch-level metrics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If you need to do something with all the outputs of each `validation_step`, override `validation_epoch_end`. + +.. 
code-block:: python + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(loss) + result.prediction = some_prediction + + def validation_epoch_end(self, validation_step_outputs): + all_predictions = validation_step_outputs.prediction + ... + return result + +Validating with DataParallel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +When training using a `distributed_backend` that splits data from each batch across GPUs, sometimes you might +need to aggregate them on the master GPU for processing (dp, or ddp2). + +In this case, implement the `validation_step_end` method + +.. code-block:: python + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.model(x) + loss = F.cross_entropy(y_hat, y) + result = pl.EvalResult(loss) + result.prediction = some_prediction + + def validation_step_end(self, batch_parts): + gpu_0_prediction = batch_parts.prediction[0] + gpu_1_prediction = batch_parts.prediction[1] + + # do something with both outputs + return result + + def validation_epoch_end(self, validation_step_outputs): + all_predictions = validation_step_outputs.prediction + ... + return result + +The full pseudocode that lighting does under the hood is: + +.. code-block:: python + + outs = [] + for batch in dataloader: + batches = split_batch(batch) + dp_outs = [] + for sub_batch in batches: + # 1 + dp_out = validation_step(sub_batch) + dp_outs.append(dp_out) + + # 2 + out = validation_step_end(dp_outs) + outs.append(out) + + # do something with the outputs for all batches + # 3 + validation_epoch_end(outs) + +---------------- + +Test loop +--------- +The process for adding a test loop is the same as the process for adding a validation loop. Please refer to +the section above for details. + +The only difference is that the test loop is only called when `.test()` is used: + +.. code-block:: python + + model = Model() + trainer = Trainer() + trainer.fit() + + # automatically loads the best weights for you + trainer.test(model) + +There are two ways to call `test()`: + +.. code-block:: python + + # call after training + trainer = Trainer() + trainer.fit(model) + + # automatically auto-loads the best weights + trainer.test(test_dataloaders=test_dataloader) + + # or call with pretrained model + model = MyLightningModule.load_from_checkpoint(PATH) + trainer = Trainer() + trainer.test(model, test_dataloaders=test_dataloader) + +---------- + +Live demo +--------- +Check out this +`COLAB `_ +for a live demo. + +----------- + +LightningModule API +------------------- + +Training loop methods +^^^^^^^^^^^^^^^^^^^^^ + +training_step +~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.training_step + :noindex: + +training_step_end +~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.training_step_end + :noindex: + +training_epoch_end +~~~~~~~~~~~~~~~~~~ +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.training_epoch_end + :noindex: + +--------------- + +Validation loop methods +^^^^^^^^^^^^^^^^^^^^^^^ + +validation_step +~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.validation_step + :noindex: + +validation_step_end +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.validation_step_end + :noindex: + +validation_epoch_end +~~~~~~~~~~~~~~~~~~~~ + +.. 
autofunction:: pytorch_lightning.core.lightning.LightningModule.validation_epoch_end + :noindex: + +---------------- + +test loop methods +^^^^^^^^^^^^^^^^^ + +test_step +~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.test_step + :noindex: + +test_step_end +~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.test_step_end + :noindex: + +test_epoch_end +~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.test_epoch_end + :noindex: + +-------------- + +configure_optimizers +^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.configure_optimizers + :noindex: + +-------------- + +Convenience methods +^^^^^^^^^^^^^^^^^^^ +Use these methods for convenience + +print +~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.print + :noindex: + +save_hyperparameters +~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.save_hyperparameters + :noindex: + +------------ + +Inference methods +^^^^^^^^^^^^^^^^^ +Use these hooks for inference with a lightning module + +forward +~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.forward + :noindex: + +freeze +~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.freeze + :noindex: + +to_onnx +~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.to_onnx + :noindex: + +unfreeze +~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.unfreeze + :noindex: + +------------ + +Properties +^^^^^^^^^^ +These are properties available in a LightningModule. + +----------- + +current_epoch +~~~~~~~~~~~~~ +The current epoch + +.. code-block:: python + + def training_step(...): + if self.current_epoch == 0: + +------------- + +device +~~~~~~ +The device the module is on. Use it to keep your code device agnostic + +.. code-block:: python + + def training_step(...): + z = torch.rand(2, 3, device=self.device) + +------------- + +global_rank +~~~~~~~~~~~ +The global_rank of this LightningModule. Lightning saves logs, weights etc only from global_rank = 0. You +normally do not need to use this property + +Global rank refers to the index of that GPU across ALL GPUs. For example, if using 10 machines, each with 4 GPUs, +the 4th GPU on the 10th machine has global_rank = 39 + +------------- + +global_step +~~~~~~~~~~~ +The current step (does not reset each epoch) + +.. code-block:: python + + def training_step(...): + self.logger.experiment.log_image(..., step=self.global_step) + +------------- + +hparams +~~~~~~~ +After calling `save_hyperparameters` anything passed to init() is available via hparams. + +.. code-block:: python + + def __init__(self, learning_rate): + self.save_hyperparameters() + + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.hparams.learning_rate) + +-------------- + +logger +~~~~~~ +The current logger being used (tensorboard or other supported logger) + +.. code-block:: python + + def training_step(...): + # the generic logger (same no matter if tensorboard or other supported logger) + self.logger + + # the particular logger + tensorboard_logger = self.logger.experiment + +-------------- + +local_rank +~~~~~~~~~~~ +The local_rank of this LightningModule. Lightning saves logs, weights etc only from global_rank = 0. You +normally do not need to use this property + +Local rank refers to the rank on that machine. 
For example, if using 10 machines, the GPU at index 0 on each machine +has local_rank = 0. + + +----------- + +precision +~~~~~~~~~ +The type of precision used: + +.. code-block:: python + + def training_step(...): + if self.precision == 16: + +------------ + +trainer +~~~~~~~ +Pointer to the trainer + +.. code-block:: python + + def training_step(...): + max_steps = self.trainer.max_steps + any_flag = self.trainer.any_flag + +------------ + +use_ddp +~~~~~~~ +True if using ddp + +------------ + +use_ddp2 +~~~~~~~~ +True if using ddp2 + +------------ + +use_dp +~~~~~~ +True if using dp + +------------ + +use_tpu +~~~~~~~ +True if using TPUs + +-------------- + +Hooks +----- + +Hook lifecycle pseudocode +^^^^^^^^^^^^^^^^^^^^^^^^^ +This is the pseudocode to describe how all the hooks are called during a call to `.fit()` + +.. code-block:: python + + def fit(...): + on_fit_start() + + if global_rank == 0: + # prepare data is called on GLOBAL_ZERO only + prepare_data() + + for gpu/tpu in gpu/tpus: + train_on_device(model.copy()) + + on_fit_end() + + def train_on_device(model): + # setup is called PER DEVICE + setup() + configure_optimizers() + on_pretrain_routine_start() + + for epoch in epochs: + train_loop() + + teardown() + + def train_loop(): + on_train_epoch_start() + train_outs = [] + for train_batch in train_dataloader(): + on_train_batch_start() + + # ----- train_step methods ------- + out = training_step(batch) + train_outs.append(out) + + loss = out.loss + + backward() + on_after_backward() + optimizer_step() + on_before_zero_grad() + optimizer_zero_grad() + + on_train_batch_end() + + if should_check_val: + val_loop() + + # end training epoch + logs = training_epoch_end(outs) + + def val_loop(): + model.eval() + torch.set_grad_enabled(False) + + on_validation_epoch_start() + val_outs = [] + for val_batch in val_dataloader(): + on_validation_batch_start() + + # -------- val step methods ------- + out = validation_step(val_batch) + val_outs.append(out) + + on_validation_batch_end() + + validation_epoch_end(val_outs) + on_validation_epoch_end() + + # set up for train + model.train() + torch.set_grad_enabled(True) + + +Advanced hooks +^^^^^^^^^^^^^^ +Use these hooks to modify advanced functionality + +configure_apex +~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.configure_apex + :noindex: + +configure_ddp +~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.configure_ddp + :noindex: + +configure_sync_batchnorm +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.configure_ddp + :noindex: + +get_progress_bar_dict +~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict + :noindex: + +init_ddp_connection +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.init_ddp_connection + :noindex: + +tbptt_split_batch +~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.tbptt_split_batch + :noindex: + +Checkpoint hooks +^^^^^^^^^^^^^^^^ +These hooks allow you to modify checkpoints + +on_load_checkpoint +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.on_load_checkpoint + :noindex: + +on_save_checkpoint +~~~~~~~~~~~~~~~~~~ + +.. 
autofunction:: pytorch_lightning.core.lightning.LightningModule.on_save_checkpoint + :noindex: + +------------- + +Data hooks +^^^^^^^^^^ +Use these hooks if you want to couple a LightningModule to a dataset. + +.. note:: The same collection of hooks is available in a DataModule class to decouple the data from the model. + +train_dataloader +~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.train_dataloader + :noindex: + +val_dataloader +~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.val_dataloader + :noindex: + +test_dataloader +~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.test_dataloader + :noindex: + +prepare_data +~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.prepare_data + :noindex: + +------------ + +Optimization hooks +^^^^^^^^^^^^^^^^^^ +These are hooks related to the optimization procedure. + +backward +~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.backward + :noindex: + +on_after_backward +~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.on_after_backward + :noindex: + +on_before_zero_grad +~~~~~~~~~~~~~~~~~~~ +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.on_before_zero_grad + :noindex: + +optimizer_step +~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.optimizer_step + :noindex: + +optimizer_zero_grad +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.lightning.LightningModule.optimizer_zero_grad + :noindex: + +Training lifecycle hooks +^^^^^^^^^^^^^^^^^^^^^^^^^ +These hooks are called during training + +on_fit_start +~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_fit_start + :noindex: + +on_fit_end +~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_fit_end + :noindex: + +on_pretrain_routine_start +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_pretrain_routine_start + :noindex: + +on_pretrain_routine_end +~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_pretrain_routine_end + :noindex: + +on_test_epoch_start +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_test_epoch_start + :noindex: + +on_test_epoch_end +~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_test_epoch_end + :noindex: + +on_test_batch_start +~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_test_batch_start + :noindex: + +on_test_batch_end +~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_test_batch_end + :noindex: + +on_train_batch_start +~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_train_batch_start + :noindex: + +on_train_batch_end +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_train_batch_end + :noindex: + +on_train_epoch_start +~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_train_epoch_start + :noindex: + +on_train_epoch_end +~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_train_epoch_end + :noindex: + +on_validation_batch_start +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_validation_batch_start + :noindex: + +on_validation_batch_end +~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_validation_batch_end + :noindex: + +on_validation_epoch_start +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_validation_epoch_start + :noindex: + +on_validation_epoch_end +~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.on_validation_epoch_end + :noindex: + +setup +~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.setup + :noindex: + +teardown +~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.teardown + :noindex: + +transfer_batch_to_device +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: pytorch_lightning.core.hooks.ModelHooks.transfer_batch_to_device + :noindex: diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 166750bb652b09..e3aebefc788126 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -12,6 +12,7 @@ Quick Start =========== PyTorch Lightning is nothing more than organized PyTorch code. + Once you've organized it into a LightningModule, it automates most of the training for you. To illustrate, here's the typical PyTorch project structure organized in a LightningModule. @@ -107,7 +108,7 @@ All of it 100% rigorously tested and benchmarked Training loop under the hood ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Under the hood, lightning does (in high-level pseudocode): +Under the hood, lightning does the following (in high-level pseudocode): .. code-block:: python @@ -156,7 +157,12 @@ To add an (optional) validation loop add the following function x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) - return {'val_loss': loss, 'log': {'val_loss': loss}} + + result = pl.EvalResult(checkpoint_on=loss) + result.log('val_loss', loss) + return result + +.. note:: EvalResult is a plain Dict, with convenience functions for logging And now the trainer will call the validation loop automatically @@ -216,7 +222,10 @@ You might also need an optional test loop x, y = batch y_hat = self(x) loss = F.cross_entropy(y_hat, y) - return {'test_loss': loss, 'log': {'test_loss': loss}} + + result = pl.EvalResult() + result.log('test_loss', loss) + return result However, this time you need to specifically call test (this is done so you don't use the test set by mistake) @@ -426,33 +435,18 @@ Lightning has built-in logging to any of the supported loggers or progress bar. Log in train loop ^^^^^^^^^^^^^^^^^ -To log from the training loop use the `log` reserved key. - -.. code-block:: python - - def training_step(self, batch, batch_idx): - loss = ... - return {'loss': loss, 'log': {'train_loss': loss}} - - -However, for more fine-grain control use the `TrainResult` object. -These are equivalent: +To log from the training loop use the `log` method in the `TrainResult`. .. code-block:: python def training_step(self, batch, batch_idx): loss = ... - return {'loss': loss, 'log': {'train_loss': loss}} - - # equivalent - def training_step(self, batch, batch_idx): - loss = ... - result = pl.TrainResult(minimize=loss) result.log('train_loss', loss) return result -But the TrainResult gives you error-checking and greater flexibility: +The `TrainResult` gives you options for logging on every step and/or at the end of the epoch. +It also allows logging to the progress bar. .. 
code-block:: python @@ -471,42 +465,16 @@ Then boot up your logger or tensorboard instance to view training logs Log in Val/Test loop ^^^^^^^^^^^^^^^^^^^^ -To log from the validation or test loop use a similar approach +To log from the validation or test loop use the `EvalResult`. .. code-block:: python def validation_step(self, batch, batch_idx): loss = ... - acc = ... - val_output = {'loss': loss, 'acc': acc} - return val_output - - def validation_epoch_end(self, validation_step_outputs): - # this step allows you to aggregate whatever you passed in from every val step - val_epoch_loss = torch.stack([x['loss'] for x in val_output]).mean() - val_epoch_acc = torch.stack([x['acc'] for x in val_output]).mean() - return { - 'val_loss': val_epoch_loss, - 'log': {'avg_val_loss': val_epoch_loss, 'avg_val_acc': val_epoch_acc} - } - -The recommended equivalent version in case you don't need to do anything special -with all the outputs of the validation step: - -.. code-block:: python - - def validation_step(self, batch, batch_idx): - loss = ... - acc = ... - - result = pl.EvalResult(checkpoint_on=loss) - result.log('val_loss', loss) - result.log('val_acc', acc) + result = pl.EvalResult() + result.log_dict({'val_loss': loss, 'val_acc': acc}) return result -.. note:: Only use `validation_epoch_end` if you need fine-grain control over aggreating all step outputs - - Log to the progress bar ^^^^^^^^^^^^^^^^^^^^^^^ | @@ -518,25 +486,47 @@ Log to the progress bar | -In addition to visual logging, you can log to the progress bar by using the keyword `progress_bar`: +In addition to visual logging, you can log to the progress bar by setting `prog_bar` to True .. code-block:: python def training_step(self, batch, batch_idx): loss = ... - return {'loss': loss, 'progress_bar': {'train_loss': loss}} + result = pl.TrainResult(loss) + result.log('train_loss', loss, prog_bar=True) + +----------------- -Or simply set `prog_bar=True` in either of the `EvalResult` or `TrainResult` +Advanced loop aggregation +------------------------- +For certain train/val/test loops, you may wish to do more than just logging. In this case, +you can also implement `__epoch_end` which gives you the output for each step + +Here's the motivating Pytorch example: .. code-block:: python - def training_step(self, batch, batch_idx): - result = TrainResult(loss) - result.log('train_loss', loss, prog_bar=True) - return result + validation_step_outputs = [] + for batch_idx, batch in val_dataloader(): + out = validation_step(batch, batch_idx) + validation_step_outputs.append(out) + validation_epoch_end(validation_step_outputs) ------------------ +And the lightning equivalent + +.. code-block:: python + + def validation_step(self, batch, batch_idx): + loss = ... + predictions = ... + result = pl.EvalResult(checkpoint_on=loss) + result.log('val_loss', loss) + result.predictions = predictions + + def validation_epoch_end(self, validation_step_outputs): + all_val_losses = validation_step_outputs.val_loss + all_predictions = validation_step_outputs.predictions Why do you need Lightning? -------------------------- @@ -544,12 +534,19 @@ The MAIN teakeaway points are: - Lightning is for professional AI researchers/production teams. - Lightning is organized PyTorch. It is not an abstraction. +- You STILL keep pure PyTorch. +- You DON't lose any flexibility. +- You can get rid of all of your boilerplate. +- You make your code generalizable to any hardware. 
+- Your code is now readable and easier to reproduce (ie: you help with the reproducibility crisis). +- Your LightningModule is still just a pure PyTorch module. Lightning is for you if ^^^^^^^^^^^^^^^^^^^^^^^ - You're a professional researcher/ml engineer working on non-trivial deep learning. - You already know PyTorch and are not a beginner. +- You want to iterate through research much faster. - You want to put models into production much faster. - You need full control of all the details but don't need the boilerplate. - You want to leverage code written by hundreds of AI researchers, research engs and PhDs from the world's top AI labs. @@ -617,13 +614,12 @@ would normally do. --------------- -Summary -------- -In short, by refactoring your PyTorch code: +Masterclass +----------- +You can learn Lightning in-depth by watching our Masterclass. -1. You STILL keep pure PyTorch. -2. You DON't lose any flexibility. -3. You can get rid of all of your boilerplate. -4. You make your code generalizable to any hardware. -5. Your code is now readable and easier to reproduce (ie: you help with the reproducibility crisis). -6. Your LightningModule is still just a pure PyTorch module. +.. image:: _images/general/PTL101_youtube_thumbnail.jpg + :width: 500 + :align: center + :alt: Masterclass + :target: https://www.youtube.com/playlist?list=PLaMu-SDt_RB5NUm67hU2pdE75j6KaIOv2 diff --git a/docs/source/results.rst b/docs/source/results.rst index 960cda2bcf399f..ed583ded863977 100644 --- a/docs/source/results.rst +++ b/docs/source/results.rst @@ -2,33 +2,17 @@ Result ====== Lightning has two results objects `TrainResult` and `EvalResult`. -When your `_step_end` or `_epoch_end` does nothing but aggregate metrics to log, you can delete those -methods and use a Result object instead. +Use these to control: -However, if you need fine-grain control to do more than logging or a complex aggregation, then keep -the loops as they are and do not use the `EvalResult` or `TrainResult` objects. - -.. note:: These objects are optional and should only be used if you don't need full control of the loops. +- When to log (each step and/or epoch aggregate). +- Where to log (progress bar or a logger). +- How to sync across accelerators. ------------------ Training loop example --------------------- -We can simplify the following multi-method training loop: - -.. code-block:: python - - def training_step(self, batch, batch_idx): - return {'loss': loss} - - def training_epoch_end(self, training_step_outputs): - epoch_loss = torch.stack([x['loss'] for x in training_step_outputs]).mean() - return { - 'log': {'epoch_loss': epoch_loss}, - 'progress_bar': {'epoch_loss': epoch_loss} - } - -using the equivalent syntax via the `TrainResult` object: +Return a `TrainResult` from the Training loop. .. code-block:: python @@ -38,34 +22,54 @@ using the equivalent syntax via the `TrainResult` object: result.log('train_loss', loss, prog_bar=True) return result +If you'd like to do something special with the outputs other than logging, implement `__epoch_end`. + +.. 
code-block:: python + + def training_step(self, batch, batch_idx): + result = pl.TrainResult(loss) + result.some_prediction = some_prediction + return result + + def training_epoch_end(self, training_step_output_result): + all_train_predictions = training_step_output_result.some_prediction + + training_step_output_result.some_new_prediction = some_new_prediction + return training_step_output_result + -------------------- Validation/Test loop example ----------------------------- -We can replace the following validation/test loop: +Return a `EvalResult` object from a validation/test loop .. code-block:: python def validation_step(self, batch, batch_idx): - return {'some_metric': some_metric} - - def validation_epoch_end(self, validation_step_outputs): - some_metric_mean = torch.stack([x['some_metric'] for x in validation_step_outputs]).mean() - return { - 'log': {'some_metric_mean': some_metric_mean}, - 'progress_bar': {'some_metric_mean': some_metric_mean} - } + some_metric = ... + result = pl.EvalResult(checkpoint_on=some_metric) + result.log('some_metric', some_metric, prog_bar=True) + return result -With the equivalent using the `EvalResult` syntax +If you'd like to do something special with the outputs other than logging, implement `__epoch_end`. .. code-block:: python def validation_step(self, batch, batch_idx): - some_metric = ... result = pl.EvalResult(checkpoint_on=some_metric) - result.log('some_metric', some_metric, prog_bar=True) + result.a_prediction = some_prediction return result + def validation_epoch_end(self, validation_step_output_result): + all_validation_step_predictions = validation_step_output_result.a_prediction + # do something with the predictions from all validation_steps + + return validation_step_output_result + + +With the equivalent using the `EvalResult` syntax + + ------------------ TrainResult @@ -161,7 +165,6 @@ Finally, you can use your own reduction function instead: Finally, you may need more esoteric logging such as something specific to your logger like images: - .. code-block:: python def training_step(...): @@ -171,6 +174,14 @@ Finally, you may need more esoteric logging such as something specific to your l # also log images (if tensorboard for example) self.logger.experiment.log_figure(...) +Sync across devices +^^^^^^^^^^^^^^^^^^^ +When training on multiple GPUs/CPUs/TPU cores, calculate the global mean of a logged metric as follows: + +.. code-block:: python + + result.log('train_loss', loss, sync_dist=True) + TrainResult API ^^^^^^^^^^^^^^^ @@ -226,6 +237,14 @@ Val/Test loop ^^^^^^^^^^^^^ Eval result can be used in both `test_step` and `validation_step`. +Sync across devices (v) +^^^^^^^^^^^^^^^^^^^^^^^ +When training on multiple GPUs/CPUs/TPU cores, calculate the global mean of a logged metric as follows: + +.. 
code-block:: python + + result.log('val_loss', loss, sync_dist=True) + EvalResult API ^^^^^^^^^^^^^^^ diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py index 3b4aed8566bb65..8f4ef9da4087e9 100644 --- a/pytorch_lightning/callbacks/early_stopping.py +++ b/pytorch_lightning/callbacks/early_stopping.py @@ -89,15 +89,6 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: self.best_score = torch_inf if self.monitor_op == torch.lt else -torch_inf def _validate_condition_metric(self, logs): - """ - Checks that the condition metric for early stopping is good - - Args: - logs: callback metrics from validation output - - Return: - True if specified metric is available - """ monitor_val = logs.get(self.monitor) error_msg = (f'Early stopping conditioned on metric `{self.monitor}`' f' which is not available. Either add `{self.monitor}` to the return of ' diff --git a/pytorch_lightning/callbacks/lr_logger.py b/pytorch_lightning/callbacks/lr_logger.py index a4f2ddc75789ca..0188be550bd11f 100755 --- a/pytorch_lightning/callbacks/lr_logger.py +++ b/pytorch_lightning/callbacks/lr_logger.py @@ -94,8 +94,6 @@ def on_epoch_start(self, trainer, pl_module): trainer.logger.log_metrics(latest_stat, step=trainer.current_epoch) def _extract_lr(self, trainer, interval): - """ Extracts learning rates for lr schedulers and saves information - into dict structure. """ latest_stat = {} for name, scheduler in zip(self.lr_sch_names, trainer.lr_schedulers): diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py index c29fea079f650d..7116da179afa8f 100644 --- a/pytorch_lightning/core/__init__.py +++ b/pytorch_lightning/core/__init__.py @@ -1,362 +1,3 @@ -""" -A :class:`~LightningModule` organizes your PyTorch code into the following sections: - -.. figure:: /_images/lightning_module/pt_to_pl.png - :alt: Convert from PyTorch to Lightning - - -Notice a few things. - -1. It's the SAME code. -2. The PyTorch code IS NOT abstracted - just organized. -3. All the other code that's not in the :class:`~LightningModule` - has been automated for you by the trainer. - - .. code-block:: python - - net = Net() - trainer = Trainer() - trainer.fit(net) - -4. There are no .cuda() or .to() calls... Lightning does these for you. - - .. code-block:: python - - # don't do in lightning - x = torch.Tensor(2, 3) - x = x.cuda() - x = x.to(device) - - # do this instead - x = x # leave it alone! - - # or to init a new tensor - new_x = torch.Tensor(2, 3) - new_x = new_x.type_as(x.type()) - -5. There are no samplers for distributed, Lightning also does this for you. - - .. code-block:: python - - # Don't do in Lightning... - data = MNIST(...) - sampler = DistributedSampler(data) - DataLoader(data, sampler=sampler) - - # do this instead - data = MNIST(...) - DataLoader(data) - -6. A :class:`~LightningModule` is a :class:`torch.nn.Module` but with added functionality. Use it as such! - - .. code-block:: python - - net = Net.load_from_checkpoint(PATH) - net.freeze() - out = net(x) - -Thus, to use Lightning, you just need to organize your code which takes about 30 minutes, -(and let's be real, you probably should do anyhow). - ------------- - -Minimal Example ---------------- - -Here are the only required methods. - -.. code-block:: python - - >>> import pytorch_lightning as pl - >>> class LitModel(pl.LightningModule): - ... - ... def __init__(self): - ... super().__init__() - ... self.l1 = torch.nn.Linear(28 * 28, 10) - ... - ... 
def forward(self, x): - ... return torch.relu(self.l1(x.view(x.size(0), -1))) - ... - ... def training_step(self, batch, batch_idx): - ... x, y = batch - ... y_hat = self(x) - ... loss = F.cross_entropy(y_hat, y) - ... return pl.TrainResult(loss) - ... - ... def configure_optimizers(self): - ... return torch.optim.Adam(self.parameters(), lr=0.02) - -Which you can train by doing: - -.. code-block:: python - - train_loader = DataLoader(MNIST(os.getcwd(), train=True, download=True, transform=transforms.ToTensor())) - trainer = pl.Trainer() - model = LitModel() - - trainer.fit(model, train_loader) - ----------- - -Training loop structure ------------------------ - -The general pattern is that each loop has a single method to worry about - -- ``___step`` - -If you need more control, there are two optional methods. - -- ``___step_end`` -- ``___epoch_end`` - -To show how Lightning calls these, let's use the validation loop as an example: - -.. code-block:: python - - # put model in prediction mode - model.eval() - torch.set_grad_enabled(False) - - val_outs = [] - for val_batch in val_data: - # do something with each batch - out = validation_step(val_batch) - val_outs.append(out) - - # do something with the outputs for all batches - # like calculate validation set accuracy or loss - validation_epoch_end(val_outs) - - # put model back in train mode - model.train() - torch.set_grad_enabled(True) - -If we use dp or ddp2 mode, we can also define the ``XXX_step_end`` method to operate -on all parts of the batch:: - - val_outs = [] - for val_batch in val_data: - batches = split_batch(val_batch) - dp_outs = [] - for sub_batch in batches: - dp_out = validation_step(sub_batch) - dp_outs.append(dp_out) - - out = validation_step_end(dp_outs) - val_outs.append(out) - - # do something with the outputs for all batches - # like calculate validation set accuracy or loss - validation_epoch_end(val_outs) - - -Add validation loop -^^^^^^^^^^^^^^^^^^^ - -Thus, if we wanted to add a validation loop you would add this to your -:class:`~LightningModule`: - - >>> import pytorch_lightning as pl - >>> class LitModel(pl.LightningModule): - ... def validation_step(self, batch, batch_idx): - ... x, y = batch - ... y_hat = self(x) - ... loss = F.cross_entropy(y_hat, y) - ... result = pl.EvalResult(checkpoint_on=loss) - ... result.log('val_loss', loss) - ... return result - -The equivalent expanded version (which you normally wouldn't need to use) is the following: - - >>> import pytorch_lightning as pl - >>> class LitModel(pl.LightningModule): - ... def validation_step(self, batch, batch_idx): - ... x, y = batch - ... y_hat = self(x) - ... return {'val_loss': F.cross_entropy(y_hat, y)} - ... - ... def validation_epoch_end(self, outputs): - ... val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean() - ... return {'val_loss': val_loss_mean} - ... - ... def val_dataloader(self): - ... # can also return a list of val dataloaders - ... return DataLoader(...) - -Add test loop -^^^^^^^^^^^^^ - - >>> import pytorch_lightning as pl - >>> class LitModel(pl.LightningModule): - ... def test_step(self, batch, batch_idx): - ... x, y = batch - ... y_hat = self(x) - ... loss = F.cross_entropy(y_hat, y) - ... result = pl.EvalResult(checkpoint_on=loss) - ... result.log('test_loss', loss) - ... return result - -However, the test loop won't ever be called automatically to make sure you -don't run your test data by accident. Instead you have to explicitly call: - -.. 
code-block:: python - - # call after training - trainer = Trainer() - trainer.fit(model) - trainer.test(test_dataloaders=test_dataloader) - - # or call with pretrained model - model = MyLightningModule.load_from_checkpoint(PATH) - trainer = Trainer() - trainer.test(model, test_dataloaders=test_dataloader) - -------------- - -TrainResult -^^^^^^^^^^^ -When you are using the `_step_end` and `_epoch_end` only for aggregating metrics and then logging, -consider using either a `EvalResult` or `TrainResult` instead. - -Here's a training loop structure - -.. code-block:: python - - def training_step(self, batch, batch_idx): - return {'loss': loss} - - def training_epoch_end(self, training_step_outputs): - epoch_loss = torch.stack([x['loss'] for x in training_step_outputs]).mean() - return { - 'log': {'epoch_loss': epoch_loss}, - 'progress_bar': {'epoch_loss': epoch_loss} - } - -using the equivalent syntax via the `TrainResult` object: - -.. code-block:: python - - def training_step(self, batch_subset, batch_idx): - loss = ... - result = pl.TrainResult(minimize=loss) - result.log('train_loss', loss, prog_bar=True) - return result - -EvalResult -^^^^^^^^^^ -Same for val/test loop - -.. code-block:: python - - def validation_step(self, batch, batch_idx): - return {'some_metric': some_metric} - - def validation_epoch_end(self, validation_step_outputs): - some_metric_mean = torch.stack([x['some_metric'] for x in validation_step_outputs]).mean() - return { - 'log': {'some_metric_mean': some_metric_mean}, - 'progress_bar': {'some_metric_mean': some_metric_mean} - } - -With the equivalent using the `EvalResult` syntax - -.. code-block:: python - - def validation_step(self, batch, batch_idx): - some_metric = ... - result = pl.EvalResult(checkpoint_on=some_metric) - result.log('some_metric', some_metric, prog_bar=True) - return result - ----------- - -Training_step_end method ------------------------- -When using :class:`~pytorch_lightning.overrides.data_parallel.LightningDataParallel` or -:class:`~pytorch_lightning.overrides.data_parallel.LightningDistributedDataParallel`, the -:meth:`~LightningModule.training_step` -will be operating on a portion of the batch. This is normally okay but in special -cases like calculating NCE loss using negative samples, we might want to -perform a softmax across all samples in the batch. - -For these types of situations, each loop has an additional ``__step_end`` method -which allows you to operate on the pieces of the batch: - -.. code-block:: python - - training_outs = [] - for train_batch in train_data: - # dp, ddp2 splits the batch - sub_batches = split_batches_for_dp(batch) - - # run training_step on each piece of the batch - batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches] - - # do softmax with all pieces - out = training_step_end(batch_parts_outputs) - training_outs.append(out) - - # do something with the outputs for all batches - # like calculate validation set accuracy or loss - training_epoch_end(val_outs) - ----------- - -Remove cuda calls ------------------ -In a :class:`~LightningModule`, all calls to ``.cuda()`` -and ``.to(device)`` should be removed. Lightning will do these -automatically. This will allow your code to work on CPUs, TPUs and GPUs. - -When you init a new tensor in your code, just use :meth:`~torch.Tensor.type_as`: - -.. 
code-block:: python - - def training_step(self, batch, batch_idx): - x, y = batch - - # put the z on the appropriate gpu or tpu core - z = sample_noise() - z = z.type_as(x) - ----------- - -Lifecycle ---------- -The methods in the :class:`~LightningModule` are called in this order: - -1. :meth:`~LightningModule.__init__` -2. :meth:`~LightningModule.prepare_data` -3. :meth:`~LightningModule.configure_optimizers` -4. :meth:`~LightningModule.train_dataloader` - -If you define a validation loop then - -5. :meth:`~LightningModule.val_dataloader` - -And if you define a test loop: - -6. :meth:`~LightningModule.test_dataloader` - -Note: - :meth:`~LightningModule.test_dataloader` is only called with ``.test()`` - -In every epoch, the loop methods are called in this frequency: - -1. :meth:`~LightningModule.validation_step` called every batch -2. :meth:`~LightningModule.validation_epoch_end` called every epoch - -Live demo ---------- -Check out this -`COLAB `_ -for a live demo. - -LightningModule Class ---------------------- - -""" from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.lightning import LightningModule diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index 2bf9c18cf7593b..c594b2f04bb8fc 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -121,7 +121,51 @@ def on_train_batch_end(self, batch: Any, batch_idx: int, dataloader_idx: int) -> batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ - # do something when the batch end + # do something when the batch ends + + def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the validation loop before anything happens for that batch. + + Args: + batch: The batched data as it is returned by the training DataLoader. + batch_idx: the index of the batch + dataloader_idx: the index of the dataloader + """ + # do something when the batch starts + + def on_validation_batch_end(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the validation loop after the batch. + + Args: + batch: The batched data as it is returned by the training DataLoader. + batch_idx: the index of the batch + dataloader_idx: the index of the dataloader + """ + # do something when the batch ends + + def on_test_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the test loop before anything happens for that batch. + + Args: + batch: The batched data as it is returned by the training DataLoader. + batch_idx: the index of the batch + dataloader_idx: the index of the dataloader + """ + # do something when the batch starts + + def on_test_batch_end(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None: + """ + Called in the test loop after the batch. + + Args: + batch: The batched data as it is returned by the training DataLoader. 
+ batch_idx: the index of the batch + dataloader_idx: the index of the dataloader + """ + # do something when the batch ends def on_batch_start(self, batch: Any) -> None: """ diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index d23cde63f450eb..66d067a5146b62 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -24,6 +24,7 @@ from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args +from pytorch_lightning.core.step_result import TrainResult, EvalResult try: import torch_xla.core.xla_model as xm @@ -68,6 +69,9 @@ def __init__(self, *args, **kwargs): #: True if using amp self.use_amp = False + #: The precision used + self.precision = 32 + # optionally can be set by user self._example_input_array = None self._datamodule = None @@ -115,7 +119,6 @@ def forward(self, x): if self.trainer.is_global_zero: print(*args, **kwargs) - @abstractmethod def forward(self, *args, **kwargs): r""" Same as :meth:`torch.nn.Module.forward()`, however in Lightning you want this to define @@ -168,7 +171,7 @@ def forward(self, batch): """ - def training_step(self, *args, **kwargs) -> Union[int, Dict[str, Union[Tensor, Dict[str, Union[float, Tensor]]]]]: + def training_step(self, *args, **kwargs): r""" Here you compute and return the training loss and some additional metrics for e.g. the progress bar or logger. @@ -182,69 +185,72 @@ def training_step(self, *args, **kwargs) -> Union[int, Dict[str, Union[Tensor, D :paramref:`~pytorch_lightning.trainer.trainer.Trainer.truncated_bptt_steps` > 0. Return: - Dict with loss key and optional log or progress bar keys. - When implementing :meth:`training_step`, return whatever you need in that step: + :class:`~pytorch_lightning.core.step_result.TrainResult` - - loss -> tensor scalar **REQUIRED** - - progress_bar -> Dict for progress bar display. Must have either scalar tensors or Python scalars - - log -> Dict of metrics to add to logger. Must have either scalar tensors or Python scalars (no images, etc) + .. note:: :class:`~pytorch_lightning.core.step_result.TrainResult` is simply a Dict with convenient + functions for logging, distributed sync and error checking. In this step you'd normally do the forward pass and calculate the loss for a batch. You can also do fancier things like multiple forward passes or something model specific. - Examples: - .. code-block:: python + Example:: - def training_step(self, batch, batch_idx): - x, y, z = batch + def training_step(self, batch, batch_idx): + x, y, z = batch - # implement your own - out = self(x) - loss = self.loss(out, x) + # implement your own + out = self(x) + loss = self.loss(out, x) - logger_logs = {'training_loss': loss} # optional + # TrainResult auto-detaches the loss after the optimization steps are complete + result = pl.TrainResult(minimize=loss) - # if using TestTubeLogger or TensorBoardLogger you can nest scalars - logger_logs = {'losses': logger_logs} # optional + The return object :class:`~pytorch_lightning.core.step_result.TrainResult` controls where to log, + when to log (step or epoch) and syncing with multiple GPUs. - output = { - 'loss': loss, # required - 'progress_bar': {'training_loss': loss}, # optional - 'log': logger_logs - } + .. 
code-block:: python - # return a dict - return output + # log to progress bar and logger + result.log('train_loss', loss, prog_bar=True, logger=True) - If you define multiple optimizers, this step will be called with an additional - ``optimizer_idx`` parameter. + # sync metric value across GPUs in distributed training + result.log('train_loss_2', loss, sync_dist=True) - .. code-block:: python + # log to progress bar as well + result.log('train_loss_2', loss, prog_bar=True) - # Multiple optimizers (e.g.: GANs) - def training_step(self, batch, batch_idx, optimizer_idx): - if optimizer_idx == 0: - # do training_step with encoder - if optimizer_idx == 1: - # do training_step with decoder + # assign arbitrary values + result.predictions = predictions + result.some_value = 'some_value' + If you define multiple optimizers, this step will be called with an additional + ``optimizer_idx`` parameter. - If you add truncated back propagation through time you will also get an additional - argument with the hidden states of the previous step. + .. code-block:: python - .. code-block:: python + # Multiple optimizers (e.g.: GANs) + def training_step(self, batch, batch_idx, optimizer_idx): + if optimizer_idx == 0: + # do training_step with encoder + if optimizer_idx == 1: + # do training_step with decoder + + + If you add truncated back propagation through time you will also get an additional + argument with the hidden states of the previous step. + + .. code-block:: python - # Truncated back-propagation through time - def training_step(self, batch, batch_idx, hiddens): - # hiddens are the hidden states from the previous truncated backprop step - ... - out, hiddens = self.lstm(data, hiddens) - ... + # Truncated back-propagation through time + def training_step(self, batch, batch_idx, hiddens): + # hiddens are the hidden states from the previous truncated backprop step + ... + out, hiddens = self.lstm(data, hiddens) + ... - return { - "loss": ..., - "hiddens": hiddens # remember to detach() this - } + # TrainResult auto-detaches hiddens + result = pl.TrainResult(minimize=loss, hiddens=hiddens) + return result Notes: The loss value shown in the progress bar is smoothed (averaged) over the last values, @@ -258,145 +264,122 @@ def training_end(self, *args, **kwargs): Deprecated in v0.7.0. Use :meth:`training_step_end` instead. """ - def training_epoch_end( - self, outputs: Union[List[Dict[str, Tensor]], List[List[Dict[str, Union[float, Tensor]]]]] - ) -> Dict[str, Dict[str, Union[float, Tensor]]]: - """Called at the end of the training epoch with the outputs of all training steps. + def training_step_end(self, *args, **kwargs): + """ + Use this when training with dp or ddp2 because :meth:`training_step` + will operate on only part of the batch. However, this is still optional + and only needed for things like softmax or NCE loss. + + Note: + If you later switch to ddp or some other mode, this will still be called + so that you don't have to change your code .. code-block:: python - # the pseudocode for these calls - train_outs = [] - for train_batch in train_data: - out = training_step(train_batch) - train_outs.append(out) - training_epoch_end(train_outs) + # pseudocode + sub_batches = split_batches_for_dp(batch) + batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches] + training_step_end(batch_parts_outputs) Args: - outputs: List of outputs you defined in :meth:`training_step`, or if there are - multiple dataloaders, a list containing a list of outputs for each dataloader. 
+ batch_parts_outputs: What you return in `training_step` for each batch part. Return: - Dict or OrderedDict. - May contain the following optional keys: + :class:`~pytorch_lightning.core.step_result.TrainResult` - - log (metrics to be added to the logger; only tensors) - - progress_bar (dict for progress bar display) - - any metric used in a callback (e.g. early stopping). + .. note:: :class:`~pytorch_lightning.core.step_result.TrainResult` is simply a Dict with convenient + functions for logging, distributed sync and error checking. - Note: - If this method is not overridden, this won't be called. + When using dp/ddp2 distributed backends, only a portion of the batch is inside the training_step: - - The outputs here are strictly for logging or progress bar. - - If you don't need to display anything, don't return anything. - - If you want to manually set current step, you can specify the 'step' key in the 'log' dict. + .. code-block:: python - Examples: - With a single dataloader: + def training_step(self, batch, batch_idx): + # batch is 1/num_gpus big + x, y = batch - .. code-block:: python + out = self(x) - def training_epoch_end(self, outputs): - train_acc_mean = 0 - for output in outputs: - train_acc_mean += output['train_acc'] + # softmax uses only a portion of the batch in the denominator + loss = self.softmax(out) + loss = nce_loss(loss) + return pl.TrainResult(loss) - train_acc_mean /= len(outputs) + If you wish to do something with all the parts of the batch, then use this method to do it: - # log training accuracy at the end of an epoch - results = { - 'log': {'train_acc': train_acc_mean.item()}, - 'progress_bar': {'train_acc': train_acc_mean}, - } - return results + .. code-block:: python - With multiple dataloaders, ``outputs`` will be a list of lists. The outer list contains - one entry per dataloader, while the inner list contains the individual outputs of - each training step for that dataloader. + def training_step(self, batch, batch_idx): + # batch is 1/num_gpus big + x, y = batch - .. code-block:: python + out = self(x) + result = pl.TrainResult() + result.out = out - def training_epoch_end(self, outputs): - train_acc_mean = 0 - i = 0 - for dataloader_outputs in outputs: - for output in dataloader_outputs: - train_acc_mean += output['train_acc'] - i += 1 - - train_acc_mean /= i - - # log training accuracy at the end of an epoch - results = { - 'log': {'train_acc': train_acc_mean.item(), 'step': self.current_epoch} - 'progress_bar': {'train_acc': train_acc_mean}, - } - return results - """ + def training_step_end(self, training_step_outputs): + # this out is now the full size of the batch + all_outs = training_step_outputs.out + + # this softmax now uses the full batch + loss = nce_loss(all_outs) + result = pl.TrainResult(loss) + return result - def training_step_end(self, *args, **kwargs) -> Dict[str, Union[Tensor, Dict[str, Union[float, Tensor]]]]: + See Also: + See the :ref:`multi-gpu-training` guide for more details. """ - Use this when training with dp or ddp2 because :meth:`training_step` - will operate on only part of the batch. However, this is still optional - and only needed for things like softmax or NCE loss. - Note: - If you later switch to ddp or some other mode, this will still be called - so that you don't have to change your code + def training_epoch_end( + self, outputs: Union[TrainResult, List[TrainResult]] + ): + """ + Called at the end of the training epoch with the outputs of all training steps. 
+ Use this in case you need to do something with all the outputs for every training_step. .. code-block:: python - # pseudocode - sub_batches = split_batches_for_dp(batch) - batch_parts_outputs = [training_step(sub_batch) for sub_batch in sub_batches] - training_step_end(batch_parts_outputs) + # the pseudocode for these calls + train_outs = [] + for train_batch in train_data: + out = training_step(train_batch) + train_outs.append(out) + training_epoch_end(train_outs) Args: - batch_parts_outputs: What you return in `training_step` for each batch part. + outputs: List of outputs you defined in :meth:`training_step`, or if there are + multiple dataloaders, a list containing a list of outputs for each dataloader. Return: - Dict with loss key and optional log or progress bar keys. + :class:`~pytorch_lightning.core.step_result.TrainResult` - - loss -> tensor scalar **REQUIRED** - - progress_bar -> Dict for progress bar display. Must have either scalar tensors or Python scalars - - log -> Dict of metrics to add to logger. Must have either scalar tensors or Python scalars (no images, etc) + .. note:: :class:`~pytorch_lightning.core.step_result.TrainResult` is simply a Dict with convenient + functions for logging, distributed sync and error checking. - Examples: - .. code-block:: python - - # WITHOUT training_step_end - # if used in DP or DDP2, this batch is 1/num_gpus large - def training_step(self, batch, batch_idx): - # batch is 1/num_gpus big - x, y = batch - - out = self(x) - loss = self.softmax(out) - loss = nce_loss(loss) - return {'loss': loss} + Note: + If this method is not overridden, this won't be called. - # -------------- - # with training_step_end to do softmax over the full batch - def training_step(self, batch, batch_idx): - # batch is 1/num_gpus big - x, y = batch + Example:: - out = self(x) - return {'out': out} + def training_epoch_end(self, training_step_outputs): + # do something with all training_step outputs + return result - def training_step_end(self, outputs): - # this out is now the full size of the batch - out = outputs['out'] + With multiple dataloaders, ``outputs`` will be a list of lists. The outer list contains + one entry per dataloader, while the inner list contains the individual outputs of + each training step for that dataloader. - # this softmax now uses the full batch size - loss = nce_loss(loss) - return {'loss': loss} + .. code-block:: python - See Also: - See the :ref:`multi-gpu-training` guide for more details. + def training_epoch_end(self, outputs): + epoch_result = pl.TrainResult() + for train_result in outputs: + all_losses = train_result.minimize + # do something with all losses + return results """ - def validation_step(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: + def validation_step(self, *args, **kwargs) -> EvalResult: r""" Operates on a single batch of data from the validation set. In this step you'd might generate examples or calculate anything of interest like accuracy. @@ -418,8 +401,7 @@ def validation_step(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: (only if multiple val datasets used) Return: - Dict or OrderedDict - passed to :meth:`validation_epoch_end`. - If you defined :meth:`validation_step_end` it will go to that first. + :class:`~pytorch_lightning.core.step_result.TrainResult` .. code-block:: python @@ -459,15 +441,10 @@ def validation_step(self, batch, batch_idx): labels_hat = torch.argmax(out, dim=1) val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - # all optional... 
- # return whatever you need for the collation function validation_epoch_end - output = OrderedDict({ - 'val_loss': loss_val, - 'val_acc': torch.tensor(val_acc), # everything must be a tensor - }) - - # return an optional dict - return output + # log the outputs! + result = pl.EvalResult(checkpoint_on=loss) + result.log_dict({'val_loss': loss, 'val_acc': val_acc}) + return result If you pass in multiple val datasets, validation_step will have an additional argument. @@ -486,7 +463,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx): the model goes back to training mode and gradients are enabled. """ - def validation_step_end(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: + def validation_step_end(self, *args, **kwargs) -> EvalResult: """ Use this when validating with dp or ddp2 because :meth:`validation_step` will operate on only part of the batch. However, this is still optional @@ -508,38 +485,42 @@ def validation_step_end(self, *args, **kwargs) -> Dict[str, Union[float, Tensor] for each batch part. Return: - Dict or OrderedDict - passed to the :meth:`validation_epoch_end` method. - - Examples: - .. code-block:: python - - # WITHOUT validation_step_end - # if used in DP or DDP2, this batch is 1/num_gpus large - def validation_step(self, batch, batch_idx): - # batch is 1/num_gpus big - x, y = batch - - out = self(x) - loss = self.softmax(out) - loss = nce_loss(loss) - return {'loss': loss} + :class:`~pytorch_lightning.core.step_result.TrainResult` - # -------------- - # with validation_step_end to do softmax over the full batch - def validation_step(self, batch, batch_idx): - # batch is 1/num_gpus big - x, y = batch - - out = self(x) - return {'out': out} - - def validation_epoch_end(self, outputs): - # this out is now the full size of the batch - out = outputs['out'] + .. code-block:: python - # this softmax now uses the full batch size - loss = nce_loss(loss) - return {'loss': loss} + # WITHOUT validation_step_end + # if used in DP or DDP2, this batch is 1/num_gpus large + def validation_step(self, batch, batch_idx): + # batch is 1/num_gpus big + x, y = batch + + out = self(x) + loss = self.softmax(out) + loss = nce_loss(loss) + result = pl.EvalResult() + result.log('val_loss', loss) + return result + + # -------------- + # with validation_step_end to do softmax over the full batch + def validation_step(self, batch, batch_idx): + # batch is 1/num_gpus big + x, y = batch + + out = self(x) + result = pl.EvalResult() + result.out = out + return result + + def validation_epoch_end(self, output_results): + # this out is now the full size of the batch + all_val_step_outs = output_results.out + loss = nce_loss(all_val_step_outs) + + result = pl.EvalResult(checkpoint_on=loss) + result.log('val_loss', loss) + return result See Also: See the :ref:`multi-gpu-training` guide for more details. @@ -553,8 +534,8 @@ def validation_end(self, outputs): """ def validation_epoch_end( - self, outputs: Union[List[Dict[str, Union[float, Tensor]]], List[List[Dict[str, Union[float, Tensor]]]]] - ) -> Dict[str, Dict[str, Union[float, Tensor]]]: + self, outputs: Union[EvalResult, List[EvalResult]] + ) -> EvalResult: """ Called at the end of the validation epoch with the outputs of all validation steps. @@ -572,38 +553,25 @@ def validation_epoch_end( are multiple dataloaders, a list containing a list of outputs for each dataloader. Return: - Dict or OrderedDict. 
- May have the following optional keys: - - - progress_bar (dict for progress bar display; either scalar tensors or Python scalars) - - log (dict of metrics to add to logger; either scalar tensors or Python scalars). + :class:`~pytorch_lightning.core.step_result.TrainResult` Note: If you didn't define a :meth:`validation_step`, this won't be called. - The outputs here are strictly for logging or progress bar. - If you don't need to display anything, don't return anything. - - If you want to manually set current step, you can specify the 'step' key in the 'log' dict. Examples: With a single dataloader: .. code-block:: python - def validation_epoch_end(self, outputs): - val_acc_mean = 0 - for output in outputs: - val_acc_mean += output['val_acc'] - - val_acc_mean /= len(outputs) - tqdm_dict = {'val_acc': val_acc_mean.item()} - - # show val_acc in progress bar but only log val_loss - results = { - 'progress_bar': tqdm_dict, - 'log': {'val_acc': val_acc_mean.item()} - } - return results + def validation_epoch_end(self, val_step_outputs): + # do something with the outputs of all val batches + all_val_preds = val_step_outputs.predictions + + val_step_outputs.some_result = calc_all_results(all_val_preds) + return val_step_outputs With multiple dataloaders, `outputs` will be a list of lists. The outer list contains one entry per dataloader, while the inner list contains the individual outputs of @@ -612,25 +580,15 @@ def validation_epoch_end(self, outputs): .. code-block:: python def validation_epoch_end(self, outputs): - val_acc_mean = 0 - i = 0 - for dataloader_outputs in outputs: - for output in dataloader_outputs: - val_acc_mean += output['val_acc'] - i += 1 - - val_acc_mean /= i - tqdm_dict = {'val_acc': val_acc_mean.item()} - - # show val_loss and val_acc in progress bar but only log val_loss - results = { - 'progress_bar': tqdm_dict, - 'log': {'val_acc': val_acc_mean.item(), 'step': self.current_epoch} - } - return results + for dataloader_output_result in outputs: + dataloader_outs = dataloader_output_result.dataloader_i_outputs + + result = pl.EvalResult() + result.log('final_metric', final_value) + return result - def test_step(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: + def test_step(self, *args, **kwargs) -> EvalResult: r""" Operates on a single batch of data from the test set. In this step you'd normally generate examples or calculate anything of interest @@ -653,8 +611,7 @@ def test_step(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: (only if multiple test datasets used). Return: - Dict or OrderedDict - passed to the :meth:`test_epoch_end` method. - If you defined :meth:`test_step_end` it will go to that first. + :class:`~pytorch_lightning.core.step_result.TrainResult` .. code-block:: python @@ -683,17 +640,12 @@ def test_step(self, batch, batch_idx): # calculate acc labels_hat = torch.argmax(out, dim=1) - val_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - - # all optional... - # return whatever you need for the collation function test_epoch_end - output = OrderedDict({ - 'val_loss': loss_val, - 'val_acc': torch.tensor(val_acc), # everything must be a tensor - }) + test_acc = torch.sum(y == labels_hat).item() / (len(y) * 1.0) - # return an optional dict - return output + # log the outputs! + result = pl.EvalResult(checkpoint_on=loss) + result.log_dict({'test_loss': loss, 'test_acc': test_acc}) + return result If you pass in multiple validation datasets, :meth:`test_step` will have an additional argument. 
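To make the logging pattern the docstring above describes concrete, here is a minimal, self-contained sketch of a ``test_step`` that logs a loss and an accuracy with ``EvalResult.log_dict``. It is an illustration only: the ``LitClassifier`` name and its tiny layer are hypothetical, and it assumes ``pytorch_lightning`` is imported as ``pl`` with the ``EvalResult`` API introduced in this change.

.. code-block:: python

    import torch
    import torch.nn.functional as F
    import pytorch_lightning as pl


    class LitClassifier(pl.LightningModule):  # hypothetical example model
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(28 * 28, 10)

        def forward(self, x):
            return self.layer(x.view(x.size(0), -1))

        def test_step(self, batch, batch_idx):
            x, y = batch
            logits = self(x)
            loss = F.cross_entropy(logits, y)

            # accuracy computed inline so the sketch stays self-contained
            preds = torch.argmax(logits, dim=1)
            test_acc = (preds == y).float().mean()

            # EvalResult gathers the metrics; log_dict logs several at once
            result = pl.EvalResult()
            result.log_dict({'test_loss': loss, 'test_acc': test_acc})
            return result
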
@@ -713,7 +665,7 @@ def test_step(self, batch, batch_idx, dataloader_idx): to training mode and gradients are enabled. """ - def test_step_end(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: + def test_step_end(self, *args, **kwargs) -> EvalResult: """ Use this when testing with dp or ddp2 because :meth:`test_step` will operate on only part of the batch. However, this is still optional @@ -734,38 +686,42 @@ def test_step_end(self, *args, **kwargs) -> Dict[str, Union[float, Tensor]]: batch_parts_outputs: What you return in :meth:`test_step` for each batch part. Return: - Dict or OrderedDict - passed to the :meth:`test_epoch_end`. - - Examples: - .. code-block:: python - - # WITHOUT test_step_end - # if used in DP or DDP2, this batch is 1/num_gpus large - def test_step(self, batch, batch_idx): - # batch is 1/num_gpus big - x, y = batch - - out = self(x) - loss = self.softmax(out) - loss = nce_loss(loss) - return {'loss': loss} - - # -------------- - # with test_step_end to do softmax over the full batch - def test_step(self, batch, batch_idx): - # batch is 1/num_gpus big - x, y = batch + :class:`~pytorch_lightning.core.step_result.TrainResult` - out = self(x) - return {'out': out} - - def test_step_end(self, outputs): - # this out is now the full size of the batch - out = outputs['out'] + .. code-block:: python - # this softmax now uses the full batch size - loss = nce_loss(loss) - return {'loss': loss} + # WITHOUT test_step_end + # if used in DP or DDP2, this batch is 1/num_gpus large + def test_step(self, batch, batch_idx): + # batch is 1/num_gpus big + x, y = batch + + out = self(x) + loss = self.softmax(out) + loss = nce_loss(loss) + result = pl.EvalResult() + result.log('test_loss', loss) + return result + + # -------------- + # with test_step_end to do softmax over the full batch + def test_step(self, batch, batch_idx): + # batch is 1/num_gpus big + x, y = batch + + out = self(x) + result = pl.EvalResult() + result.out = out + return result + + def test_epoch_end(self, output_results): + # this out is now the full size of the batch + all_test_step_outs = output_results.out + loss = nce_loss(all_test_step_outs) + + result = pl.EvalResult(checkpoint_on=loss) + result.log('test_loss', loss) + return result See Also: See the :ref:`multi-gpu-training` guide for more details. @@ -779,8 +735,9 @@ def test_end(self, outputs): """ def test_epoch_end( - self, outputs: Union[List[Dict[str, Union[float, Tensor]]], List[List[Dict[str, Union[float, Tensor]]]]] - ) -> Dict[str, Dict[str, Union[float, Tensor]]]: + self, outputs: Union[EvalResult, List[EvalResult]] + ) -> EvalResult: + """ Called at the end of a test epoch with the output of all test steps. @@ -798,17 +755,13 @@ def test_epoch_end( are multiple dataloaders, a list containing a list of outputs for each dataloader Return: - Dict or OrderedDict: Dict has the following optional keys: - - - progress_bar -> Dict for progress bar display. Must have either scalar tensors or Python scalars. - - log -> Dict of metrics to add to logger. Must have either scalar tensors or Python scalars (no images, etc). + :class:`~pytorch_lightning.core.step_result.TrainResult` Note: If you didn't define a :meth:`test_step`, this won't be called. - The outputs here are strictly for logging or progress bar. - If you don't need to display anything, don't return anything. - - If you want to manually set current step, specify it with the 'step' key in the 'log' Dict Examples: With a single dataloader: @@ -816,19 +769,11 @@ def test_epoch_end( .. 
code-block:: python def test_epoch_end(self, outputs): - test_acc_mean = 0 - for output in outputs: - test_acc_mean += output['test_acc'] - - test_acc_mean /= len(outputs) - tqdm_dict = {'test_acc': test_acc_mean.item()} - - # show test_loss and test_acc in progress bar but only log test_loss - results = { - 'progress_bar': tqdm_dict, - 'log': {'test_acc': test_acc_mean.item()} - } - return results + # do something with the outputs of all test batches + all_test_preds = test_step_outputs.predictions + + test_step_outputs.some_result = calc_all_results(all_test_preds) + return test_step_outputs With multiple dataloaders, `outputs` will be a list of lists. The outer list contains one entry per dataloader, while the inner list contains the individual outputs of @@ -837,21 +782,11 @@ def test_epoch_end(self, outputs): .. code-block:: python def test_epoch_end(self, outputs): - test_acc_mean = 0 - i = 0 - for dataloader_outputs in outputs: - for output in dataloader_outputs: - test_acc_mean += output['test_acc'] - i += 1 - - test_acc_mean /= i - tqdm_dict = {'test_acc': test_acc_mean.item()} - - # show test_loss and test_acc in progress bar but only log test_loss - results = { - 'progress_bar': tqdm_dict, - 'log': {'test_acc': test_acc_mean.item(), 'step': self.current_epoch} - } + for dataloader_output_result in outputs: + dataloader_outs = dataloader_output_result.dataloader_i_outputs + + result = pl.EvalResult() + result.log('final_metric', final_value) return results """ @@ -889,6 +824,7 @@ def configure_ddp(self, model, device_ids): return model def _init_slurm_connection(self) -> None: + """""" """ Sets up environment variables necessary for pytorch distributed communications based on slurm environment. @@ -957,6 +893,10 @@ def init_ddp_connection(self, global_rank: int, world_size: int, is_slurm_managi log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank+1}/{world_size}") torch_distrib.init_process_group(torch_backend, rank=global_rank, world_size=world_size) + """ + configure_sync_batchnorm + ^^^^^^^^^^^^^^^^^^^^^^^^ + """ def configure_sync_batchnorm(self, model: 'LightningModule') -> 'LightningModule': """ Add global batchnorm for a model spread across multiple GPUs and nodes. @@ -1632,6 +1572,7 @@ def get_tqdm_dict(self) -> Dict[str, Union[int, str]]: @classmethod def _auto_collect_arguments(cls, frame=None) -> Tuple[Dict, Dict]: + """""" """ Collect all module arguments in the current constructor and all child constructors. 
The child constructors are all the ``__init__`` methods that reach the current class through @@ -1792,6 +1733,7 @@ def hparams(self, hp: Union[dict, Namespace, Any]): self._set_hparams(hp) def __get_hparams_assignment_variable(self): + """""" """ looks at the code of the class to figure out what the user named self.hparams this only happens when the user explicitly sets self.hparams diff --git a/pytorch_lightning/core/step_result.py b/pytorch_lightning/core/step_result.py index 8b482f04361015..ea62fdab2e9960 100644 --- a/pytorch_lightning/core/step_result.py +++ b/pytorch_lightning/core/step_result.py @@ -25,7 +25,7 @@ def __init__( if checkpoint_on is not None and checkpoint_on: self.checkpoint_on = checkpoint_on if hiddens is not None: - self.hiddens = hiddens + self.hiddens = hiddens.detach() if minimize is not None: err = 'Minimize can only be used in training_step, training_step_end, training_epoch_end' self._assert_grad_tensor_metric('minimize', minimize, err) @@ -59,7 +59,7 @@ def __getattr__(self, key: str) -> Any: def __setattr__(self, key: str, val: Union[Tensor, Any]): # ensure reserve keys are tensors and detached - if key in {'hiddens', 'checkpoint_on', 'early_stop_on'}: + if key in {'checkpoint_on', 'early_stop_on'}: self._assert_tensor_metric(key, val) if val is not None and isinstance(val, torch.Tensor): val = val.detach() @@ -95,17 +95,17 @@ def log( tbptt_reduce_fx: Callable = torch.mean, tbptt_pad_token: int = 0, enable_graph: bool = False, - sync_ddp: bool = False, - sync_ddp_op: Union[Any, str] = 'mean', - sync_ddp_group: Optional[Any] = None + sync_dist: bool = False, + sync_dist_op: Union[Any, str] = 'mean', + sync_dist_group: Optional[Any] = None ): # no metrics should be logged with graphs if not enable_graph and isinstance(value, torch.Tensor): value = value.detach() # sync across ddp - if sync_ddp and isinstance(value, (torch.Tensor, numbers.Number)): - value = _sync_ddp_if_available(value, group=sync_ddp_group, reduce_op=sync_ddp_op) + if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)): + value = _sync_ddp_if_available(value, group=sync_dist_group, reduce_op=sync_dist_op) if 'meta' not in self: self.__setitem__('meta', {}) @@ -450,9 +450,9 @@ def log( tbptt_reduce_fx: Callable = torch.mean, tbptt_pad_token: int = 0, enable_graph: bool = False, - sync_ddp: bool = False, - sync_ddp_op: Union[Any, str] = 'mean', - sync_ddp_group: Optional[Any] = None + sync_dist: bool = False, + sync_dist_op: Union[Any, str] = 'mean', + sync_dist_group: Optional[Any] = None ): """ Log a key, value @@ -485,9 +485,9 @@ def log( tbptt_reduce_fx: function to reduce on truncated back prop tbptt_pad_token: token to use for padding enable_graph: if True, will not auto detach the graph - sync_ddp: if True, reduces the metric across GPUs/TPUs - sync_ddp_op: the op to sync across - sync_ddp_group: the ddp group + sync_dist: if True, reduces the metric across GPUs/TPUs + sync_dist_op: the op to sync across + sync_dist_group: the ddp group """ super().log(name=name, value=value, @@ -497,9 +497,9 @@ def log( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, - sync_ddp=sync_ddp, - sync_ddp_group=sync_ddp_group, - sync_ddp_op=sync_ddp_op, + sync_dist=sync_dist, + sync_dist_group=sync_dist_group, + sync_dist_op=sync_dist_op, tbptt_pad_token=tbptt_pad_token, tbptt_reduce_fx=tbptt_reduce_fx) @@ -514,9 +514,9 @@ def log_dict( tbptt_reduce_fx: Callable = torch.mean, tbptt_pad_token: int = 0, enable_graph: bool = False, - sync_ddp: bool = False, - sync_ddp_op: 
Union[Any, str] = 'mean', - sync_ddp_group: Optional[Any] = None + sync_dist: bool = False, + sync_dist_op: Union[Any, str] = 'mean', + sync_dist_group: Optional[Any] = None ): """ Log a dictonary of values at once @@ -536,9 +536,9 @@ def log_dict( tbptt_reduce_fx: function to reduce on truncated back prop tbptt_pad_token: token to use for padding enable_graph: if True, will not auto detach the graph - sync_ddp: if True, reduces the metric across GPUs/TPUs - sync_ddp_op: the op to sync across - sync_ddp_group: the ddp group: + sync_dist: if True, reduces the metric across GPUs/TPUs + sync_dist_op: the op to sync across + sync_dist_group: the ddp group: """ for k, v in dictionary.items(): self.log(name=k, @@ -549,9 +549,9 @@ def log_dict( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, - sync_ddp=sync_ddp, - sync_ddp_group=sync_ddp_group, - sync_ddp_op=sync_ddp_op, + sync_dist=sync_dist, + sync_dist_group=sync_dist_group, + sync_dist_op=sync_dist_op, tbptt_pad_token=tbptt_pad_token, tbptt_reduce_fx=tbptt_reduce_fx) @@ -602,9 +602,9 @@ def log( tbptt_reduce_fx: Callable = torch.mean, tbptt_pad_token: int = 0, enable_graph: bool = False, - sync_ddp: bool = False, - sync_ddp_op: Union[Any, str] = 'mean', - sync_ddp_group: Optional[Any] = None + sync_dist: bool = False, + sync_dist_op: Union[Any, str] = 'mean', + sync_dist_group: Optional[Any] = None ): """ Log a key, value @@ -636,9 +636,9 @@ def log( tbptt_reduce_fx: function to reduce on truncated back prop tbptt_pad_token: token to use for padding enable_graph: if True, will not auto detach the graph - sync_ddp: if True, reduces the metric across GPUs/TPUs - sync_ddp_op: the op to sync across - sync_ddp_group: the ddp group + sync_dist: if True, reduces the metric across GPUs/TPUs + sync_dist_op: the op to sync across + sync_dist_group: the ddp group """ super().log(name=name, value=value, @@ -648,9 +648,9 @@ def log( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, - sync_ddp=sync_ddp, - sync_ddp_group=sync_ddp_group, - sync_ddp_op=sync_ddp_op, + sync_dist=sync_dist, + sync_dist_group=sync_dist_group, + sync_dist_op=sync_dist_op, tbptt_pad_token=tbptt_pad_token, tbptt_reduce_fx=tbptt_reduce_fx) @@ -665,9 +665,9 @@ def log_dict( tbptt_reduce_fx: Callable = torch.mean, tbptt_pad_token: int = 0, enable_graph: bool = False, - sync_ddp: bool = False, - sync_ddp_op: Union[Any, str] = 'mean', - sync_ddp_group: Optional[Any] = None + sync_dist: bool = False, + sync_dist_op: Union[Any, str] = 'mean', + sync_dist_group: Optional[Any] = None ): """ Log a dictonary of values at once @@ -687,9 +687,9 @@ def log_dict( tbptt_reduce_fx: function to reduce on truncated back prop tbptt_pad_token: token to use for padding enable_graph: if True, will not auto detach the graph - sync_ddp: if True, reduces the metric across GPUs/TPUs - sync_ddp_op: the op to sync across - sync_ddp_group: the ddp group + sync_dist: if True, reduces the metric across GPUs/TPUs + sync_dist_op: the op to sync across + sync_dist_group: the ddp group """ for k, v in dictionary.items(): self.log(name=k, @@ -700,9 +700,9 @@ def log_dict( on_epoch=on_epoch, reduce_fx=reduce_fx, enable_graph=enable_graph, - sync_ddp=sync_ddp, - sync_ddp_group=sync_ddp_group, - sync_ddp_op=sync_ddp_op, + sync_dist=sync_dist, + sync_dist_group=sync_dist_group, + sync_dist_op=sync_dist_op, tbptt_pad_token=tbptt_pad_token, tbptt_reduce_fx=tbptt_reduce_fx) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 
f76c6f1b008dea..1d22d1de45cb0a 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -71,6 +71,7 @@ class TrainerDPMixin(ABC): amp_level: str precision: ... global_rank: int + local_rank: int tpu_local_core_rank: int tpu_global_core_rank: int use_tpu: bool @@ -129,6 +130,9 @@ def copy_trainer_model_properties(self, model): m.use_tpu = self.use_tpu m.tpu_local_core_rank = self.tpu_local_core_rank m.tpu_global_core_rank = self.tpu_global_core_rank + m.precision = self.precision + m.global_rank = self.global_rank + m.local_rank = self.local_rank def transfer_batch_to_tpu(self, batch: Any, tpu_id: Optional[int] = None): """ diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 5ce7b7718c2a7e..433ea970877db6 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -312,9 +312,16 @@ def _evaluate( # callbacks if test_mode: self.on_test_batch_start(batch, batch_idx, dataloader_idx) + if self.is_overridden('on_test_batch_start'): + model_ref = self.get_model() + with self.profiler.profile('on_test_batch_start'): + model_ref.on_test_batch_start(batch, batch_idx, dataloader_idx) else: self.on_validation_batch_start(batch, batch_idx, dataloader_idx) - + if self.is_overridden('on_validation_batch_start'): + model_ref = self.get_model() + with self.profiler.profile('on_validation_batch_start'): + model_ref.on_validation_batch_start(batch, batch_idx, dataloader_idx) # ----------------- # RUN EVALUATION STEP # ----------------- @@ -335,13 +342,25 @@ def _evaluate( model_ref = self.get_model() with self.profiler.profile('test_step_end'): output = model_ref.test_step_end(output) - self.on_test_batch_end(batch, batch_idx, dataloader_idx) else: if self.is_overridden('validation_step_end'): model_ref = self.get_model() with self.profiler.profile('validation_step_end'): output = model_ref.validation_step_end(output) + + # callbacks (on __batch_end) + if test_mode: + self.on_test_batch_end(batch, batch_idx, dataloader_idx) + if self.is_overridden('on_test_batch_end'): + model_ref = self.get_model() + with self.profiler.profile('on_test_batch_end'): + model_ref.on_test_batch_end(batch, batch_idx, dataloader_idx) + else: self.on_validation_batch_end(batch, batch_idx, dataloader_idx) + if self.is_overridden('on_validation_batch_end'): + model_ref = self.get_model() + with self.profiler.profile('on_validation_batch_end'): + model_ref.on_validation_batch_end(batch, batch_idx, dataloader_idx) # track outputs for collation if output is not None: diff --git a/tests/core/test_results.py b/tests/core/test_results.py index 743a6d89153436..0630838a871ef0 100644 --- a/tests/core/test_results.py +++ b/tests/core/test_results.py @@ -21,7 +21,7 @@ def _ddp_test_fn(rank, worldsize, result_cls: Result): tensor = torch.tensor([1.0]) res = result_cls() - res.log("test_tensor", tensor, sync_ddp=True, sync_ddp_op=torch.distributed.ReduceOp.SUM) + res.log("test_tensor", tensor, sync_dist=True, sync_dist_op=torch.distributed.ReduceOp.SUM) assert res["test_tensor"].item() == dist.get_world_size(), "Result-Log does not work properly with DDP and Tensors"
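The renamed ``sync_dist`` flags exercised in the test above are the same ones user code would pass from a ``LightningModule``. As a rough sketch only (the ``LitModel`` name and its layer are hypothetical, and it assumes ``pytorch_lightning`` imported as ``pl`` with the ``TrainResult`` API from this change), a training step that wants its logged loss reduced across all GPU/TPU processes could look like this:

.. code-block:: python

    import torch
    import torch.nn.functional as F
    import pytorch_lightning as pl


    class LitModel(pl.LightningModule):  # hypothetical example model
        def __init__(self):
            super().__init__()
            self.layer = torch.nn.Linear(32, 2)

        def forward(self, x):
            return self.layer(x)

        def training_step(self, batch, batch_idx):
            x, y = batch
            loss = F.cross_entropy(self(x), y)

            result = pl.TrainResult(minimize=loss)
            # sync_dist replaces the old sync_ddp flag; with the default
            # sync_dist_op='mean' the logged value is averaged across processes
            result.log('train_loss', loss, sync_dist=True)
            return result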