From 34eef353aaddcc0fd01689f30b385ea3e7c6804c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Fri, 30 Jul 2021 15:03:15 +0200
Subject: [PATCH] fix collecting training_step outputs (#8613)

---
 .../loops/batch/training_batch_loop.py     |  5 +++--
 .../loops/test_training_loop_flow_dict.py  | 12 ++++++++----
 tests/trainer/test_trainer.py              | 16 +++++++++++++---
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py
index 180bd5889251e..9328ddd9080f6 100644
--- a/pytorch_lightning/loops/batch/training_batch_loop.py
+++ b/pytorch_lightning/loops/batch/training_batch_loop.py
@@ -14,6 +14,7 @@
 
 from collections import OrderedDict
 from contextlib import contextmanager
+from copy import copy
 from functools import partial, update_wrapper
 from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Tuple
@@ -146,12 +147,12 @@ def advance(self, batch, batch_idx, dataloader_idx):
 
                 result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
                 if result:
-                    self.batch_outputs[opt_idx].append(result.training_step_output)
+                    self.batch_outputs[opt_idx].append(copy(result.training_step_output))
         else:
             # in manual optimization, there is no looping over optimizers
             result = self._run_optimization(batch_idx, split_batch)
             if result:
-                self.batch_outputs[0].append(result.training_step_output)
+                self.batch_outputs[0].append(copy(result.training_step_output))
 
     def teardown(self) -> None:
         # release memory
diff --git a/tests/trainer/loops/test_training_loop_flow_dict.py b/tests/trainer/loops/test_training_loop_flow_dict.py
index f064dacb78844..ab4d7979bbf39 100644
--- a/tests/trainer/loops/test_training_loop_flow_dict.py
+++ b/tests/trainer/loops/test_training_loop_flow_dict.py
@@ -108,7 +108,7 @@ def training_step(self, batch, batch_idx):
             acc = acc + batch_idx
 
             self.training_step_called = True
-            out = {"loss": acc, "random_things": [1, "a", torch.tensor(2)]}
+            out = {"loss": acc, "random_things": [1, "a", torch.tensor(2)], "batch_idx": batch_idx}
             return out
 
         def training_epoch_end(self, outputs):
@@ -116,11 +116,13 @@ def training_epoch_end(self, outputs):
 
             # verify we saw the current num of batches
             assert len(outputs) == 2
+            assert len({id(output) for output in outputs}) == 2
+            assert [output["batch_idx"] for output in outputs] == [0, 1]
 
             for b in outputs:
                 assert isinstance(b, dict)
                 assert self.count_num_graphs(b) == 0
-                assert {"random_things", "loss"} == set(b.keys())
+                assert {"random_things", "loss", "batch_idx"} == set(b.keys())
 
         def backward(self, loss, optimizer, optimizer_idx):
             return LightningModule.backward(self, loss, optimizer, optimizer_idx)
@@ -155,7 +157,7 @@ def training_step(self, batch, batch_idx):
             acc = acc + batch_idx
 
             self.training_step_called = True
-            self.out = {"loss": acc, "random_things": [1, "a", torch.tensor(2)]}
+            self.out = {"loss": acc, "random_things": [1, "a", torch.tensor(2)], "batch_idx": batch_idx}
             return self.out
 
         def training_step_end(self, tr_step_output):
@@ -169,11 +171,13 @@ def training_epoch_end(self, outputs):
 
             # verify we saw the current num of batches
             assert len(outputs) == 2
+            assert len({id(output) for output in outputs}) == 2
+            assert [output["batch_idx"] for output in outputs] == [0, 1]
 
             for b in outputs:
                 assert isinstance(b, dict)
                 assert self.count_num_graphs(b) == 0
-                assert {"random_things", "loss"} == set(b.keys())
+                assert {"random_things", "loss", "batch_idx"} == set(b.keys())
 
         def backward(self, loss, optimizer, optimizer_idx):
             return LightningModule.backward(self, loss, optimizer, optimizer_idx)
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index e0f909a7b8b35..ed7a274569d79 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import gc
 import logging
 import math
 import os
@@ -1872,13 +1873,22 @@ def on_epoch_start(self, trainer, *_):
     assert list(trainer.optimizers[0].state.values())[0]["exp_avg_sq"].device == torch.device("cpu")
     assert trainer.callback_metrics["train_loss"].device == torch.device("cpu")
 
+    # before measuring the memory force release any leftover allocations, including CUDA tensors
+    gc.collect()
     memory_1 = torch.cuda.memory_allocated(0)
+    assert memory_1 == initial
+
     deepcopy(trainer)
+
+    # before measuring the memory force release any leftover allocations, including CUDA tensors
+    gc.collect()
     memory_2 = torch.cuda.memory_allocated(0)
-    assert memory_1 == memory_2 == initial
+    assert memory_2 == initial
 
     trainer_2 = Trainer(**trainer_kwargs)
     trainer_2.fit(model)
-    memory_3 = torch.cuda.memory_allocated(0)
 
-    assert initial == memory_1 == memory_3
+    # before measuring the memory force release any leftover allocations, including CUDA tensors
+    gc.collect()
+    memory_3 = torch.cuda.memory_allocated(0)
+    assert memory_3 == initial
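The core of the fix is wrapping `result.training_step_output` in `copy(...)` before collecting it into `self.batch_outputs`, so the stored per-batch output is decoupled from the object the loop keeps holding onto. Below is a minimal, hypothetical sketch of the kind of aliasing this shallow copy guards against; it is plain Python, not the actual Lightning internals, and the names `run_epoch`, `step_output`, and `collect_with_copy` are illustrative only. It mirrors the `id(output)` and `batch_idx` assertions added in the tests.

```python
from copy import copy


def run_epoch(collect_with_copy):
    """Collect one dict of outputs per batch, with or without copying on append."""
    batch_outputs = []
    step_output = {}  # a single reused dict, standing in for the retained training_step output
    for batch_idx in range(2):
        step_output["loss"] = float(batch_idx)
        step_output["batch_idx"] = batch_idx
        # without the copy, every collected entry aliases the same dict object
        batch_outputs.append(copy(step_output) if collect_with_copy else step_output)
    return batch_outputs


aliased = run_epoch(collect_with_copy=False)
assert len({id(o) for o in aliased}) == 1           # both entries are the same object
assert [o["batch_idx"] for o in aliased] == [1, 1]  # batch 0's values were overwritten

copied = run_epoch(collect_with_copy=True)
assert len({id(o) for o in copied}) == 2            # one independent (shallow) copy per batch
assert [o["batch_idx"] for o in copied] == [0, 1]   # matches the new test assertions
```

A shallow copy is enough here because only the top-level dict needs to be a distinct object; the values (including tensors) are still shared, which is why the strengthened memory test in `tests/trainer/test_trainer.py` calls `gc.collect()` before each `torch.cuda.memory_allocated(0)` check rather than relying on the copy alone.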