[bugfix] Reduce memory leaks #8490

Merged · 41 commits · Jul 21, 2021
Changes shown in the diff below are from 30 of the 41 commits.

Commits (41)
aa89cc9
reduce memory leak
tchaton Jul 20, 2021
f57e21d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2021
341163b
update changelog
tchaton Jul 20, 2021
63eeaf0
Merge branch 'reduce_memory_leak' of https://github.com/PyTorchLightn…
tchaton Jul 20, 2021
f9ca9dc
Apply suggestions from code review
Borda Jul 20, 2021
1804975
resolve flake8
tchaton Jul 20, 2021
7a7b95f
Merge branch 'reduce_memory_leak' of https://github.com/PyTorchLightn…
tchaton Jul 20, 2021
7f06053
update on comments
tchaton Jul 20, 2021
b48e34d
resolve bug
tchaton Jul 20, 2021
eef89bc
update
tchaton Jul 20, 2021
13b335d
Undo whitespace changes
carmocca Jul 20, 2021
7fca3d8
remove bug
tchaton Jul 20, 2021
03e8faa
Merge branch 'reduce_memory_leak' of https://github.com/PyTorchLightn…
tchaton Jul 20, 2021
3fd83e3
resolve flake8
tchaton Jul 20, 2021
1d8b484
revert change
tchaton Jul 20, 2021
38ff815
update on comments
tchaton Jul 20, 2021
ef9a4cd
delete the ddp wrapper as it hold memory
tchaton Jul 20, 2021
9acb0c1
Merge branch 'master' into reduce_memory_leak
tchaton Jul 20, 2021
9ab40de
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2021
000138e
resolve flake8
tchaton Jul 20, 2021
4e439f4
Merge branch 'reduce_memory_leak' of https://github.com/PyTorchLightn…
tchaton Jul 20, 2021
559436f
update on comments
tchaton Jul 20, 2021
8c9145d
update changelog
tchaton Jul 20, 2021
7c015cb
resolve test
tchaton Jul 20, 2021
70affbb
Update CHANGELOG
carmocca Jul 20, 2021
26bb10f
Refactor teardown
carmocca Jul 20, 2021
231b02b
Fix comment
carmocca Jul 20, 2021
0d1c365
Do it for non-gpu too
carmocca Jul 20, 2021
9077898
remove ref when the model is not a lightning_module
tchaton Jul 20, 2021
2ba1e9e
Fix import error
carmocca Jul 20, 2021
a8018df
Merge branch 'master' into reduce_memory_leak
tchaton Jul 20, 2021
666383c
move down
tchaton Jul 20, 2021
f16b8de
resolve bug
tchaton Jul 20, 2021
a915396
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 20, 2021
8c84391
resolve assignement
tchaton Jul 20, 2021
2d3223a
Merge branch 'reduce_memory_leak' of https://github.com/PyTorchLightn…
tchaton Jul 20, 2021
47c1ad3
update
tchaton Jul 21, 2021
9c347a2
move above
tchaton Jul 21, 2021
b26f98b
Fix device calls to support tpu training
kaushikb11 Jul 21, 2021
89a7033
Merge branch 'reduce_memory_leak' of https://github.com/PyTorchLightn…
kaushikb11 Jul 21, 2021
2719d03
Updat todo
kaushikb11 Jul 21, 2021

CHANGELOG.md (3 additions, 0 deletions)

@@ -493,6 +493,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed clearing dataloader references before attaching new dataloaders in consecutive `Trainer.{fit,validate,test,predict}´ runs ([#8442](https://github.com/PyTorchLightning/pytorch-lightning/pull/8442))


+- Fixed memory leaks on GPU by moving `optimizer_states`, `ResultCollection.extra`, `ResultMetric` attributes, and `LoggerConnector` metrics to `cpu`. Also, delete the DDP wrapper on `teardown` ([#8490](https://github.com/PyTorchLightning/pytorch-lightning/pull/8490))


 - Fixed DeepSpeed Windows support ([#8488](https://github.com/PyTorchLightning/pytorch-lightning/pull/8488))

pytorch_lightning/accelerators/accelerator.py (4 additions, 7 deletions)

@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
-from collections import defaultdict
-from typing import Any, Callable, DefaultDict, Dict, Generator, Iterable, List, Optional, Union
+from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union

 import torch
 from torch import Tensor
@@ -104,21 +103,19 @@ def start_predicting(self, trainer: 'pl.Trainer') -> None:

     def pre_dispatch(self, trainer: 'pl.Trainer') -> None:
         """Hook to do something before the training/evaluation/prediction starts."""
-        self._move_optimizer_state()
+        self._move_optimizer_state(self.root_device)

         self.training_type_plugin.pre_dispatch()
         if self.training_type_plugin.setup_optimizers_in_pre_dispatch:
             self.setup_optimizers(trainer)

         self.precision_plugin.pre_dispatch()

-    def _move_optimizer_state(self) -> None:
+    def _move_optimizer_state(self, device: torch.device) -> None:
         """ Moves the state of the optimizers to the GPU if needed. """
         for opt in self.optimizers:
-            state: DefaultDict = defaultdict(dict)
             for p, v in opt.state.items():
-                state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
-            opt.state = state
+                opt.state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, device)

     def dispatch(self, trainer: 'pl.Trainer') -> None:
         """Hook to do something before the training/evaluation/prediction starts."""
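For context, a minimal sketch (plain PyTorch, outside Lightning; the helper name `move_optimizer_state` is made up here) of the in-place pattern the patched `_move_optimizer_state` now uses: each per-parameter state entry is rewritten on the target device instead of rebuilding the whole state dict, so no stale `defaultdict` keeps the old GPU tensors alive.

```python
import torch


def move_optimizer_state(optimizer: torch.optim.Optimizer, device: torch.device) -> None:
    """Move every tensor in the optimizer's per-parameter state to ``device``, in place."""
    for param, state in optimizer.state.items():
        # Reassign key by key; non-tensor entries (e.g. Adam's integer ``step``) are kept as-is.
        optimizer.state[param] = {
            k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in state.items()
        }


# Tiny CPU-only demonstration: Adam's exp_avg / exp_avg_sq buffers follow the move.
model = torch.nn.Linear(4, 4)
opt = torch.optim.Adam(model.parameters(), lr=0.1)
model(torch.randn(2, 4)).sum().backward()
opt.step()  # materializes the per-parameter state
move_optimizer_state(opt, torch.device("cpu"))
assert all(
    t.device.type == "cpu"
    for state in opt.state.values()
    for t in state.values()
    if isinstance(t, torch.Tensor)
)
```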
pytorch_lightning/accelerators/gpu.py (4 additions, 0 deletions)

@@ -52,3 +52,7 @@ def set_nvidia_flags(local_rank: int) -> None:
         all_gpu_ids = ",".join([str(x) for x in range(torch.cuda.device_count())])
         devices = os.getenv("CUDA_VISIBLE_DEVICES", all_gpu_ids)
         _log.info(f"LOCAL_RANK: {local_rank} - CUDA_VISIBLE_DEVICES: [{devices}]")
+
+    def teardown(self) -> None:
+        super().teardown()
+        self._move_optimizer_state(torch.device("cpu"))
pytorch_lightning/plugins/training_type/parallel.py (3 additions, 0 deletions)

@@ -133,6 +133,9 @@ def block_backward_sync(self):
            yield None

     def teardown(self) -> None:
+        if not isinstance(self.model, pl.LightningModule):
+            # Un-reference the wrapper as the reducer can hold cuda memory.
+            self.model = None
         if self.on_gpu:
             # GPU teardown
             self.lightning_module.cpu()
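The `parallel.py` change drops the reference to the wrapped model whenever it is not the bare LightningModule, because DDP's reducer keeps gradient buckets that can pin device memory. A rough, self-contained sketch of the same idea against a plain `torch.nn.parallel.DistributedDataParallel` wrapper (single-process `gloo` group so it runs on CPU; the address/port values are placeholders):

```python
import gc
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Single-process process group so DistributedDataParallel can be constructed without GPUs.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

module = torch.nn.Linear(4, 4)
wrapper = DistributedDataParallel(module)  # the wrapper owns a reducer with gradient buckets

# Dropping the only reference to the wrapper lets Python reclaim the reducer (and, on CUDA,
# the memory its buckets hold) while the underlying module stays usable.
wrapper = None
gc.collect()

dist.destroy_process_group()
```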
pytorch_lightning/plugins/training_type/training_type_plugin.py

@@ -39,7 +39,7 @@ class TrainingTypePlugin(Plugin, ABC):
     """

     def __init__(self) -> None:
-        self._model = None
+        self._model: Optional[Module] = None
         self._results: Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]] = None
         self._call_configure_sharded_model_hook = True

@@ -121,12 +121,12 @@ def post_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, **kwargs
         """Hook to do something after each optimizer step."""

     @property
-    def model(self) -> Module:
+    def model(self) -> Optional[Module]:
         """Returns the potentially wrapped LightningModule"""
         return self._model

     @model.setter
-    def model(self, new_model: Module) -> None:
+    def model(self, new_model: Optional[Module]) -> None:
         self._model = new_model

     @property
pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py

@@ -23,6 +23,7 @@
 from pytorch_lightning.trainer.connectors.logger_connector.result import _METRIC, MetricSource
 from pytorch_lightning.trainer.states import RunningStage, TrainerFn
 from pytorch_lightning.utilities import DeviceType
+from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device
 from pytorch_lightning.utilities.metrics import metrics_to_scalars
 from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT

@@ -312,3 +313,9 @@ def progress_bar_metrics(self) -> Dict[str, float]:
         metrics = self.metrics[MetricSource.PBAR]
         self._progress_bar_metrics.update(metrics)
         return self._progress_bar_metrics
+
+    def teardown(self):
+        args = (torch.Tensor, move_data_to_device, "cpu")
+        self._logged_metrics = apply_to_collection(self._logged_metrics, *args)
+        self._progress_bar_metrics = apply_to_collection(self._progress_bar_metrics, *args)
+        self._callback_metrics = apply_to_collection(self._callback_metrics, *args)
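The new `LoggerConnector.teardown` leans on `apply_to_collection` walking nested containers and handing every `torch.Tensor` leaf to `move_data_to_device`. A small usage sketch, assuming a `pytorch_lightning` install from around this release (the metric names are invented):

```python
import torch
from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device

# Nested dict standing in for logged / progress-bar / callback metrics.
metrics = {"train_loss": torch.tensor(0.25), "val": {"acc": torch.tensor(0.9)}}

# Positional args after the function are forwarded to it, so "cpu" becomes the device
# argument of move_data_to_device for every tensor found in the collection.
metrics_cpu = apply_to_collection(metrics, torch.Tensor, move_data_to_device, "cpu")

assert metrics_cpu["val"]["acc"].device == torch.device("cpu")
```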
pytorch_lightning/trainer/connectors/logger_connector/result.py (9 additions, 12 deletions)

@@ -21,9 +21,8 @@

 from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin
 from pytorch_lightning.utilities import rank_zero_warn
-from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections
+from pytorch_lightning.utilities.apply_func import apply_to_collection, apply_to_collections, move_data_to_device
 from pytorch_lightning.utilities.data import extract_batch_size
-from pytorch_lightning.utilities.distributed import distributed_available
 from pytorch_lightning.utilities.enums import LightningEnum
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.metrics import metrics_to_scalars
@@ -254,12 +253,7 @@ def __getstate__(self, drop_value: bool = False) -> dict:
         if not self.is_tensor and drop_value:
             # Avoid serializing ResultMetrics which are passed Metrics
             skip.append('value')
-        with self.sync_context(
-            should_sync=not self.meta.sync.rank_zero_only,
-            process_group=self.meta.sync.group,
-            distributed_available=distributed_available
-        ):
-            d = {k: v for k, v in self.__dict__.items() if k not in skip}
+        d = {k: v for k, v in self.__dict__.items() if k not in skip}
         d['meta'] = d['meta'].__getstate__()
         d['_class'] = self.__class__.__name__
         return d
@@ -276,6 +270,12 @@ def _reconstruct(cls, state: dict, sync_fn: Optional[Callable] = None) -> 'Resul
         result_metric.__setstate__(state, sync_fn=sync_fn)
         return result_metric

+    def to(self, *args: Any, **kwargs: Any) -> 'DeviceDtypeModuleMixin':
+        self.__dict__.update(
+            apply_to_collection(self.__dict__, (torch.Tensor, Metric), move_data_to_device, *args, **kwargs)
+        )
+        return self
+

 class ResultMetricCollection(dict):
     """
@@ -597,10 +597,7 @@ def extract_batch_size(self, batch: Any) -> None:
     def to(self, *args, **kwargs) -> 'ResultCollection':
         """Move all data to the given device."""

-        def to_(item: Union[torch.Tensor, Metric], *args: Any, **kwargs: Any) -> Union[torch.Tensor, Metric]:
-            return item.to(*args, **kwargs)
-
-        apply_to_collection(self, (torch.Tensor, Metric), to_, *args, **kwargs)
+        self.update(apply_to_collection(dict(self), (torch.Tensor, Metric), move_data_to_device, *args, **kwargs))

         if self.minimize is not None:
             self.minimize = self.minimize.to(*args, **kwargs)
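One detail worth calling out in `ResultCollection.to`: `Tensor.to()` is not in-place, so the moved copies have to be stored back (here via `self.update(...)`), whereas `Metric` objects, being `nn.Module`s, do move in place. A tiny illustration of the difference, using a dtype change so it runs without a GPU:

```python
import torch

results = {"loss": torch.zeros(1)}

# Calling .to() without storing the result: a new tensor is returned and immediately dropped.
results["loss"].to(torch.float64)
assert results["loss"].dtype == torch.float32

# Storing the converted copies back, which is what dict.update does for ResultCollection.
results.update({k: v.to(torch.float64) for k, v in results.items()})
assert results["loss"].dtype == torch.float64
```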
pytorch_lightning/trainer/trainer.py (1 addition, 0 deletions)

@@ -964,6 +964,7 @@ def _post_dispatch(self):
         # which need to happen before.
         self.accelerator.teardown()
         self._active_loop.teardown()
+        self.logger_connector.teardown()

     def _dispatch(self):
         if self.evaluating:
tests/trainer/test_trainer.py (40 additions, 0 deletions)

@@ -1969,3 +1969,43 @@ def training_step(self, batch, batch_idx):
     # simulate random failure in training_step on rank 0
     with pytest.raises(DeadlockDetectedException, match="CustomException"):
         trainer.fit(model)
+
+
+@RunIf(min_gpus=1)
+def test_multiple_trainer_constant_memory_allocated(tmpdir):
+    """
+    This tests ensures calling the trainer several times reset the memory back to 0.
+    """
+
+    class TestModel(BoringModel):
+
+        def training_step(self, batch, batch_idx):
+            loss = super().training_step(batch, batch_idx)
+            self.log("train_loss", loss["loss"])
+            return loss
+
+        def configure_optimizers(self):
+            return torch.optim.Adam(self.layer.parameters(), lr=0.1)
+
+    initial = torch.cuda.memory_allocated(0)
+
+    model = TestModel()
+    trainer_kwargs = dict(
+        default_root_dir=tmpdir, fast_dev_run=True, gpus=1, accelerator="ddp", progress_bar_refresh_rate=0
+    )
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
+
+    assert list(trainer.optimizers[0].state.values())[0]["exp_avg_sq"].device == torch.device("cpu")
+    assert trainer.callback_metrics['train_loss'].device == torch.device("cpu")
+
+    memory_1 = torch.cuda.memory_allocated(0)
+    deepcopy(trainer)
+    memory_2 = torch.cuda.memory_allocated(0)
+    assert memory_1 == memory_2 == initial
+
+    trainer_2 = Trainer(**trainer_kwargs)
+    trainer_2.fit(model)
+    memory_3 = torch.cuda.memory_allocated(0)
+
+    assert initial == memory_1 == memory_3
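Why the test deep-copies the trainer: `copy.deepcopy` duplicates every tensor it can reach, so if anything reachable from the trainer still lived on the GPU, `torch.cuda.memory_allocated` would jump. A standalone sketch of that effect (assumes a CUDA device is available):

```python
import copy

import torch

holder = {"stray": torch.zeros(1024, 1024, device="cuda")}  # a GPU tensor still being referenced

before = torch.cuda.memory_allocated()
clone = copy.deepcopy(holder)  # deepcopy duplicates the CUDA tensor
after = torch.cuda.memory_allocated()

assert after > before  # a reachable GPU tensor doubles its footprint under deepcopy
del clone
```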