Clean-up CUDA cache. (#6325)
ysiraichi authored and bhavya01 committed Apr 22, 2024
1 parent 82624ff commit 789eb6d
Showing 2 changed files with 23 additions and 17 deletions.
benchmarks/torchbench_model.py (28 changes: 21 additions & 7 deletions)

@@ -166,6 +166,14 @@ class TorchBenchModel(BenchmarkModel):
   def __init__(self, suite_name, model_name, benchmark_experiment):
     super().__init__(suite_name, model_name, benchmark_experiment)
 
+  def _cleanup(self):
+    # Garbage-collect right now.
+    gc.collect()
+
+    # If we are using CUDA, clean up its leftover cache.
+    if self.benchmark_experiment.accelerator == "cuda":
+      torch.cuda.empty_cache()
+
   def set_up(self):
     """Set up module, actual batch_size, example_inputs, and optimizer_class
@@ -181,12 +189,16 @@ def set_up(self):

     # Move the initialized model to XLA device.
     if self.benchmark_experiment.xla:
-      import torch.utils._pytree as pytree
+      # First, move the model and the inputs to CPU.
+      # This avoids having duplicated data on CUDA.
+      if self.benchmark_experiment.accelerator == "cuda":
+        self.module = self.module.to("cpu")
+        self.example_inputs = move_to_device(self.example_inputs, "cpu")
+        self._cleanup()
+
       device = self.benchmark_experiment.get_device()
       self.module = self.module.to(device)
-      self.example_inputs = pytree.tree_map_only(torch.Tensor,
-                                                 lambda t: t.to(device),
-                                                 self.example_inputs)
+      self.example_inputs = move_to_device(self.example_inputs, device)
 
     # Torchbench has quite different setup for yolov3, so directly passing
     # the right example_inputs
# Torchbench has quite different setup for yolov3, so directly passing
# the right example_inputs
@@ -197,7 +209,7 @@ def set_up(self):
       self.optimizer = benchmark.optimizer
 
     del benchmark
-    gc.collect()
+    self._cleanup()
 
   def load_benchmark(self):
     try:
@@ -247,13 +259,15 @@ def default_precision_flag(self):
     elif test == "train" and hasattr(benchmark, 'DEFAULT_TRAIN_CUDA_PRECISION'):
       precision = benchmark.DEFAULT_TRAIN_CUDA_PRECISION
     else:
+      precision = None
       logger.warning("No default precision set. No patching needed.")
-      return None
 
     del benchmark
-    gc.collect()
+    self._cleanup()
 
     precision_flag = None
+    if precision is None:
+      return None
     if precision == "fp16":
       precision_flag = 'XLA_USE_FP16'
     elif precision == "amp":
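A note on the ordering inside `_cleanup`: `torch.cuda.empty_cache()` only releases cached allocator blocks that are no longer occupied, so running `gc.collect()` first gives lingering Python references a chance to die before the cache is emptied. A minimal standalone sketch of the same pattern (the `cleanup` helper and the tensor size are illustrative, not part of this commit):

import gc

import torch


def cleanup(accelerator):
  # Collect garbage first so dead tensors actually release their storage.
  gc.collect()
  # Only then can empty_cache() hand unoccupied cached blocks back to the driver.
  if accelerator == "cuda":
    torch.cuda.empty_cache()


if torch.cuda.is_available():
  x = torch.randn(1024, 1024, device="cuda")
  del x
  print(torch.cuda.memory_reserved())  # nonzero: the freed block is still cached
  cleanup("cuda")
  print(torch.cuda.memory_reserved())  # drops once the cache is emptied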
benchmarks/util.py (12 changes: 2 additions & 10 deletions)

@@ -7,6 +7,7 @@
 import random
 import subprocess
 import torch
+import torch.utils._pytree as pytree
 import sys
 import torch_xla.core.xla_model as xm
 from torch_xla._internal import tpu
@@ -76,16 +77,7 @@ def is_xla_device_available(devkind):


 def move_to_device(item, device):
-  if isinstance(item, torch.Tensor):
-    return item.to(device=device)
-  elif isinstance(item, list):
-    return [move_to_device(t, device) for t in item]
-  elif isinstance(item, tuple):
-    return tuple(move_to_device(t, device) for t in item)
-  elif isinstance(item, dict):
-    return dict((k, move_to_device(t, device)) for k, t in item.items())
-  else:
-    return item
+  return pytree.tree_map_only(torch.Tensor, lambda t: t.to(device), item)
 
 
 def randomize_input(inputs):
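For reference, `pytree.tree_map_only(torch.Tensor, fn, item)` applies `fn` to every tensor leaf of an arbitrarily nested structure of lists, tuples, and dicts and passes every other leaf through unchanged, matching the hand-rolled recursion it replaces here. A quick sketch (the nested structure is made up for illustration):

import torch
import torch.utils._pytree as pytree

inputs = {"ids": torch.zeros(2, 3), "meta": ("tag", [torch.ones(4), 7])}

moved = pytree.tree_map_only(torch.Tensor, lambda t: t.to("cpu"), inputs)
# Tensor leaves are mapped; non-tensor leaves ("tag", 7) come back as-is.
assert moved["meta"][1][1] == 7

One nuance: the pytree version also descends into registered container types the old code did not handle, such as namedtuples and OrderedDicts.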
