From d718cc4c8027fbebf66e2f9eeb8a27e4f07d7d8d Mon Sep 17 00:00:00 2001
From: Mark Shields
Date: Tue, 10 Aug 2021 16:10:26 -0700
Subject: [PATCH] [tmp] Clutching at straws, but force a gc between
 Sphinx-Gallery runs to trigger freeing of CUDA memory.

Sphinx-Gallery has an explicit gc.collect(), but I think it is only
triggered as part of the memory-stats machinery.
---
 docs/conf.py                        | 29 +++++++++++++++++++++++------
 src/runtime/cuda/cuda_device_api.cc |  2 ++
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index b008c305b1e7..4f5eadbb553a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,6 +29,7 @@
 #
 # All configuration values have a default; values that are commented out
 # serve to show the default.
+import gc
 import sys
 import inspect
 import os, subprocess
@@ -201,12 +202,11 @@ def git_describe_version(original_version):
 
 subsection_order = ExplicitOrder(
     [
-        "../tutorials/get_started",
+        # DO NOT CHECKIN
         "../tutorials/frontend",
-        "../tutorials/language",
+        "../tutorials/get_started",
         "../tutorials/optimize",
-        "../tutorials/autotvm",
-        "../tutorials/auto_scheduler",
+        "../tutorials/language",
         "../tutorials/dev",
         "../tutorials/topi",
         "../tutorials/deployment",
@@ -214,6 +214,8 @@
         "../vta/tutorials/frontend",
         "../vta/tutorials/optimize",
         "../vta/tutorials/autotvm",
+        "../tutorials/autotvm",
+        "../tutorials/auto_scheduler",
     ]
 )
 
@@ -234,11 +236,14 @@
         "relay_quick_start.py",
     ],
     "frontend": [
+        # DO NOT CHECKIN
+        "from_mxnet.py",
+        "from_keras.py",
+        "deploy_quantized.py",
+        "deploy_ssd_gluoncv.py",
         "from_pytorch.py",
         "from_tensorflow.py",
-        "from_mxnet.py",
         "from_onnx.py",
-        "from_keras.py",
         "from_tflite.py",
         "from_coreml.py",
         "from_darknet.py",
@@ -300,6 +305,15 @@ def __call__(self, filename):
         return filename
 
 
+# When running the tutorials on GPUs we depend on the Python garbage collector
+# collecting TVM packed-function closures before any device memory they hold is
+# released. That is a poor setup for machines with lots of CPU RAM but
+# constrained GPU RAM, so force a gc after each example.
+def force_gc(gallery_conf, fname):
+    print("(Forcing Python gc after '{}' to avoid lag in reclaiming CUDA memory)".format(fname))
+    gc.collect()
+    print("(Remaining garbage: {})".format(gc.garbage))
+
 sphinx_gallery_conf = {
     "backreferences_dir": "gen_modules/backreferences",
     "doc_module": ("tvm", "numpy"),
@@ -317,6 +331,9 @@
     "download_all_examples": False,
     "min_reported_time": 60,
     "expected_failing_examples": [],
+    "reset_modules": (force_gc, "matplotlib", "seaborn"),
+    "abort_on_example_error": True,
+    "show_memory": True,
 }
 
 autodoc_default_options = {
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 11a063d90156..5e33b1ff4f86 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -127,9 +127,11 @@ class CUDADeviceAPI final : public DeviceAPI {
 
   void FreeDataSpace(Device dev, void* ptr) final {
     if (dev.device_type == kDLCUDAHost) {
+      LOG(INFO) << "freeing host memory";
      CUDA_CALL(cudaFreeHost(ptr));
     } else {
       CUDA_CALL(cudaSetDevice(dev.device_id));
+      LOG(INFO) << "freeing device memory";
       CUDA_CALL(cudaFree(ptr));
     }
   }
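
Note for reviewers: the "reset_modules" hook is how force_gc gets run. Per the Sphinx-Gallery docs, each entry is either the name of a built-in resetter ("matplotlib", "seaborn") or a callable invoked with the gallery config and the example filename. The following is a toy sketch of that dispatch under those assumptions, not Sphinx-Gallery's actual code; BUILTIN_RESETTERS and run_resetters are hypothetical names.

import gc


def force_gc(gallery_conf, fname):
    # Same shape as the resetter added to docs/conf.py above.
    print("(Forcing Python gc after '{}')".format(fname))
    gc.collect()


# Hypothetical stand-ins for Sphinx-Gallery's built-in resetters.
BUILTIN_RESETTERS = {
    "matplotlib": lambda conf, fname: None,  # would close figures and reset rcParams
    "seaborn": lambda conf, fname: None,  # would reset seaborn's global state
}


def run_resetters(reset_modules, gallery_conf, fname):
    # Strings name built-in resetters; callables are invoked directly.
    for entry in reset_modules:
        resetter = BUILTIN_RESETTERS[entry] if isinstance(entry, str) else entry
        resetter(gallery_conf, fname)


# One simulated gallery run: every example triggers every resetter.
gallery_conf = {"reset_modules": (force_gc, "matplotlib", "seaborn")}
for fname in ["from_pytorch.py", "from_tensorflow.py"]:
    run_resetters(gallery_conf["reset_modules"], gallery_conf, fname)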
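
And the reason an explicit collect helps at all: CPython frees most objects promptly by reference counting, but anything caught in a reference cycle waits for the cyclic collector. If such an object releases device memory from its finalizer, as the TVM packed-function closures mentioned in the conf.py comment effectively do, the CUDA allocation can outlive the example that created it. A toy illustration (FakeDeviceBuffer is made up; it is not TVM's actual runtime object):

import gc


class FakeDeviceBuffer:
    """Stands in for an object whose finalizer frees CUDA memory."""

    def __init__(self, name):
        self.name = name
        self.closure = None

    def __del__(self):
        print("freeing device memory for " + self.name)


buf = FakeDeviceBuffer("example-output")
buf.closure = {"captured": buf}  # reference cycle, like a closure capturing its own results
del buf  # refcount never hits zero, so __del__ does not run here
print("after del: the cycle still pins the buffer (and its GPU memory)")

gc.collect()  # the cyclic collector breaks the cycle and runs __del__
print("after gc.collect(): device memory reclaimed")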