From d718cc4c8027fbebf66e2f9eeb8a27e4f07d7d8d Mon Sep 17 00:00:00 2001
From: Mark Shields
Date: Tue, 10 Aug 2021 16:10:26 -0700
Subject: [PATCH] [tmp] Clutching at straws, but force a gc between
 Sphinx-Gallery runs to trigger freeing of CUDA memory.

Sphinx-Gallery has an explicit gc.collect(), but I think it is only
triggered as part of the memory-stats machinery.
---
 docs/conf.py                        | 29 +++++++++++++++++++++++------
 src/runtime/cuda/cuda_device_api.cc |  2 ++
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index b008c305b1e7..4f5eadbb553a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -29,6 +29,7 @@
 #
 # All configuration values have a default; values that are commented out
 # serve to show the default.
+import gc
 import sys
 import inspect
 import os, subprocess
@@ -201,12 +202,11 @@ def git_describe_version(original_version):
 
 subsection_order = ExplicitOrder(
     [
-        "../tutorials/get_started",
+        # DO NOT CHECKIN
         "../tutorials/frontend",
-        "../tutorials/language",
+        "../tutorials/get_started",
         "../tutorials/optimize",
-        "../tutorials/autotvm",
-        "../tutorials/auto_scheduler",
+        "../tutorials/language",
         "../tutorials/dev",
         "../tutorials/topi",
         "../tutorials/deployment",
@@ -214,6 +214,8 @@
         "../vta/tutorials/frontend",
         "../vta/tutorials/optimize",
         "../vta/tutorials/autotvm",
+        "../tutorials/autotvm",
+        "../tutorials/auto_scheduler",
     ]
 )
 
@@ -234,11 +236,14 @@
         "relay_quick_start.py",
     ],
     "frontend": [
+        # DO NOT CHECKIN
+        "from_mxnet.py",
+        "from_keras.py",
+        "deploy_quantized.py",
+        "deploy_ssd_gluoncv.py",
         "from_pytorch.py",
         "from_tensorflow.py",
-        "from_mxnet.py",
         "from_onnx.py",
-        "from_keras.py",
         "from_tflite.py",
         "from_coreml.py",
         "from_darknet.py",
@@ -300,6 +305,15 @@ def __call__(self, filename):
         return filename
 
 
+# When running the tutorials on GPUs we depend on the Python garbage collector
+# collecting TVM packed-function closures before any device memory they hold is
+# released. That is a poor setup for machines with lots of CPU RAM but
+# constrained GPU RAM, so force a gc after each example.
+def force_gc(gallery_conf, fname):
+    print("(Forcing Python gc after '{}' to avoid lag in reclaiming CUDA memory)".format(fname))
+    gc.collect()
+    print("(Remaining garbage: {})".format(gc.garbage))
+
 sphinx_gallery_conf = {
     "backreferences_dir": "gen_modules/backreferences",
     "doc_module": ("tvm", "numpy"),
@@ -317,6 +331,9 @@
     "download_all_examples": False,
     "min_reported_time": 60,
     "expected_failing_examples": [],
+    "reset_modules": (force_gc, "matplotlib", "seaborn"),
+    "abort_on_example_error": True,
+    "show_memory": True,
 }
 
 autodoc_default_options = {
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 11a063d90156..5e33b1ff4f86 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -127,9 +127,11 @@ class CUDADeviceAPI final : public DeviceAPI {
 
   void FreeDataSpace(Device dev, void* ptr) final {
     if (dev.device_type == kDLCUDAHost) {
+      LOG(INFO) << "freeing host memory";
      CUDA_CALL(cudaFreeHost(ptr));
     } else {
       CUDA_CALL(cudaSetDevice(dev.device_id));
+      LOG(INFO) << "freeing device memory";
       CUDA_CALL(cudaFree(ptr));
     }
   }
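
Note for reviewers: the "reset_modules" hook is how force_gc gets run. Per the Sphinx-Gallery docs, each entry is either the name of a built-in resetter ("matplotlib", "seaborn") or a callable invoked with the gallery config and the example filename. The following is a toy sketch of that dispatch under those assumptions, not Sphinx-Gallery's actual code; BUILTIN_RESETTERS and run_resetters are hypothetical names.

import gc


def force_gc(gallery_conf, fname):
    # Same shape as the resetter added to docs/conf.py above.
    print("(Forcing Python gc after '{}')".format(fname))
    gc.collect()


# Hypothetical stand-ins for Sphinx-Gallery's built-in resetters.
BUILTIN_RESETTERS = {
    "matplotlib": lambda conf, fname: None,  # would close figures and reset rcParams
    "seaborn": lambda conf, fname: None,  # would reset seaborn's global state
}


def run_resetters(reset_modules, gallery_conf, fname):
    # Strings name built-in resetters; callables are invoked directly.
    for entry in reset_modules:
        resetter = BUILTIN_RESETTERS[entry] if isinstance(entry, str) else entry
        resetter(gallery_conf, fname)


# One simulated gallery run: every example triggers every resetter.
gallery_conf = {"reset_modules": (force_gc, "matplotlib", "seaborn")}
for fname in ["from_pytorch.py", "from_tensorflow.py"]:
    run_resetters(gallery_conf["reset_modules"], gallery_conf, fname)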
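
And the reason an explicit collect helps at all: CPython frees most objects promptly by reference counting, but anything caught in a reference cycle waits for the cyclic collector. If such an object releases device memory from its finalizer, as the TVM packed-function closures mentioned in the conf.py comment effectively do, the CUDA allocation can outlive the example that created it. A toy illustration (FakeDeviceBuffer is made up; it is not TVM's actual runtime object):

import gc


class FakeDeviceBuffer:
    """Stands in for an object whose finalizer frees CUDA memory."""

    def __init__(self, name):
        self.name = name
        self.closure = None

    def __del__(self):
        print("freeing device memory for " + self.name)


buf = FakeDeviceBuffer("example-output")
buf.closure = {"captured": buf}  # reference cycle, like a closure capturing its own results
del buf  # refcount never hits zero, so __del__ does not run here
print("after del: the cycle still pins the buffer (and its GPU memory)")

gc.collect()  # the cyclic collector breaks the cycle and runs __del__
print("after gc.collect(): device memory reclaimed")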