From 51a61954975d714f23074dad76802074274dbfc9 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Sat, 21 Oct 2023 11:02:43 +0200 Subject: [PATCH] Check for support of CUDA Memory Pools at runtime (#4679) Some CUDA GPUs, like the Quadro M3000M, don't support Memory Pools operations like cudaMallocAsync/cudaFreeAsync even on driver versions newer than 11.2, and this can result in errors like: CUDA runtime error: operation not supported So check for support at runtime instead of compile time. Still keep the compile time check to support building with CUDA versions older than 11.2. --- CHANGELOG.md | 1 + cpp/open3d/core/CUDAUtils.cpp | 21 +++++++++++++++++++++ cpp/open3d/core/CUDAUtils.h | 7 +++++++ cpp/open3d/core/MemoryManagerCUDA.cpp | 14 +++++++++++--- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 96ad848dd6e..0d3ba84ed84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ * Fix some bad triangle generation in TriangleMesh::SimplifyQuadricDecimation * Fix printing of tensor in gpu and add validation check for bounds of axis-aligned bounding box (PR #6444) * Python 3.11 support. 
bump pybind11 v2.6.2 -> v2.11.1 +* Check for support of CUDA Memory Pools at runtime (#4679) ## 0.13 diff --git a/cpp/open3d/core/CUDAUtils.cpp b/cpp/open3d/core/CUDAUtils.cpp index 630cbf38938..1da331035e7 100644 --- a/cpp/open3d/core/CUDAUtils.cpp +++ b/cpp/open3d/core/CUDAUtils.cpp @@ -108,6 +108,27 @@ void AssertCUDADeviceAvailable(const Device& device) { } } +bool SupportsMemoryPools(const Device& device) { +#if defined(BUILD_CUDA_MODULE) && (CUDART_VERSION >= 11020) + if (device.IsCUDA()) { + int driverVersion = 0; + int deviceSupportsMemoryPools = 0; + OPEN3D_CUDA_CHECK(cudaDriverGetVersion(&driverVersion)); + if (driverVersion >= + 11020) { // avoid invalid value error in cudaDeviceGetAttribute + OPEN3D_CUDA_CHECK(cudaDeviceGetAttribute( + &deviceSupportsMemoryPools, cudaDevAttrMemoryPoolsSupported, + device.GetID())); + } + return !!deviceSupportsMemoryPools; + } else { + return false; + } +#else + return false; +#endif +} + #ifdef BUILD_CUDA_MODULE int GetDevice() { int device; diff --git a/cpp/open3d/core/CUDAUtils.h b/cpp/open3d/core/CUDAUtils.h index 2996cd0987c..15f87be040d 100644 --- a/cpp/open3d/core/CUDAUtils.h +++ b/cpp/open3d/core/CUDAUtils.h @@ -255,6 +255,13 @@ void AssertCUDADeviceAvailable(int device_id); /// \param device The device to be checked. void AssertCUDADeviceAvailable(const Device& device); +/// Checks if the CUDA device supports Memory Pools +/// used by the Stream Ordered Memory Allocator, +/// see +/// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html +/// \param device The device to be checked. 
+bool SupportsMemoryPools(const Device& device); + #ifdef BUILD_CUDA_MODULE int GetDevice(); diff --git a/cpp/open3d/core/MemoryManagerCUDA.cpp b/cpp/open3d/core/MemoryManagerCUDA.cpp index 3cc2f4730bc..835ea550cfe 100644 --- a/cpp/open3d/core/MemoryManagerCUDA.cpp +++ b/cpp/open3d/core/MemoryManagerCUDA.cpp @@ -20,8 +20,12 @@ void* MemoryManagerCUDA::Malloc(size_t byte_size, const Device& device) { void* ptr; if (device.IsCUDA()) { #if CUDART_VERSION >= 11020 - OPEN3D_CUDA_CHECK(cudaMallocAsync(static_cast<void**>(&ptr), byte_size, - cuda::GetStream())); + if (cuda::SupportsMemoryPools(device)) { + OPEN3D_CUDA_CHECK(cudaMallocAsync(static_cast<void**>(&ptr), + byte_size, cuda::GetStream())); + } else { + OPEN3D_CUDA_CHECK(cudaMalloc(static_cast<void**>(&ptr), byte_size)); + } #else OPEN3D_CUDA_CHECK(cudaMalloc(static_cast<void**>(&ptr), byte_size)); #endif @@ -38,7 +42,11 @@ void MemoryManagerCUDA::Free(void* ptr, const Device& device) { if (device.IsCUDA()) { if (ptr && IsCUDAPointer(ptr, device)) { #if CUDART_VERSION >= 11020 - OPEN3D_CUDA_CHECK(cudaFreeAsync(ptr, cuda::GetStream())); + if (cuda::SupportsMemoryPools(device)) { + OPEN3D_CUDA_CHECK(cudaFreeAsync(ptr, cuda::GetStream())); + } else { + OPEN3D_CUDA_CHECK(cudaFree(ptr)); + } #else OPEN3D_CUDA_CHECK(cudaFree(ptr)); #endif