From b12ad2fed38013759fab8b969b6949ce4f278ee8 Mon Sep 17 00:00:00 2001
From: Zihao Ye <expye@outlook.com>
Date: Mon, 17 Jul 2023 17:25:21 -0700
Subject: [PATCH] [Runtime] Device API to query L2 cache size (#15332)

Follow-up to #15305: this PR adds an API to query the device L2 cache size in bytes.
Currently, the supported devices include CUDA, OpenCL, and ROCm.

Note that the OpenCL API does not return an accurate device L2 cache size.
I could not find a Vulkan API that returns the L2 texture cache size, but the `vkCmdPipelineBarrier` call flushes the L2 texture cache automatically (https://zeux.io/2020/02/27/writing-an-efficient-vulkan-renderer/), so we return 0 by default.
---
 include/tvm/runtime/device_api.h        |  3 ++-
 python/tvm/_ffi/runtime_ctypes.py       | 18 ++++++++++++++++++
 python/tvm/target/target.py             |  4 ++++
 src/runtime/cuda/cuda_device_api.cc     |  6 ++++++
 src/runtime/metal/metal_device_api.mm   |  2 ++
 src/runtime/opencl/opencl_device_api.cc |  7 +++++++
 src/runtime/rocm/rocm_device_api.cc     |  5 +++++
 src/runtime/vulkan/vulkan_device_api.cc |  3 +++
 8 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index d3c2f9ba38572..654018565716b 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -48,7 +48,8 @@ enum DeviceAttrKind : int {
   kMaxRegistersPerBlock = 9,
   kGcnArch = 10,
   kApiVersion = 11,
-  kDriverVersion = 12
+  kDriverVersion = 12,
+  kL2CacheSizeBytes = 13,
 };
 
 #ifdef TVM_KALLOC_ALIGNMENT
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 363843018927b..7836f4224769c 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -488,6 +488,24 @@ def driver_version(self):
         """
         return self._GetDeviceAttr(self.device_type, self.device_id, 12)
 
+    @property
+    def l2_cache_size_bytes(self):
+        """Return the size of the device L2 cache in bytes
+
+        Supported devices include CUDA/ROCM/OpenCL.
+
+        Returns
+        -------
+        l2_cache_size_bytes : int or None
+            The size of the device L2 cache in bytes returned by device runtime API.
+            Return None if the device does not support this feature.
+
+        Note
+        ----
+        The value returned by opencl's API is smaller than actual device L2 cache size.
+        """
+        return self._GetDeviceAttr(self.device_type, self.device_id, 13)
+
     def texture_spatial_limit(self):
         """Returns limits for textures by spatial dimensions
 
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 0c834c5f026ef..0117420c21400 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -244,6 +244,10 @@ def supports_cooperative_matrix(self):
     def features(self):
         return TargetFeatures(self)
 
+    @property
+    def l2_cache_size_bytes(self):
+        return int(self.attrs.get("l2_cache_size_bytes", 0))
+
     def get_kind_attr(self, attr_name):
         """Get additional attribute about the target kind.
 
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index b8854f88cbe1a..5cf6cd7342a0e 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -105,6 +105,12 @@ class CUDADeviceAPI final : public DeviceAPI {
       }
       case kDriverVersion:
         return;
+      case kL2CacheSizeBytes:
+        // Get size of device l2 cache size in bytes.
+        int l2_size = 0;
+        CUDA_CALL(cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, dev.device_id));
+        *rv = l2_size;
+        return;
     }
     *rv = value;
   }
diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm
index 0f1be0cc95ea0..1e60019f144d2 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -81,6 +81,8 @@
         return;
       case kDriverVersion:
         return;
+      case kL2CacheSizeBytes:
+        return;
     }
   };
 }
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index f3eb8d83a2107..0d1f4af2bbf10 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -198,6 +198,13 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = std::string(value);
       break;
     }
+    case kL2CacheSizeBytes:
+      // NOTE(Zihao): this API cannot reflect the real L2 cache size in both CUDA/AMD GPUs.
+      cl_ulong value;
+      OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(value), &value,
+                                  nullptr));
+      *rv = static_cast<int64_t>(value);
+      break;
   }
 }
 
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 4e758b7fd977d..c2fb42ee360aa 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -122,6 +122,13 @@ class ROCMDeviceAPI final : public DeviceAPI {
       }
       case kDriverVersion:
         return;
+      case kL2CacheSizeBytes:
+        // Get size of device l2 cache size in bytes.
+        int l2_size;
+        ROCM_CALL(hipDeviceGetAttribute(&l2_size, hipDeviceAttributeL2CacheSize, device.device_id));
+        *rv = l2_size;
+        // Return here: falling out of the switch would run `*rv = value` below
+        // and overwrite the L2 cache size that was just stored.
+        return;
     }
     *rv = value;
   }
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index 1087415256025..d67746856cfc1 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -160,6 +160,9 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv)
       *rv = os.str();
       break;
     }
+
+    case kL2CacheSizeBytes:
+      break;
   }
 }