From b12ad2fed38013759fab8b969b6949ce4f278ee8 Mon Sep 17 00:00:00 2001 From: Zihao Ye Date: Mon, 17 Jul 2023 17:25:21 -0700 Subject: [PATCH] [Runtime] Device API to query L2 cache size (#15332) As a follow-up to #15305, this PR adds an API to query the device L2 cache size in bytes. Currently, the supported devices include CUDA, OpenCL, and ROCm. Note that OpenCL's API does not return the accurate device L2 cache size. I cannot find a Vulkan API that returns the L2 texture cache size, but the `vkCmdPipelineBarrier` call will flush the L2 texture cache automatically (https://zeux.io/2020/02/27/writing-an-efficient-vulkan-renderer/), so we return 0 by default. --- include/tvm/runtime/device_api.h | 3 ++- python/tvm/_ffi/runtime_ctypes.py | 18 ++++++++++++++++++ python/tvm/target/target.py | 4 ++++ src/runtime/cuda/cuda_device_api.cc | 6 ++++++ src/runtime/metal/metal_device_api.mm | 2 ++ src/runtime/opencl/opencl_device_api.cc | 7 +++++++ src/runtime/rocm/rocm_device_api.cc | 5 +++++ src/runtime/vulkan/vulkan_device_api.cc | 3 +++ 8 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index d3c2f9ba38572..654018565716b 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -48,7 +48,8 @@ enum DeviceAttrKind : int { kMaxRegistersPerBlock = 9, kGcnArch = 10, kApiVersion = 11, - kDriverVersion = 12 + kDriverVersion = 12, + kL2CacheSizeBytes = 13, }; #ifdef TVM_KALLOC_ALIGNMENT diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 363843018927b..7836f4224769c 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -488,6 +488,24 @@ def driver_version(self): """ return self._GetDeviceAttr(self.device_type, self.device_id, 12) + @property + def l2_cache_size_bytes(self): + """Return the size of the device L2 cache in bytes + + Supported devices include CUDA/ROCM/OpenCL. 
+ + Returns + ------- + l2_cache_size_bytes : int or None + The size of the device L2 cache in bytes returned by device runtime API. + Return None if the device does not support this feature. + + Note + ---- + The value returned by opencl's API is smaller than actual device L2 cache size. + """ + return self._GetDeviceAttr(self.device_type, self.device_id, 13) + def texture_spatial_limit(self): """Returns limits for textures by spatial dimensions diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 0c834c5f026ef..0117420c21400 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -244,6 +244,10 @@ def supports_cooperative_matrix(self): def features(self): return TargetFeatures(self) + @property + def l2_cache_size_bytes(self): + return int(self.attrs.get("l2_cache_size_bytes", 0)) + def get_kind_attr(self, attr_name): """Get additional attribute about the target kind. diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b8854f88cbe1a..5cf6cd7342a0e 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -105,6 +105,12 @@ class CUDADeviceAPI final : public DeviceAPI { } case kDriverVersion: return; + case kL2CacheSizeBytes: + // Get size of device l2 cache size in bytes. 
+ int l2_size = 0; + CUDA_CALL(cudaDeviceGetAttribute(&l2_size, cudaDevAttrL2CacheSize, dev.device_id)); + *rv = l2_size; + return; } *rv = value; } diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 0f1be0cc95ea0..1e60019f144d2 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -81,6 +81,8 @@ return; case kDriverVersion: return; + case kL2CacheSizeBytes: + return; } }; } diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index f3eb8d83a2107..0d1f4af2bbf10 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -198,6 +198,13 @@ void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) *rv = std::string(value); break; } + case kL2CacheSizeBytes: + // NOTE(Zihao): this API cannot reflect the real L2 cache size in both CUDA/AMD GPUs. + cl_ulong value; + OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(value), &value, + nullptr)); + *rv = static_cast(value); + break; } } diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 4e758b7fd977d..c2fb42ee360aa 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -122,6 +122,11 @@ class ROCMDeviceAPI final : public DeviceAPI { } case kDriverVersion: return; + case kL2CacheSizeBytes: + // Get size of device l2 cache size in bytes. 
+ int l2_size; + ROCM_CALL(hipDeviceGetAttribute(&l2_size, hipDeviceAttributeL2CacheSize, device.device_id)); + *rv = l2_size; } *rv = value; } diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc index 1087415256025..d67746856cfc1 100644 --- a/src/runtime/vulkan/vulkan_device_api.cc +++ b/src/runtime/vulkan/vulkan_device_api.cc @@ -160,6 +160,9 @@ void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) *rv = os.str(); break; } + + case kL2CacheSizeBytes: + break; } }