From 37133cae69c619d6a2b7da263c277e399349324c Mon Sep 17 00:00:00 2001
From: Jianyu Huang
Date: Sun, 28 Apr 2024 18:03:16 -0700
Subject: [PATCH] Enhance AMD support

Summary: Support AMD GPU build.

Differential Revision: D56686760
---
 .../experimental/gen_ai/gen_ai/__init__.py    | 20 +++------
 .../gen_ai/src/attention/gqa_attn_splitk.cu   | 42 +++++++++++++++++++
 .../gen_ai/src/quantize/quantize.cu           | 26 ------------
 3 files changed, 48 insertions(+), 40 deletions(-)

diff --git a/fbgemm_gpu/experimental/gen_ai/gen_ai/__init__.py b/fbgemm_gpu/experimental/gen_ai/gen_ai/__init__.py
index 8c1cf2edef..024588cd2b 100644
--- a/fbgemm_gpu/experimental/gen_ai/gen_ai/__init__.py
+++ b/fbgemm_gpu/experimental/gen_ai/gen_ai/__init__.py
@@ -28,17 +28,9 @@
         os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_gen_ai_py.so")
     )
 else:
-    if torch.version.hip:
-        torch.ops.load_library(
-            "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:attention_ops_hip"
-        )
-        torch.ops.load_library(
-            "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:quantize_ops_hip"
-        )
-    else:
-        torch.ops.load_library(
-            "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:attention_ops_cuda"
-        )
-        torch.ops.load_library(
-            "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:quantize_ops_cuda"
-        )
+    torch.ops.load_library(
+        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:attention_ops"
+    )
+    torch.ops.load_library(
+        "//deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai:quantize_ops"
+    )
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu b/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu
index 5a8cfe38a2..2c4b6f5c84 100644
--- a/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu
+++ b/fbgemm_gpu/experimental/gen_ai/src/attention/gqa_attn_splitk.cu
@@ -99,6 +99,48 @@ void set_gpu_max_dynamic_shared_memory(
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 
+#ifdef __HIP_PLATFORM_AMD__
+using __nv_bfloat16 = hip_bfloat16;
+
+typedef struct __align__(4) {
+  uint16_t x;
+  uint16_t y;
+}
+__nv_bfloat162_raw;
+
+struct __align__(4) __nv_bfloat162 {
+  __nv_bfloat16 x;
+  __nv_bfloat16 y;
+};
+
+// the descriptions of __float2bfloat16 and __float2bfloat16_rn are identical
+// https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____BFLOAT16__MISC.html#group__CUDA__MATH____BFLOAT16__MISC
+static __host__ __device__ __nv_bfloat16 __float2bfloat16(float f) {
+  __nv_bfloat16 output;
+  return output.round_to_bfloat16(f);
+}
+
+static __host__ __device__ __nv_bfloat16 __float2bfloat16_rn(float f) {
+  __nv_bfloat16 output;
+  return output.round_to_bfloat16(f);
+}
+
+static __host__ __device__ float __bfloat162float(__nv_bfloat16 f) {
+  // float output;
+  // https://docs.amd.com/projects/HIP/en/docs-5.0.0/doxygen/html/hip__bfloat16_8h_source.html
+  return float(f);
+}
+
+static __host__ __device__ __nv_bfloat162
+__floats2bfloat162_rn(float x, float y) {
+  __nv_bfloat162 output;
+  output.x = __float2bfloat16_rn(x);
+  output.y = __float2bfloat16_rn(y);
+  return output;
+}
+
+#endif
+
 // TODO: Include the following code from fbgemm_gpu header
 struct __align__(16) bfx8 {
   __nv_bfloat162 vals[4];
diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
index 91f04a6069..d7c1592ee6 100644
--- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
+++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
@@ -90,31 +90,6 @@ constexpr int32_t MAX_T = 16384;
 constexpr int SMEM_ADJUST_THRESHOLD = 48 * 1024;
 #ifdef __HIP_PLATFORM_AMD__
-using __nv_bfloat16 = hip_bfloat16;
-
-typedef struct __align__(4) {
-  uint16_t x;
-  uint16_t y;
-}
-__nv_bfloat162_raw;
-
-struct __align__(4) __nv_bfloat162 {
-  __nv_bfloat16 x;
-  __nv_bfloat16 y;
-};
-
-// the descriptions of __float2bfloat16 and __float2bfloat16_rn are identical
-// https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____BFLOAT16__MISC.html#group__CUDA__MATH____BFLOAT16__MISC
-static __host__ __device__ __nv_bfloat16 __float2bfloat16(float f) {
-  __nv_bfloat16 output;
-  return output.round_to_bfloat16(f);
-}
-
-static __host__ __device__ __nv_bfloat16 __float2bfloat16_rn(float f) {
-  __nv_bfloat16 output;
-  return output.round_to_bfloat16(f);
-}
-
 static __host__ __device__ float __bfloat162float(__nv_bfloat16 f) {
   // float output;
   // https://docs.amd.com/projects/HIP/en/docs-5.0.0/doxygen/html/hip__bfloat16_8h_source.html
   return float(f);
@@ -128,7 +103,6 @@ __floats2bfloat162_rn(float x, float y) {
   output.y = __float2bfloat16_rn(y);
   return output;
 }
-
 #endif
 
 struct __align__(16) bf16x8 {
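
Note: the following is a minimal usage sketch, not part of the patch. It shows how code written against the CUDA bfloat16 names (__nv_bfloat162, __bfloat162float, __floats2bfloat162_rn) compiles unchanged on ROCm once the shims added above are in scope; on CUDA the same names come from <cuda_bf16.h>. The kernel name scale_bf16x2 and its launch shape are hypothetical.

#ifndef __HIP_PLATFORM_AMD__
#include <cuda_bf16.h> // CUDA provides the bf16 types and intrinsics natively
#endif

// Hypothetical kernel: scales packed bf16 pairs by a float factor.
__global__ void scale_bf16x2(
    const __nv_bfloat162* in,
    __nv_bfloat162* out,
    float scale,
    int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Unpack each bf16 lane to float, scale, then repack with
    // round-to-nearest via __floats2bfloat162_rn.
    const float x = __bfloat162float(in[i].x) * scale;
    const float y = __bfloat162float(in[i].y) * scale;
    out[i] = __floats2bfloat162_rn(x, y);
  }
}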