Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Use defset to clean up marking MFMA intrinsics as divergent #85915

Merged
merged 2 commits into llvm:main from defset-mfma-cleanup
Mar 21, 2024

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Mar 20, 2024

No description provided.

@llvmbot
Copy link
Collaborator

llvmbot commented Mar 20, 2024

@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/85915.diff

2 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+48-38)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+11-76)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819d2..b24dab4ba72b93 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
 // The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
 // The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
 // The content of the other 16-bit half is preserved from the input.
+
+defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX11 = {
 def int_amdgcn_wmma_f16_16x16x16_f16_tied   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
 
@@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, l
 // GFX12: The op_sel bit must be 0.
 def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
+}
 
 //===----------------------------------------------------------------------===//
 // GFX12 Intrinsics
@@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
 
-
-// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
-//
-// These operations perform a matrix multiplication and accumulation of
-// the form: D = A * B + C .
-
-// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
-def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-// A and B are <16 x iu4>.
-def int_amdgcn_wmma_i32_16x16x32_iu4     : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
-
 // SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
 //
 // These operations perform a sparse matrix multiplication and accumulation of
@@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
     [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
 >;
 
+defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX12 = {
+// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a matrix multiplication and accumulation of
+// the form: D = A * B + C .
+
+// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
+def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+// A and B are <16 x iu4>.
+def int_amdgcn_wmma_i32_16x16x32_iu4     : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
 def int_amdgcn_swmmac_f32_16x16x32_f16     : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
 def int_amdgcn_swmmac_f32_16x16x32_bf16    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
 def int_amdgcn_swmmac_f16_16x16x32_f16     : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
@@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyin
 def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
 def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
 def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+}
 
 def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
 
@@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
             [IntrConvergent, IntrNoMem,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
+defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
 def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
 def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
 def int_amdgcn_mfma_f32_4x4x1f32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
@@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v
 def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
 def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
 def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
+}
 
 //===----------------------------------------------------------------------===//
 // gfx90a intrinsics
@@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmin   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmax   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 
+defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
 def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
 def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
 def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
@@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  ll
 //       source operand.
 def int_amdgcn_mfma_f64_16x16x4f64      : AMDGPUMfmaIntrinsic<llvm_v4f64_ty,  llvm_double_ty>;
 def int_amdgcn_mfma_f64_4x4x4f64        : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
+}
 
 //===----------------------------------------------------------------------===//
 // gfx940 intrinsics
 // ===----------------------------------------------------------------------===//
 
-// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
-    [llvm_v2i16_ty],
-    [LLVMQualPointerType<3>, llvm_v2i16_ty],
-    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
-    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
-
-def int_amdgcn_mfma_i32_16x16x32_i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
-def int_amdgcn_mfma_i32_32x32x16_i8     : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
-def int_amdgcn_mfma_f32_16x16x8_xf32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
-def int_amdgcn_mfma_f32_32x32x4_xf32    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
-
 class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
   AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;
 
@@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
     def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
 }
 
-defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
-defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
-
 // llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
 class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
   ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
             [IntrConvergent, IntrNoMem,
              ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
 
-def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
-def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
-def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
-def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
-def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
-def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
-
 class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
   AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;
 
@@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
     def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
 }
 
+// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
+def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
+    [llvm_v2i16_ty],
+    [LLVMQualPointerType<3>, llvm_v2i16_ty],
+    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
+    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
+
+defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
+def int_amdgcn_mfma_i32_16x16x32_i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
+def int_amdgcn_mfma_i32_32x32x16_i8     : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_f32_16x16x8_xf32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
+def int_amdgcn_mfma_f32_32x32x4_xf32    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
+
+defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
+defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
+
+def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
+def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
+def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
+def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
+def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
+def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
+
 defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
 defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
+}
 
 // llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
 // byte_sel selects byte from srcA.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bb1c6b73372999..40d432c3b17a9e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -354,82 +354,17 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
 
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
-def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_f16>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
-def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
+foreach intr = AMDGPUMFMAIntrinsics908 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics90A in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics940 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsicsGFX11 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsicsGFX12 in
+def : SourceOfDivergence<intr>;
+
 def : SourceOfDivergence<int_amdgcn_global_load_tr>;
 
 // The dummy boolean output is divergent from the IR's perspective,

@@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.

defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX11 = {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are WMMA intrinsics, not MFMA, so the defset name is wrong. The same applies to the GFX12 defset.

@arsenm arsenm merged commit d8b0d8d into llvm:main Mar 21, 2024
3 of 4 checks passed
@arsenm arsenm deleted the defset-mfma-cleanup branch March 21, 2024 16:20
chencha3 pushed a commit to chencha3/llvm-project that referenced this pull request Mar 23, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants