Skip to content

Commit

Permalink
AMDGPU: Use defset to cleanup marking MFMA intrinsics as divergent (l…
Browse files Browse the repository at this point in the history
  • Loading branch information
arsenm authored and chencha3 committed Mar 22, 2024
1 parent cadade8 commit adfe14c
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 114 deletions.
86 changes: 48 additions & 38 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.

defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = {
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;

Expand All @@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, l
// GFX12: The op_sel bit must be 0.
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
}

//===----------------------------------------------------------------------===//
// GFX12 Intrinsics
Expand All @@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;


// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C .

// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;

// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
//
// These operations perform a sparse matrix multiplication and accumulation of
Expand Down Expand Up @@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;

defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = {
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C .

// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;

def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
Expand All @@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyin
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
}

def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;

Expand Down Expand Up @@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
Expand All @@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
}

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
Expand All @@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
Expand All @@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
}

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
// ===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

Expand All @@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
Expand All @@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

Expand All @@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
}

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
Expand Down
87 changes: 11 additions & 76 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -354,82 +354,17 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;

def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUMFMAIntrinsics90A in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUMFMAIntrinsics940 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
def : SourceOfDivergence<intr>;

def : SourceOfDivergence<int_amdgcn_global_load_tr>;

// The dummy boolean output is divergent from the IR's perspective,
Expand Down

0 comments on commit adfe14c

Please sign in to comment.