Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Use defset to cleanup marking MFMA intrinsics as divergent #85915

Merged
merged 2 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 48 additions & 38 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.

defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX11 = {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is WMMA, not MFMA. Defset name is wrong. Same for GFX12.

def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;

Expand All @@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, l
// GFX12: The op_sel bit must be 0.
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
}

//===----------------------------------------------------------------------===//
// GFX12 Intrinsics
Expand All @@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;


// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C .

// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by LLVM, we use <2 x i32>.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;

// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
//
// These operations perform a sparse matrix multiplication and accumulation of
Expand Down Expand Up @@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;

defset list<Intrinsic> AMDGPUMFMAIntrinsicsGFX12 = {
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C .

// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by LLVM, we use <2 x i32>.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;

def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
Expand All @@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyin
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
}

def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;

Expand Down Expand Up @@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
Expand All @@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
}

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
Expand All @@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
Expand All @@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
}

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// MFMA intrinsic whose A/B source type is fixed to llvm_i64_ty; only the
// destination type DestTy varies. Instantiated once per `kind` suffix by the
// multiclass of the same name.
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

Expand All @@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
Expand All @@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

// SMFMAC intrinsic with SrcA fixed to llvm_v2i32_ty and SrcB fixed to
// llvm_v4i32_ty; only the destination type DestTy varies. Instantiated once
// per `kind` suffix by the multiclass of the same name.
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

Expand All @@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

// bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

// Collect every gfx940 MFMA and SMFMAC intrinsic record defined inside the
// braces into the list AMDGPUMFMAIntrinsics940, so other TableGen code can
// iterate over the whole group (e.g. to mark each one as a
// SourceOfDivergence) instead of naming every intrinsic individually.
// Only records defined inside this defset are captured; helper classes and
// unrelated intrinsics must stay outside it.
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
// MFMA variants. Template arguments are <DestTy, SrcABTy>: the A/B source
// type is i64 for the _i8 forms and v2f32 for the _xf32 forms.
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// Each defm expands through the multiclass into one def per `kind` suffix
// (named NAME#"_"#kind); all of the expanded records land in this defset.
defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// SMFMAC variants. Template arguments are <DestTy, SrcA, SrcB>.
def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

// SMFMAC variants expanded per `kind` suffix, as with the MFMA defm lines.
defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
}

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
Expand Down
87 changes: 11 additions & 76 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -354,82 +354,17 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;

def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
// Mark every matrix intrinsic collected by the per-target defsets as a source
// of divergence. The lists are concatenated in target order (908, 90A, 940,
// GFX11, GFX12) and one anonymous SourceOfDivergence record is emitted per
// element.
// Note: the GFX11 and GFX12 lists hold WMMA/SWMMAC intrinsics even though
// their names say MFMA.
foreach MatrixIntr = !listconcat(AMDGPUMFMAIntrinsics908,
                                 AMDGPUMFMAIntrinsics90A,
                                 AMDGPUMFMAIntrinsics940,
                                 AMDGPUMFMAIntrinsicsGFX11,
                                 AMDGPUMFMAIntrinsicsGFX12) in
  def : SourceOfDivergence<MatrixIntr>;

def : SourceOfDivergence<int_amdgcn_global_load_tr>;

// The dummy boolean output is divergent from the IR's perspective,
Expand Down
Loading