Skip to content

Commit

Permalink
[TargetLowering] Correctly yield NaN from FP_TO_BF16
Browse files Browse the repository at this point in the history
We didn't set the exponent field, resulting in tiny numbers instead of
NaNs.
  • Loading branch information
majnemer committed Feb 21, 2024
1 parent 2b2881b commit 9eff001
Show file tree
Hide file tree
Showing 7 changed files with 966 additions and 967 deletions.
9 changes: 4 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10948,12 +10948,11 @@ SDValue TargetLowering::expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const {
Op = expandRoundInexactToOdd(F32, Op, dl, DAG);
Op = DAG.getNode(ISD::BITCAST, dl, I32, Op);

// Extract the sign bit.
SDValue SignBit =
DAG.getNode(ISD::AND, dl, I32, Op,
DAG.getConstant(APInt::getSignMask(32), dl, I32));
// Extract the sign bit and exponent.
SDValue SignBitAndExponentField = DAG.getNode(
ISD::AND, dl, I32, Op, DAG.getConstant(0xff800000, dl, I32));
// Set the quiet bit.
SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBit,
SDValue NaN = DAG.getNode(ISD::OR, dl, I32, SignBitAndExponentField,
DAG.getConstant(0x400000, dl, I32));

// Factor in the contribution of the low 16 bits.
Expand Down
1,864 changes: 932 additions & 932 deletions llvm/test/CodeGen/AMDGPU/bf16.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -790,7 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
; GFX8-NEXT: v_and_b32_e32 v2, 0x80000000, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
Expand All @@ -806,7 +806,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: v_and_b32_e32 v2, 0x80000000, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1524,7 +1524,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
Expand Down Expand Up @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
Expand Down Expand Up @@ -1608,7 +1608,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
Expand All @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_brev_b32 s5, 1
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s0, s2, -4
; GFX10-NEXT: s_mov_b32 s1, s3
Expand Down Expand Up @@ -1673,7 +1673,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_brev_b32 s5, 1
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s2, -4
Expand Down Expand Up @@ -1744,7 +1744,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX900-NEXT: v_and_b32_e32 v4, 0x80000000, v1
; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
Expand Down Expand Up @@ -1786,7 +1786,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX908-NEXT: v_and_b32_e32 v4, 0x80000000, v1
; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
Expand Down Expand Up @@ -1828,7 +1828,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX90A-NEXT: v_and_b32_e32 v4, 0x80000000, v1
; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
Expand All @@ -1854,7 +1854,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_brev_b32 s5, 1
; GFX10-NEXT: s_mov_b32 s5, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s0, s2, -4
; GFX10-NEXT: s_mov_b32 s1, s3
Expand Down Expand Up @@ -1895,7 +1895,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_brev_b32 s5, 1
; GFX11-NEXT: s_mov_b32 s5, 0xff800000
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s2, -4
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
Expand All @@ -934,7 +934,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
Expand All @@ -956,7 +956,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
Expand All @@ -978,7 +978,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1413,7 +1413,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
Expand Down Expand Up @@ -1451,7 +1451,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v3
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
Expand Down Expand Up @@ -1560,7 +1560,7 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
; VI-NEXT: v_bfe_u32 v6, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; VI-NEXT: v_and_b32_e32 v7, 0x80000000, v4
; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
Expand Down Expand Up @@ -1597,7 +1597,7 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v4
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4262,20 +4262,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2
; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1
; GFX9-NEXT: v_and_b32_e32 v9, 0x80000000, v7
; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7
; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v1
; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2
; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1
; GFX9-NEXT: v_and_b32_e32 v14, 0x80000000, v8
; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8
; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2
; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
; GFX9-NEXT: v_and_b32_e32 v16, 0x80000000, v2
; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2
; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
Expand All @@ -4298,20 +4298,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2
; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_and_b32_e32 v6, 0x80000000, v1
; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v3
; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v2
; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2
; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
; GFX9-NEXT: v_and_b32_e32 v12, 0x80000000, v4
; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2
; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
Expand All @@ -4332,7 +4332,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_brev_b32 s2, 1
; GFX10-NEXT: s_mov_b32 s2, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
Expand Down Expand Up @@ -4416,7 +4416,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX11-NEXT: s_brev_b32 s0, 1
; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
Expand Down

0 comments on commit 9eff001

Please sign in to comment.