From e7e90dd1c1014b4a7ef77f74af3682168d23ddbf Mon Sep 17 00:00:00 2001 From: Brian Favela Date: Fri, 14 Jun 2024 13:14:19 -0400 Subject: [PATCH] [AMDGPU] Adding multiple use analysis to SIPeepholeSDWA (#94800) Allow for multiple uses of an operand where each instruction can be promoted to SDWA. For instance: ; v_and_b32 v2, lit(0x0000ffff), v2 ; v_and_b32 v3, 6, v2 ; v_and_b32 v2, 1, v2 Can be folded to: ; v_and_b32 v3, 6, sel_lo(v2) ; v_and_b32 v2, 1, sel_lo(v2) --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 68 ++- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 27 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 308 +++++----- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 359 ++++++------ .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 55 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 316 +++++----- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 314 +++++----- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 36 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 25 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 92 ++- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 86 ++- llvm/test/CodeGen/AMDGPU/idot4u.ll | 31 +- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 42 +- ...ne-sink-temporal-divergence-swdev407790.ll | 25 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 545 ++++++++---------- .../AMDGPU/reassoc-mul-add-1-to-mad.ll | 5 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 102 +++- llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 6 +- 18 files changed, 1250 insertions(+), 1192 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 1fadd8ce45b1f5..f47731bf6aac3f 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed, namespace { +bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, + const SIInstrInfo *TII); class SDWAOperand; class SDWADstOperand; -class SIPeepholeSDWA : public MachineFunctionPass { -public: - using SDWAOperandsVector = SmallVector; +using SDWAOperandsVector = SmallVector; +using SDWAOperandsMap = MapVector; +class SIPeepholeSDWA : public MachineFunctionPass { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; const SIInstrInfo *TII; MapVector> SDWAOperands; - MapVector PotentialMatches; + SDWAOperandsMap PotentialMatches; SmallVector ConvertedInstructions; std::optional foldToImm(const MachineOperand &Op) const; @@ -65,7 +67,6 @@ class SIPeepholeSDWA : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); @@ -93,7 +94,9 @@ class SDWAOperand { virtual ~SDWAOperand() = default; - virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) = 0; virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; MachineOperand *getTargetOperand() const { return Target; } @@ -126,7 +129,9 @@ class SDWASrcOperand : public SDWAOperand { : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getSrcSel() const { return SrcSel; } @@ -153,7 +158,9 @@ class SDWADstOperand : public SDWAOperand { SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getDstSel() const { return DstSel; } @@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, return Mods; } -MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { + if (PotentialMatches != nullptr) { + // Fill out the map for all uses if all can be converted + MachineOperand *Reg = getReplacedOperand(); + if (!Reg->isReg() || !Reg->isDef()) + return nullptr; + + for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) + // Check that all instructions that use Reg can be converted + if (!isConvertibleToSDWA(UseMI, ST, TII)) + return nullptr; + + // Now that it's guaranteed all uses are legal, iterate over the uses again + // to add them for later conversion. + for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + // Should not get a subregister here + assert(isSameReg(UseMO, *Reg)); + + SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; + MachineInstr *UseMI = UseMO.getParent(); + potentialMatchesMap[UseMI].push_back(this); + } + return nullptr; + } + // For SDWA src operand potential instruction is one that use register // defined by parent instruction MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); @@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return true; } -MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { // For SDWA dst operand potential instruction is one that defines register // that this operand uses MachineRegisterInfo *MRI = getMRI(); @@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } -bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, - const GCNSubtarget &ST) const { +namespace { +bool isConvertibleToSDWA(MachineInstr &MI, + const GCNSubtarget &ST, + const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); if (TII->isSDWA(Opc)) @@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, return true; } +} // namespace bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { @@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); if (PotentialMI && (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) @@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 02781e763f44a1..eb20178f9f4d88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -771,7 +771,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v6, 8 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -779,28 +780,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 9 +; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 ; VI-NEXT: v_add_u16_e32 v9, 9, v1 -; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v7, 9, v7 +; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: v_add_u16_e32 v8, 9, v8 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_or_b32_e32 v2, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 06930388901b0f..4df5fa18e2942d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -1271,46 +1271,45 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_not_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX8-NEXT: v_not_b32_e32 v6, v7 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1321,47 +1320,46 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_not_b32_e32 v7, v2 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v10, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v6 -; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX9-NEXT: v_not_b32_e32 v6, v7 -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1370,42 +1368,41 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_not_b32_e32 v9, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v8 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshrrev_b16 v9, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v12 +; GFX10-NEXT: v_mov_b32_e32 v12, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_not_b32_e32 v8, v11 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_e32 v13, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, 7 +; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_not_b32_e32 v8, v2 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v7, 7, v12 +; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 +; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10 +; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -3932,25 +3929,26 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX8-LABEL: v_fshl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 15 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i16: @@ -4083,27 +4081,28 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; ; GFX8-LABEL: v_fshl_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, -1 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_v2i16_ssv: @@ -4620,32 +4619,33 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-LABEL: v_fshl_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX8-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 15 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4984,42 +4984,42 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX8-LABEL: v_fshl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 15 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index ff93cddafc8728..61588e640be185 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -1272,46 +1272,45 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_not_b32_e32 v7, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 -; GFX8-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v6 -; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_not_b32_e32 v7, v7 +; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1322,47 +1321,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_not_b32_e32 v7, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 -; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_not_b32_e32 v7, v7 +; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xff +; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1372,52 +1370,51 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v10, v5 -; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_not_b32_e32 v14, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 -; GFX10-NEXT: v_not_b32_e32 v10, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, 7 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX10-NEXT: v_mov_b32_e32 v10, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 +; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9 +; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6 +; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11 +; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3718,29 +3715,29 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v6, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i16: @@ -3896,30 +3893,31 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 ; GFX8-NEXT: s_lshr_b32 s4, s3, 15 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, -1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_and_b32 s0, 0xffff, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_v2i16_ssv: @@ -4536,47 +4534,47 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-LABEL: v_fshr_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 15, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, 15 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v9 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v8 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5001,44 +4999,43 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v11, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v11, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, v11, v9 +; GFX8-NEXT: v_mov_b32_e32 v10, -1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, v11, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_xor_b32_e32 v9, -1, v10 -; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v10 -; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX8-NEXT: v_and_b32_sdwa v9, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v9, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v9, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v6, 15, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v7, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX8-NEXT: v_and_b32_sdwa v4, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 3ef059057ac8e3..41e915a4c1011b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -248,13 +248,12 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX8-LABEL: abs_vgpr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v1 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 +; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -340,17 +339,15 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX8-LABEL: abs_vgpr_v3i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 8, v2 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v2 -; GFX8-NEXT: v_max_i16_e32 v2, v2, v3 +; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -424,12 +421,12 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; ; GFX8-LABEL: abs_vgpr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v1 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v1, v0, v1 +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -503,14 +500,14 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; ; GFX8-LABEL: abs_vgpr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v5, 0, v1 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v3 -; GFX8-NEXT: v_max_i16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v4, 0, v1 +; GFX8-NEXT: v_max_i16_e32 v2, v0, v2 +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_max_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index a6f9bb7ee055d4..168e6dfa5f147d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -2774,22 +2774,22 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_max_i16_e32 v4, v4, v1 -; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 -; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 -; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 -; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i16: @@ -2987,23 +2987,23 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: saddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 -; GFX8-NEXT: v_max_i16_e32 v3, s0, v3 -; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v2, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v1, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 +; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 -; GFX8-NEXT: v_max_i16_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_max_i16_e32 v4, s1, v4 -; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_i16_e32 v2, s1, v2 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_v2i16_vs: @@ -3090,38 +3090,37 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_max_i16_e32 v5, v5, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 -; GFX8-NEXT: v_max_i16_e32 v7, v7, v2 -; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 +; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v7, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v6 +; GFX8-NEXT: v_max_i16_e32 v6, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 -; GFX8-NEXT: v_max_i16_e32 v7, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 -; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v8, 0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 -; GFX8-NEXT: v_max_i16_e32 v7, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_max_i16_e32 v8, v8, v3 -; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 -; GFX8-NEXT: v_min_i16_e32 v7, v8, v7 -; GFX8-NEXT: v_max_i16_e32 v8, 0, v5 -; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 -; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 -; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_max_i16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v7 +; GFX8-NEXT: v_add_u16_e32 v4, v0, v4 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e32 v2, v1, v6 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v4i16: @@ -3376,54 +3375,52 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 +; GFX8-NEXT: v_max_i16_e32 v8, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v4 +; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v8, v9, v8 +; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v10, 0, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v9 +; GFX8-NEXT: v_max_i16_e32 v9, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 -; GFX8-NEXT: v_max_i16_e32 v10, v10, v3 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v6 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 ; GFX8-NEXT: v_min_i16_e32 v9, v10, v9 -; GFX8-NEXT: v_max_i16_e32 v10, 0, v6 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 -; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v10 -; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 -; GFX8-NEXT: v_min_i16_e32 v12, 0, v7 -; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v7 -; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v5 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v8 -; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 -; GFX8-NEXT: v_max_i16_e32 v12, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 -; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 +; GFX8-NEXT: v_add_u16_e32 v6, v0, v6 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e32 v3, v1, v8 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_u16_e32 v3, v2, v9 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v6i16: @@ -3752,70 +3749,67 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v13, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v4 +; GFX8-NEXT: v_min_i16_e32 v8, v9, v8 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 +; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 +; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 +; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v11 +; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v6 +; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 +; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_max_i16_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v12 +; GFX8-NEXT: v_max_i16_e32 v12, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v7 ; GFX8-NEXT: v_min_i16_e32 v12, v13, v12 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 -; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v15, 0, v9 -; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v9 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v15, 0, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v14 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v6 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v10 -; GFX8-NEXT: v_min_i16_e32 v14, v15, v14 -; GFX8-NEXT: v_max_i16_e32 v15, 0, v10 -; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v15 -; GFX8-NEXT: v_max_i16_e32 v15, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_max_i16_e32 v16, v16, v7 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v11 -; GFX8-NEXT: v_min_i16_e32 v15, v16, v15 -; GFX8-NEXT: v_max_i16_e32 v16, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v17, 0x8000, v17 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v16, 0x7fff, v16 -; GFX8-NEXT: v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v16 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_i16_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v0, v8 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e32 v4, v1, v10 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_add_u16_e32 v4, v2, v11 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_add_u16_e32 v4, v3, v12 +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 320dfbb4980e4c..2572f8581f0edf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -2774,22 +2774,22 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 -; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 -; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_max_i16_e32 v2, v2, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, -1 +; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 -; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 +; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i16: @@ -2987,23 +2987,23 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 -; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 -; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 -; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 -; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, -1 +; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 +; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_v2i16_vs: @@ -3090,38 +3090,37 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_max_i16_e32 v4, v4, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 -; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v6, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_min_i16_e32 v7, -1, v1 ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 -; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 +; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 -; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 -; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 +; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_min_i16_e32 v8, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 -; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 -; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 -; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 -; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 -; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, v0, v4 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v2, v1, v6 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i16: @@ -3376,54 +3375,52 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, -1 +; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 +; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 +; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v9, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 ; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 -; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v2 ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 -; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v5 ; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 -; GFX8-NEXT: v_max_i16_e32 v10, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 -; GFX8-NEXT: v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 +; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 -; GFX8-NEXT: v_max_i16_e32 v10, v10, v4 -; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_min_i16_e32 v12, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v12 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 -; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 -; GFX8-NEXT: v_max_i16_e32 v12, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 -; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v13 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, v0, v6 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v3, v1, v8 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_sub_u16_e32 v3, v2, v9 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v6i16: @@ -3752,70 +3749,67 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 +; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 +; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 +; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v6 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 +; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v13 ; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v7 ; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 -; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_min_i16_e32 v15, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v2 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v15 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_min_i16_e32 v15, -1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v6 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 -; GFX8-NEXT: v_max_i16_e32 v15, -1, v10 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v10 -; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v15, -1, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v16 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v7 -; GFX8-NEXT: v_min_i16_e32 v15, v15, v16 -; GFX8-NEXT: v_max_i16_e32 v16, -1, v11 -; GFX8-NEXT: v_subrev_u16_e32 v16, 0x7fff, v16 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v11 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v17, 0x8000, v17 -; GFX8-NEXT: v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v17 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v8, v0, v8 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v4, v1, v10 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v7, v7, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_sub_u16_e32 v4, v2, v11 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_sub_u16_e32 v4, v3, v12 +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 028a28ed9a23b7..3f513e120e141b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1608,34 +1608,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_mov_b32 s2, s6 ; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: v_and_b32_e32 v6, 0xffffff00, v4 -; VI-NEXT: v_add_u16_e32 v4, 9, v4 +; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v4 +; VI-NEXT: v_add_u16_e32 v9, 9, v4 +; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_nop 0 -; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v5 -; VI-NEXT: v_add_u16_e32 v2, 9, v5 -; VI-NEXT: v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, 0x900 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1674,28 +1675,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_movk_i32 s4, 0xff00 +; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x900 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s5, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 -; GFX9-NEXT: v_add_u16_e32 v4, 9, v4 +; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v6 -; GFX9-NEXT: v_add_u16_e32 v2, 9, v6 -; GFX9-NEXT: v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v5, v0, s[2:3] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index e361aa4db2aa94..1b28ddb2c58620 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2135,19 +2135,18 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX8-LABEL: safe_math_fract_v2f16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x204 -; GFX8-NEXT: v_floor_f16_e32 v4, v3 -; GFX8-NEXT: v_floor_f16_e32 v5, v0 -; GFX8-NEXT: v_fract_f16_e32 v6, v3 -; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v3, v7 -; GFX8-NEXT: v_pack_b32_f16 v4, v5, v4 -; GFX8-NEXT: v_fract_f16_e32 v5, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, vcc -; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v0, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX8-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX8-NEXT: global_store_dword v[1:2], v4, off +; GFX8-NEXT: v_mov_b32_e32 v6, 0x204 +; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_floor_f16_e32 v4, v0 +; GFX8-NEXT: v_cmp_class_f16_sdwa s[4:5], v0, v6 src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_pack_b32_f16 v3, v4, v3 +; GFX8-NEXT: v_fract_f16_e32 v4, v0 +; GFX8-NEXT: v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX8-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX8-NEXT: global_store_dword v[1:2], v3, off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 3118d637880425..e8310e73f9a475 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -803,13 +803,13 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2 ; VI-LABEL: v_fshr_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 1 -; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v3, -1, v3 -; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, -1 +; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; VI-NEXT: v_xor_b32_e32 v4, -1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 @@ -887,13 +887,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; VI-LABEL: v_fshr_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; VI-NEXT: v_mov_b32_e32 v8, 1 -; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 -; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8 -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v7, 1 +; VI-NEXT: v_mov_b32_e32 v8, -1 +; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v7, v8, v7 +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_xor_b32_e32 v7, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 @@ -910,13 +910,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX9-LABEL: v_fshr_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, 1 -; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, -1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v8, v7 +; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 @@ -1019,18 +1019,18 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; VI-LABEL: v_fshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-NEXT: v_mov_b32_e32 v8, 1 -; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 -; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9 -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v7, -1, v7 -; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; VI-NEXT: v_mov_b32_e32 v7, 1 +; VI-NEXT: v_mov_b32_e32 v9, -1 +; VI-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v8, v10, v8 +; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_xor_b32_e32 v8, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 @@ -1040,7 +1040,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1049,18 +1048,18 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX9-LABEL: v_fshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, 1 -; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v9, -1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, v10, v8 +; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 @@ -1070,7 +1069,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index d4ff845e1edf3a..7ee31bf4dce7cd 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -637,6 +637,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -644,19 +645,18 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v3, v4, s[2:3] +; GFX9-NEXT: global_store_short v5, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -667,25 +667,25 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v6, v4, v1 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX10-NEXT: v_trunc_f32_e32 v6, v6 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v6, v0, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 -; GFX10-NEXT: global_store_short v3, v4, s[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v6, s0 +; GFX10-NEXT: global_store_short v5, v4, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -748,30 +748,28 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 +; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 1, v3 -; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 -; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v5, v3, s[2:3] +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: global_store_short v5, v4, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -782,26 +780,26 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: global_store_short v5, v3, s[2:3] +; GFX10-NEXT: v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX10-NEXT: global_store_short v5, v4, s[2:3] +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 9a1de74034cd83..0b131ea74f1abb 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -4713,29 +4713,24 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v3 -; GFX9-NODL-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GFX9-NODL-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v7, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v7, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v8, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v3, v7, s0, v9 -; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v4, v6 -; GFX9-NODL-NEXT: v_add3_u32 v0, v8, v3, v0 -; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v5, v1 +; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 +; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 +; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 +; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 ; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 14742c5827c1e4..b9fef0834cb245 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -183,11 +183,10 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-SDAG-NEXT: v_frexp_mant_f16_e32 v1, v0 -; GFX8-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v1, v2 +; GFX8-SDAG-NEXT: v_frexp_mant_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v1, v2 +; GFX8-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX8-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX8-SDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 @@ -197,11 +196,10 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v1 -; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v3, v0 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v3, v2 -; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v1, v1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v2, v1 +; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX9-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX9-SDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 @@ -246,27 +244,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0 -; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX8-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX8-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX8-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v3, v4 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0 -; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 -; GFX9-GISEL-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v0, v2 -; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v2, v2 -; GFX9-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v0 +; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a) ret { <2 x half>, <2 x i32> } %result diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 6672568b98a203..8861ee380be031 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -147,11 +147,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_add_i32 s48, s49, 1 ; CHECK-NEXT: s_add_i32 s5, s49, 5 ; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48 -; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: ds_read_u8 v56, v0 ; CHECK-NEXT: v_mov_b32_e32 v58, s48 ; CHECK-NEXT: s_mov_b32 s52, exec_lo -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v56, 0xff, v0 ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 @@ -175,10 +174,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46 ; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -200,9 +199,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -225,9 +224,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -250,9 +249,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -300,10 +299,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 -; CHECK-NEXT: s_mov_b32 s53, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s53, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_19 ; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 7ca9ae359a4992..352c1ecf8ece4a 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -697,18 +697,16 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p ; GFX9-LABEL: add: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v7, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v1, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u16_sdwa v3, v7, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -808,17 +806,16 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: v_and_b32_sdwa v1, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_or_b32_e32 v1, v0, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 @@ -868,23 +865,22 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v9, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9 -; GFX9-NEXT: v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -942,22 +938,20 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x10705 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 -; GFX9-NEXT: v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1391,22 +1385,20 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x2000504 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u16_e32 v2, v9, v4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, v4, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v4, v9 +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1503,67 +1495,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_bfe_i32 v0, v4, 0, 8 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 24, v9 -; GFX10-NEXT: v_bfe_i32 v3, v4, 8, 8 -; GFX10-NEXT: v_bfe_i32 v1, v9, 16, 8 -; GFX10-NEXT: v_bfe_i32 v10, v4, 16, 8 -; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 24, v4 -; GFX10-NEXT: v_xor_b32_e32 v15, v2, v3 -; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX10-NEXT: v_xor_b32_e32 v12, v1, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v13 -; GFX10-NEXT: v_cvt_f32_i32_e32 v14, v1 -; GFX10-NEXT: v_xor_b32_e32 v1, v1, v10 -; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v3 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v10 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX10-NEXT: v_mul_f32_e32 v16, v14, v16 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 30, v15 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 +; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 +; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 +; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_trunc_f32_e32 v15, v15 ; GFX10-NEXT: v_trunc_f32_e32 v16, v16 -; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_or_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_mul_f32_e32 v18, v14, v18 +; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_mad_f32 v20, -v16, v13, v14 -; GFX10-NEXT: v_mul_f32_e32 v19, v13, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2 +; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 +; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mad_f32 v2, -v17, v3, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13| -; GFX10-NEXT: v_trunc_f32_e32 v19, v19 -; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_mad_f32 v14, -v18, v10, v14 -; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3| -; GFX10-NEXT: v_mad_f32 v21, -v19, v11, v13 +; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| +; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 +; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| +; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 ; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v15, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10| -; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v16, v12 -; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11| -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX10-NEXT: v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| +; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0 +; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo +; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1581,67 +1567,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x60706 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 -; GFX9-NEXT: v_bfe_i32 v2, v9, 16, 8 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 24, v9 -; GFX9-NEXT: v_bfe_i32 v9, v4, 8, 8 -; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v1 -; GFX9-NEXT: v_bfe_i32 v10, v4, 16, 8 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 24, v4 -; GFX9-NEXT: v_xor_b32_e32 v14, v3, v9 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v9 -; GFX9-NEXT: v_xor_b32_e32 v11, v2, v1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v13, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v10 +; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 -; GFX9-NEXT: v_mul_f32_e32 v15, v13, v15 -; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 +; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v11 -; GFX9-NEXT: v_mul_f32_e32 v17, v13, v17 -; GFX9-NEXT: v_mul_f32_e32 v18, v12, v18 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v19, -v15, v12, v13 -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX9-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_mad_f32 v3, -v16, v9, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v12| -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2| +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 ; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v13, -v17, v10, v13 +; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v12 +; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9| -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v10| -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v15, v11 -; GFX9-NEXT: v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v2, v17, v2 -; GFX9-NEXT: v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| +; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 +; GFX9-NEXT: v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v1, off ; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1876,73 +1856,67 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_bfe_i32 v1, v4, 0, 8 -; GFX10-NEXT: v_bfe_i32 v2, v4, 16, 8 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v10, 24, v9 -; GFX10-NEXT: v_bfe_i32 v11, v4, 8, 8 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 24, v4 -; GFX10-NEXT: v_bfe_i32 v13, v9, 16, 8 -; GFX10-NEXT: v_xor_b32_e32 v14, v2, v1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: v_xor_b32_e32 v16, v10, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v15, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v17, v12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 +; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 -; GFX10-NEXT: v_xor_b32_e32 v2, v12, v2 -; GFX10-NEXT: v_xor_b32_e32 v12, v13, v12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v21, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v13 -; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 -; GFX10-NEXT: v_mul_f32_e32 v18, v15, v18 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX10-NEXT: v_mul_f32_e32 v19, v10, v19 -; GFX10-NEXT: v_mul_f32_e32 v20, v17, v20 -; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17 +; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18 +; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_trunc_f32_e32 v17, v17 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mul_f32_e32 v21, v13, v21 +; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20 ; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3 +; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12 +; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 ; GFX10-NEXT: v_trunc_f32_e32 v20, v20 +; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2| +; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 +; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21 +; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13| ; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 -; GFX10-NEXT: v_mad_f32 v22, -v18, v1, v15 -; GFX10-NEXT: v_trunc_f32_e32 v21, v21 -; GFX10-NEXT: v_mad_f32 v10, -v19, v11, v10 -; GFX10-NEXT: v_mad_f32 v23, -v20, v15, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1| -; GFX10-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_mad_f32 v13, -v21, v17, v13 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v14, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v10|, |v11| ; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 ; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 -; GFX10-NEXT: v_cvt_i32_f32_e32 v21, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3| ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v15| -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v19, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v13|, |v17| -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX10-NEXT: v_mul_lo_u32 v3, v10, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v20, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v21, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15| +; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1965,74 +1939,68 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x2070306 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 8 -; GFX9-NEXT: v_bfe_i32 v3, v4, 16, 8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 24, v9 -; GFX9-NEXT: v_bfe_i32 v12, v4, 8, 8 -; GFX9-NEXT: v_xor_b32_e32 v16, v3, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 24, v4 -; GFX9-NEXT: v_xor_b32_e32 v18, v11, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v17, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v19, v13 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v2 -; GFX9-NEXT: v_bfe_i32 v15, v9, 16, 8 -; GFX9-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v12 -; GFX9-NEXT: v_xor_b32_e32 v3, v13, v3 -; GFX9-NEXT: v_xor_b32_e32 v13, v15, v13 -; GFX9-NEXT: v_cvt_f32_i32_e32 v15, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v17 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v19 -; GFX9-NEXT: v_mul_f32_e32 v20, v17, v20 -; GFX9-NEXT: v_mul_f32_e32 v21, v11, v21 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v14 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v10 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v16 +; GFX9-NEXT: v_mul_f32_e32 v20, v10, v20 +; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_f32_e32 v21, v13, v21 ; GFX9-NEXT: v_trunc_f32_e32 v20, v20 -; GFX9-NEXT: v_ashrrev_i32_e32 v16, 30, v16 -; GFX9-NEXT: v_mul_f32_e32 v22, v19, v22 -; GFX9-NEXT: v_mul_f32_e32 v23, v15, v23 +; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX9-NEXT: v_mul_f32_e32 v22, v16, v22 +; GFX9-NEXT: v_mul_f32_e32 v23, v19, v23 ; GFX9-NEXT: v_trunc_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v24, -v20, v2, v17 -; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 -; GFX9-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_mad_f32 v24, -v20, v3, v10 +; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12 +; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v22, v22 ; GFX9-NEXT: v_trunc_f32_e32 v23, v23 -; GFX9-NEXT: v_mad_f32 v11, -v21, v12, v11 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v2| -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v3 -; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_mad_f32 v13, -v21, v14, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v3| +; GFX9-NEXT: v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15 +; GFX9-NEXT: v_or_b32_e32 v12, 1, v12 ; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20 ; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v25, -v22, v17, v19 +; GFX9-NEXT: v_mad_f32 v25, -v22, v10, v16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22 -; GFX9-NEXT: v_mad_f32 v15, -v23, v19, v15 +; GFX9-NEXT: v_mad_f32 v19, -v23, v16, v19 ; GFX9-NEXT: v_cvt_i32_f32_e32 v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 30, v13 -; GFX9-NEXT: v_or_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v18, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v17| -; GFX9-NEXT: v_or_b32_e32 v13, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, |v19| -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| +; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 +; GFX9-NEXT: v_or_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v10| +; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v15, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16| +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v18, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4 ; GFX9-NEXT: v_add_u32_e32 v2, v20, v2 -; GFX9-NEXT: v_add_u32_e32 v11, v21, v11 -; GFX9-NEXT: v_add_u32_e32 v3, v22, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v21, v3 +; GFX9-NEXT: v_add_u32_e32 v10, v22, v10 ; GFX9-NEXT: v_add_u32_e32 v12, v23, v12 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v14 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v11 +; GFX9-NEXT: v_mul_lo_u32 v4, v10, v0 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v17 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_e32 v3, v14, v3 +; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v3, v17, v4 ; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2090,27 +2058,24 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x6070007 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX9-NEXT: v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v4, v2, v0, s4 -; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_sub_u16_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u16_e32 v1, v3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 +; GFX9-NEXT: v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index b62d6ee59a8545..24e420b7d657bf 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -1838,10 +1838,9 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: v_mul_sub_x_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_lo_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index c9dbadcbd23157..0f2eedb1923d63 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1718,18 +1718,16 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: flat_load_dword v3, v[0:1] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: v_mov_b32_e32 v1, s5 -; GFX89-NEXT: s_waitcnt vmcnt(1) -; GFX89-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_mul_lo_u16_e32 v5, v3, v2 -; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-NEXT: v_mul_lo_u16_e32 v2, v5, v2 -; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2 +; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2 +; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm ; @@ -2205,6 +2203,94 @@ bb2: br label %bb0 } +define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mac_v2half_same_srcop: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: flat_load_dword v3, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; NOSDWA-NEXT: v_mac_f16_e32 v5, v4, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; NOSDWA-NEXT: v_mac_f16_e32 v3, v2, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v4 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; GFX89-LABEL: mac_v2half_same_srcop: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v1, s7 +; GFX89-NEXT: v_mov_b32_e32 v2, s0 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: flat_load_dword v4, v[0:1] +; GFX89-NEXT: flat_load_dword v2, v[2:3] +; GFX89-NEXT: v_mov_b32_e32 v0, s4 +; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: s_waitcnt vmcnt(1) +; GFX89-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mac_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX89-NEXT: v_mac_f16_e32 v4, v2, v2 +; GFX89-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX89-NEXT: flat_store_dword v[0:1], v2 +; GFX89-NEXT: s_endpgm +; +; GFX9-LABEL: mac_v2half_same_srcop: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mac_v2half_same_srcop: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +entry: + %a = load <2 x half>, ptr addrspace(1) %ina, align 4 + %b = load <2 x half>, ptr addrspace(1) %inb, align 4 + %mul = fmul <2 x half> %b, %b + %mac = fadd <2 x half> %mul, %a + store <2 x half> %mac, ptr addrspace(1) %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index d7a6be51106917..f8c9827ecf7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -32,12 +32,12 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] +; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, ; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, [[ZERO]], v{{[0-9]+}} ; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32