From a6e46bf0f0432d01f184ac9a2860719e89f564e4 Mon Sep 17 00:00:00 2001 From: Brian Favela Date: Tue, 10 Aug 2021 22:29:32 +0000 Subject: [PATCH 1/6] Enhance SDWA peephole phase to look at all uses of an inst to evaluate if it can be folded ; v_and_b32 v2, lit(0x0000ffff), v2 ; v_and_b32 v3, 6, v2 ; v_and_b32 v2, 1, v2 Can be folded to: ; v_and_b32 v3, 6, sel_lo(v2) ; v_and_b32 v2, 1, sel_lo(v2) --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 81 ++++++++++++++++++----- 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 1fadd8ce45b1f59..43348a0f68b137b 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -37,20 +37,24 @@ STATISTIC(NumSDWAInstructionsPeepholed, namespace { +bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST, + const SIInstrInfo *TII); class SDWAOperand; class SDWADstOperand; -class SIPeepholeSDWA : public MachineFunctionPass { -public: - using SDWAOperandsVector = SmallVector; +using SDWAOperandsVector = SmallVector; + +// helper typedef to make code cleaner +typedef std::unordered_map SDWAOperandsMap; +class SIPeepholeSDWA : public MachineFunctionPass { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; const SIInstrInfo *TII; - MapVector> SDWAOperands; - MapVector PotentialMatches; + std::unordered_map> SDWAOperands; + SDWAOperandsMap PotentialMatches; SmallVector ConvertedInstructions; std::optional foldToImm(const MachineOperand &Op) const; @@ -65,7 +69,6 @@ class SIPeepholeSDWA : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; void pseudoOpConvertToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); @@ -93,7 +96,9 @@ class SDWAOperand { virtual ~SDWAOperand() = default; - virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; + virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) = 0; virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; MachineOperand *getTargetOperand() const { return Target; } @@ -126,7 +131,9 @@ class SDWASrcOperand : public SDWAOperand { : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getSrcSel() const { return SrcSel; } @@ -153,7 +160,9 @@ class SDWADstOperand : public SDWAOperand { SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} - MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches = nullptr) override; bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getDstSel() const { return DstSel; } @@ -327,7 +336,42 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, return Mods; } -MachineInstr 
*SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { + // If PotentialMatches is not null, then fill out the map for all uses, + // if all can be converted + if (PotentialMatches != nullptr) { + MachineOperand *Reg = getReplacedOperand(); + if (!Reg->isReg() || !Reg->isDef()) { + return nullptr; + } + + for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + // If there exist use of subreg of Reg then return nullptr + if (!isSameReg(UseMO, *Reg)) + return nullptr; + + // Check that all instructions the use Reg can be converted + if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) { + return nullptr; + } + + // Not handling the obscure case where the same use is in multiple operands + if (PotentialMatches->find(UseMO.getParent()) != PotentialMatches->end()) { + return nullptr; + } + } + // Now that it's guaranteed all uses are legal, iterate over the uses again + // to add them for later conversion. + for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + SDWAOperandsMap& potentialMatchesMap = *PotentialMatches; + MachineInstr* UseMI = UseMO.getParent(); + potentialMatchesMap[UseMI].push_back(this); + } + return nullptr; + } + // For SDWA src operand potential instruction is one that use register // defined by parent instruction MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); @@ -420,7 +464,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return true; } -MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { +MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII, + const GCNSubtarget &ST, + SDWAOperandsMap *PotentialMatches) { // For SDWA dst operand potential instruction is one that defines register // that this operand uses MachineRegisterInfo *MRI = getMRI(); @@ -919,8 +965,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI); } -bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, - const GCNSubtarget &ST) const { +namespace { +bool isConvertibleToSDWA(MachineInstr &MI, + const GCNSubtarget &ST, + const SIInstrInfo* TII) { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); if (TII->isSDWA(Opc)) @@ -980,6 +1028,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, return true; } +} // namespace bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { @@ -1215,7 +1264,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST); if (PotentialMI && (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 || PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64)) @@ -1228,8 +1277,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { for (const auto &OperandPair : SDWAOperands) { const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches); + if (PotentialMI && 
isConvertibleToSDWA(*PotentialMI, ST, TII)) { PotentialMatches[PotentialMI].push_back(Operand.get()); } } From 4b1a12930e9bfc8556488510291e91f1163a0b64 Mon Sep 17 00:00:00 2001 From: Brian Favela Date: Thu, 6 Jun 2024 14:23:49 -0400 Subject: [PATCH 2/6] First round of test fixes --- .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 27 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 308 +++++----- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 359 ++++++------ .../CodeGen/AMDGPU/GlobalISel/llvm.abs.ll | 55 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 325 ++++++----- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 323 +++++------ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 36 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 25 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 92 ++- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 86 ++- llvm/test/CodeGen/AMDGPU/idot4u.ll | 31 +- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 24 +- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 42 +- ...ne-sink-temporal-divergence-swdev407790.ll | 25 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 545 ++++++++---------- .../AMDGPU/reassoc-mul-add-1-to-mad.ll | 5 +- .../CodeGen/AMDGPU/sdwa-peephole-instr.mir | 6 + llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 17 +- update.bat | 21 + 19 files changed, 1160 insertions(+), 1192 deletions(-) create mode 100644 update.bat diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 02781e763f44a18..eb20178f9f4d881 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -771,7 +771,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v6, 8 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -779,28 +780,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 9 +; VI-NEXT: v_mov_b32_e32 v2, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 ; VI-NEXT: v_add_u16_e32 v9, 9, v1 -; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v7, 9, v7 +; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: v_add_u16_e32 v8, 9, v8 -; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_and_b32_e32 v1, 0xff, v8 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_or_b32_e32 v2, v0, v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 06930388901b0fe..4df5fa18e2942d4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -1271,46 +1271,45 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_not_b32_e32 v7, v2 +; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v11 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v6 -; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX8-NEXT: v_not_b32_e32 v6, v7 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1321,47 +1320,46 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_not_b32_e32 v7, v2 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v10, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xff ; GFX9-NEXT: 
v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v6 -; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6 -; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX9-NEXT: v_not_b32_e32 v6, v7 -; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1370,42 +1368,41 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_not_b32_e32 v9, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v8 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshrrev_b16 
v9, 1, v11 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v12 +; GFX10-NEXT: v_mov_b32_e32 v12, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 ; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_not_b32_e32 v8, v11 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_e32 v13, v2 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, 7 +; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_not_b32_e32 v8, v2 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 -; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v7, 7, v12 +; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 +; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10 +; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -3932,25 +3929,26 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX8-LABEL: v_fshl_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v5 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 15 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 1 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i16: @@ -4083,27 +4081,28 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; ; GFX8-LABEL: v_fshl_v2i16_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, -1 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_v2i16_ssv: @@ -4620,32 +4619,33 @@ define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-LABEL: v_fshl_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX8-NEXT: v_or_b32_e32 v4, v7, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 15 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, 
v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v7, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v5, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -4984,42 +4984,42 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX8-LABEL: v_fshl_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_or_b32_e32 v4, v8, v4 -; GFX8-NEXT: v_and_b32_e32 v8, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 15 +; GFX8-NEXT: v_and_b32_sdwa v8, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v10, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 -; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v10 +; GFX8-NEXT: 
v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_sdwa v4, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index ff93cddafc8728c..61588e640be1856 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -1272,46 +1272,45 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_not_b32_e32 v7, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 -; GFX8-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX8-NEXT: v_not_b32_e32 v5, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX8-NEXT: v_not_b32_e32 v5, v6 -; GFX8-NEXT: v_mov_b32_e32 v6, 1 -; GFX8-NEXT: v_mov_b32_e32 v9, 0xff -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b16_e32 
v4, v4, v8 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX8-NEXT: v_not_b32_e32 v7, v7 +; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5 +; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -1322,47 +1321,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_not_b32_e32 v7, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 -; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 -; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 ; GFX9-NEXT: v_not_b32_e32 v5, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 -; GFX9-NEXT: v_not_b32_e32 v5, v6 -; GFX9-NEXT: v_mov_b32_e32 v6, 1 -; GFX9-NEXT: v_mov_b32_e32 v9, 0xff -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 -; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 -; GFX9-NEXT: v_not_b32_e32 v7, v7 +; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xff +; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1372,52 +1370,51 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX10-NEXT: v_not_b32_e32 v10, v5 -; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; 
GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: v_not_b32_e32 v14, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 -; GFX10-NEXT: v_not_b32_e32 v10, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, 7 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX10-NEXT: v_mov_b32_e32 v10, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 -; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 -; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 +; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9 +; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6 +; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11 +; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13 +; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 -; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -3718,29 +3715,29 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX8-NEXT: v_mov_b32_e32 v4, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_sdwa v4, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v6, v3 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i16: @@ -3896,30 +3893,31 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s1 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_lshrrev_b16_e64 v2, v2, s0 ; GFX8-NEXT: s_lshr_b32 s4, s3, 15 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 15 +; GFX8-NEXT: v_mov_b32_e32 v3, -1 ; GFX8-NEXT: s_lshl_b32 s2, s2, 1 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_and_b32 s0, 0xffff, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: v_and_b32_e32 v1, 
15, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX8-NEXT: s_lshr_b32 s0, s0, 1 ; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_v2i16_ssv: @@ -4536,47 +4534,47 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) ; GFX8-LABEL: v_fshr_v3i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e32 v7, 1, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX8-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX8-NEXT: v_mov_b32_e32 v8, 1 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v9, 15, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 15, v2 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, 15 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v9 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_and_b32_sdwa v7, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v8, -1 +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v6 -; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, -1, v8 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v4 -; GFX8-NEXT: 
v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, v4, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 15, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -5001,44 +4999,43 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v11, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v11, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 ; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, v11, v9 +; GFX8-NEXT: v_mov_b32_e32 v10, -1 ; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshlrev_b16_e32 v6, v11, v6 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 -; GFX8-NEXT: v_xor_b32_e32 v9, -1, v10 -; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v10 -; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX8-NEXT: v_and_b32_sdwa v9, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 ; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v9, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v9, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 1, v1 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v7, 15, v5 -; GFX8-NEXT: 
v_xor_b32_e32 v5, -1, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX8-NEXT: v_or_b32_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v6, 15, v5 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v7 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v4 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v7, v2 -; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v4 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_and_b32_e32 v4, 15, v6 +; GFX8-NEXT: v_and_b32_sdwa v4, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 ; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v4, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll index 3ef059057ac8e30..41e915a4c1011b4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll @@ -248,13 +248,12 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) { ; GFX8-LABEL: abs_vgpr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v1 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 +; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -340,17 +339,15 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) { ; GFX8-LABEL: abs_vgpr_v3i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 8, v2 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v2 -; GFX8-NEXT: v_max_i16_e32 v2, v2, v3 +; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2 @@ -424,12 +421,12 @@ define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) { ; ; GFX8-LABEL: abs_vgpr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v1 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v1, v0, v1 +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -503,14 +500,14 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) { ; ; GFX8-LABEL: abs_vgpr_v3i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v5, 0, v1 -; GFX8-NEXT: v_max_i16_e32 v0, v0, v3 -; GFX8-NEXT: v_max_i16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_max_i16_e32 v1, v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0 +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v4, 0, v1 +; GFX8-NEXT: v_max_i16_e32 v2, v0, v2 +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_max_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index a6f9bb7ee055d41..04da7b24156dcd3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -298,8 +298,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -606,10 +607,12 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 8, v2 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: 
v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 @@ -2774,22 +2777,22 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v4, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_max_i16_e32 v4, v4, v1 -; GFX8-NEXT: v_min_i16_e32 v5, 0, v2 -; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 -; GFX8-NEXT: v_max_i16_e32 v4, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 -; GFX8-NEXT: v_max_i16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i16: @@ -2987,23 +2990,23 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: saddsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_min_i16_e32 v3, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v2, 0, v0 -; GFX8-NEXT: v_sub_u16_e32 v3, 0x8000, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, 0x7fff, v2 -; GFX8-NEXT: v_max_i16_e32 v3, s0, v3 -; GFX8-NEXT: v_min_i16_e32 v4, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v2, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v1, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_sub_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 +; GFX8-NEXT: v_min_i16_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 -; GFX8-NEXT: v_max_i16_e32 v3, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_sub_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_sub_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_max_i16_e32 v4, s1, v4 -; GFX8-NEXT: v_min_i16_e32 v3, v4, v3 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_i16_e32 v2, s1, v2 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: saddsat_v2i16_vs: @@ -3090,38 +3093,37 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v5, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v4, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_max_i16_e32 v5, v5, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v5, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v7, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v6 +; GFX8-NEXT: v_max_i16_e32 v6, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 -; GFX8-NEXT: v_max_i16_e32 v7, v7, v2 -; GFX8-NEXT: v_min_i16_e32 v8, 0, v4 +; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 -; GFX8-NEXT: v_max_i16_e32 v7, 0, v4 -; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 -; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v8, 0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 -; GFX8-NEXT: v_max_i16_e32 v7, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_sub_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_max_i16_e32 v8, v8, v3 -; GFX8-NEXT: v_min_i16_e32 v9, 0, v5 -; GFX8-NEXT: v_min_i16_e32 v7, v8, v7 -; GFX8-NEXT: v_max_i16_e32 v8, 0, v5 -; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 -; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 -; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_add_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_max_i16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v7 +; GFX8-NEXT: v_add_u16_e32 v4, v0, v4 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e32 v2, v1, v6 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v4i16: @@ -3376,54 +3378,52 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v10, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v9, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v7, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v6, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v8 +; GFX8-NEXT: v_max_i16_e32 v8, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v4 +; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v8, v9, v8 +; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v10, 0, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v9 +; GFX8-NEXT: v_max_i16_e32 v9, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_sub_u16_e32 v9, 0x7fff, v9 -; GFX8-NEXT: v_max_i16_e32 v10, v10, v3 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v6 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 ; GFX8-NEXT: v_min_i16_e32 v9, v10, v9 -; GFX8-NEXT: v_max_i16_e32 v10, 0, v6 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 -; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_max_i16_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v10 -; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v4 -; GFX8-NEXT: v_min_i16_e32 v12, 0, v7 -; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v7 -; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_max_i16_sdwa v4, v12, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 
-; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 -; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v5 -; GFX8-NEXT: v_min_i16_e32 v13, 0, v8 -; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 -; GFX8-NEXT: v_max_i16_e32 v12, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 -; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_max_i16_sdwa v5, v13, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_add_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v10 -; GFX8-NEXT: v_add_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_add_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v10 +; GFX8-NEXT: v_add_u16_e32 v6, v0, v6 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e32 v3, v1, v8 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_add_u16_e32 v3, v2, v9 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v6i16: @@ -3752,70 +3752,67 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_saddsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_min_i16_e32 v13, 0, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX8-NEXT: v_max_i16_e32 v12, 0, v0 +; GFX8-NEXT: v_min_i16_e32 v9, 0, v0 +; GFX8-NEXT: v_max_i16_e32 v8, 0, v0 +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_sub_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v4 +; GFX8-NEXT: v_min_i16_e32 v8, v9, v8 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v11, 0, v1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 +; GFX8-NEXT: v_max_i16_e32 v10, 0, v1 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_sub_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 +; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v10, v11, v10 +; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v12, 0, v2 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v11 +; GFX8-NEXT: v_max_i16_e32 v11, 0, v2 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_sub_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v6 +; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v11, v12, v11 +; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_max_i16_sdwa v6, v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v13, 0, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v12 +; GFX8-NEXT: v_max_i16_e32 v12, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_sub_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v4 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v8 +; GFX8-NEXT: v_max_i16_e32 v13, v13, v7 ; GFX8-NEXT: v_min_i16_e32 v12, v13, v12 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v8 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 -; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_max_i16_sdwa v4, v14, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v14, 0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v13 -; GFX8-NEXT: v_max_i16_e32 v13, 0, v1 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x8000, v14 +; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_sub_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v5 -; GFX8-NEXT: v_min_i16_e32 v15, 0, v9 -; GFX8-NEXT: v_min_i16_e32 v13, v14, v13 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v9 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_max_i16_sdwa v5, v15, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v15, 0, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v14 -; GFX8-NEXT: v_max_i16_e32 v14, 0, v2 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_sub_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v6 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v10 -; GFX8-NEXT: v_min_i16_e32 v14, v15, v14 -; GFX8-NEXT: v_max_i16_e32 v15, 0, v10 -; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_max_i16_sdwa v6, v16, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v16, 0, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v15 -; GFX8-NEXT: v_max_i16_e32 v15, 0, v3 -; GFX8-NEXT: v_sub_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_sub_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_max_i16_e32 v16, v16, v7 -; GFX8-NEXT: v_min_i16_e32 v17, 0, v11 -; GFX8-NEXT: v_min_i16_e32 v15, v16, v15 -; GFX8-NEXT: v_max_i16_e32 v16, 0, v11 -; GFX8-NEXT: v_sub_u16_e32 v17, 0x8000, v17 -; GFX8-NEXT: v_add_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_add_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_sub_u16_e32 v16, 0x7fff, v16 -; GFX8-NEXT: v_max_i16_sdwa v7, v17, v7 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_add_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_add_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v16 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_add_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_add_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_max_i16_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_add_u16_e32 v8, v0, v8 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_add_u16_e32 v4, v1, v10 +; GFX8-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v7, v7, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_add_u16_e32 v4, v2, v11 +; GFX8-NEXT: v_add_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_add_u16_e32 v4, v3, v12 +; GFX8-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 320dfbb4980e4c1..d7b1e37a81a1b97 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -279,9 +279,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -298,8 +300,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_mov_b32_e32 v1, 0xff -; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2774,22 +2777,22 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_min_i16_e32 v4, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 
16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 -; GFX8-NEXT: v_max_i16_e32 v3, v3, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 -; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 +; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_max_i16_e32 v2, v2, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, -1 +; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 -; GFX8-NEXT: v_min_i16_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 +; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i16: @@ -2987,23 +2990,23 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 -; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 -; GFX8-NEXT: v_max_i16_e32 v2, s0, v2 -; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 -; GFX8-NEXT: v_max_i16_e32 v3, -1, v1 +; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 +; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, -1 +; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 -; GFX8-NEXT: v_min_i16_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v4 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 +; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ssubsat_v2i16_vs: @@ -3090,38 +3093,37 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v4i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: 
v_min_i16_e32 v5, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_max_i16_e32 v4, v4, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v5, -1 +; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 -; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v6, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_min_i16_e32 v7, -1, v1 ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 -; GFX8-NEXT: v_max_i16_e32 v6, v6, v2 +; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 -; GFX8-NEXT: v_max_i16_e32 v7, -1, v4 +; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_min_i16_e32 v8, -1, v4 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 -; GFX8-NEXT: v_max_i16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v7, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v2, v2, v8 -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 -; GFX8-NEXT: v_min_i16_e32 v8, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 -; GFX8-NEXT: v_max_i16_e32 v7, v7, v3 -; GFX8-NEXT: v_min_i16_e32 v7, v7, v8 -; GFX8-NEXT: v_max_i16_e32 v8, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 -; GFX8-NEXT: v_min_i16_e32 v9, -1, v5 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 -; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v6 -; GFX8-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v7 -; GFX8-NEXT: v_sub_u16_sdwa v2, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v5 +; GFX8-NEXT: v_sub_u16_e32 v4, v0, v4 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v2, v1, v6 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i16: @@ -3376,54 +3378,52 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v6i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: 
v_max_i16_e32 v6, v6, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, -1 +; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 +; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 +; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 -; GFX8-NEXT: v_min_i16_e32 v10, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 -; GFX8-NEXT: v_max_i16_e32 v9, v9, v3 +; GFX8-NEXT: v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v9, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_min_i16_e32 v10, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_max_i16_e32 v9, v9, v5 ; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 -; GFX8-NEXT: v_max_i16_e32 v10, -1, v6 +; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v6 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 -; GFX8-NEXT: v_max_i16_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v3, v3, v11 -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 -; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 -; GFX8-NEXT: v_max_i16_e32 v10, v10, v4 -; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_min_i16_e32 v12, -1, v7 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_max_i16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v12 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 -; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 -; GFX8-NEXT: v_max_i16_e32 v11, v11, v5 -; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 -; GFX8-NEXT: v_max_i16_e32 v12, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 -; GFX8-NEXT: v_max_i16_sdwa v5, v12, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v9 -; GFX8-NEXT: v_sub_u16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v5, v5, v13 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v10 -; GFX8-NEXT: 
v_sub_u16_sdwa v3, v7, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v11 -; GFX8-NEXT: v_sub_u16_sdwa v3, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v7 +; GFX8-NEXT: v_sub_u16_e32 v6, v0, v6 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v3, v1, v8 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_sub_u16_e32 v3, v2, v9 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v6i16: @@ -3752,70 +3752,67 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8-LABEL: v_ssubsat_v8i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_i16_e32 v12, -1, v0 +; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 +; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 +; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 +; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 +; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 +; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 +; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 +; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 +; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_max_i16_e32 v11, v11, v6 +; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 +; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_max_i16_sdwa v6, v12, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_max_i16_e32 v12, -1, v3 +; GFX8-NEXT: v_min_i16_e32 v6, v6, v13 ; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 -; GFX8-NEXT: v_min_i16_e32 v13, -1, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX8-NEXT: v_min_i16_e32 v13, -1, v3 ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 -; GFX8-NEXT: v_max_i16_e32 v12, v12, v4 +; GFX8-NEXT: v_max_i16_e32 v12, v12, v7 ; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v8 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 -; GFX8-NEXT: v_max_i16_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v13, -1, v1 -; GFX8-NEXT: v_min_i16_e32 v4, v4, v14 +; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 -; GFX8-NEXT: v_min_i16_e32 v14, -1, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x8000, v14 -; GFX8-NEXT: v_max_i16_e32 v13, v13, v5 -; GFX8-NEXT: v_min_i16_e32 v13, v13, v14 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_min_i16_e32 v15, -1, v9 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_max_i16_sdwa v5, v14, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v14, -1, v2 -; GFX8-NEXT: v_min_i16_e32 v5, v5, v15 -; GFX8-NEXT: v_subrev_u16_e32 v14, 0x7fff, v14 -; GFX8-NEXT: v_min_i16_e32 v15, -1, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x8000, v15 -; GFX8-NEXT: v_max_i16_e32 v14, v14, v6 -; GFX8-NEXT: v_min_i16_e32 v14, v14, v15 -; GFX8-NEXT: v_max_i16_e32 v15, -1, v10 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v10 -; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_max_i16_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_max_i16_e32 v15, -1, v3 -; GFX8-NEXT: v_min_i16_e32 v6, v6, v16 -; GFX8-NEXT: v_subrev_u16_e32 v15, 0x7fff, v15 -; GFX8-NEXT: v_min_i16_e32 v16, -1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX8-NEXT: v_subrev_u16_e32 v16, 0x8000, v16 -; GFX8-NEXT: v_max_i16_e32 v15, v15, v7 -; GFX8-NEXT: v_min_i16_e32 v15, v15, v16 -; GFX8-NEXT: v_max_i16_e32 v16, -1, v11 -; GFX8-NEXT: v_subrev_u16_e32 v16, 0x7fff, v16 -; GFX8-NEXT: v_min_i16_e32 v17, -1, v11 -; GFX8-NEXT: v_sub_u16_e32 v0, v0, v12 -; GFX8-NEXT: v_sub_u16_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v17, 0x8000, v17 -; GFX8-NEXT: v_max_i16_sdwa v7, v16, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX8-NEXT: v_sub_u16_e32 v1, v1, v13 -; GFX8-NEXT: v_sub_u16_sdwa v4, v9, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_i16_e32 v7, v7, v17 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_sub_u16_e32 v2, v2, v14 -; GFX8-NEXT: v_sub_u16_sdwa v4, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX8-NEXT: v_sub_u16_e32 v3, v3, v15 -; GFX8-NEXT: v_sub_u16_sdwa v4, v11, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; 
GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_sub_u16_e32 v8, v0, v8 +; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_e32 v4, v1, v10 +; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_i16_e32 v7, v7, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_sub_u16_e32 v4, v2, v11 +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_sub_u16_e32 v4, v3, v12 +; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i16: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 028a28ed9a23b75..3f513e120e141b9 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1608,34 +1608,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: v_mov_b32_e32 v7, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_mov_b32 s2, s6 ; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: v_and_b32_e32 v6, 0xffffff00, v4 -; VI-NEXT: v_add_u16_e32 v4, 9, v4 +; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v4 +; VI-NEXT: v_add_u16_e32 v9, 9, v4 +; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_nop 0 -; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v5 -; VI-NEXT: v_add_u16_e32 v2, 9, v5 -; VI-NEXT: v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, 0x900 +; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1674,28 +1675,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_movk_i32 s4, 0xff00 +; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x900 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_movk_i32 s5, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 -; GFX9-NEXT: v_add_u16_e32 v4, 9, v4 +; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v6 -; GFX9-NEXT: v_add_u16_e32 v2, 9, v6 -; GFX9-NEXT: v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v5, v0, s[2:3] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index e361aa4db2aa948..1b28ddb2c586204 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2135,19 +2135,18 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX8-LABEL: safe_math_fract_v2f16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0x204 -; GFX8-NEXT: v_floor_f16_e32 v4, v3 -; GFX8-NEXT: v_floor_f16_e32 v5, v0 -; GFX8-NEXT: v_fract_f16_e32 v6, v3 -; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v3, v7 -; GFX8-NEXT: v_pack_b32_f16 v4, v5, v4 -; GFX8-NEXT: v_fract_f16_e32 v5, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, vcc -; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v0, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc -; GFX8-NEXT: v_pack_b32_f16 v0, v0, v3 -; GFX8-NEXT: global_store_dword v[1:2], v4, off +; GFX8-NEXT: v_mov_b32_e32 v6, 0x204 +; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_floor_f16_e32 v4, v0 +; GFX8-NEXT: v_cmp_class_f16_sdwa s[4:5], v0, v6 src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_pack_b32_f16 v3, v4, v3 +; GFX8-NEXT: v_fract_f16_e32 v4, v0 +; GFX8-NEXT: v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: 
v_cmp_class_f16_e32 vcc, v0, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX8-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX8-NEXT: global_store_dword v[1:2], v3, off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 3118d6378804258..e8310e73f9a4759 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -803,13 +803,13 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2 ; VI-LABEL: v_fshr_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; VI-NEXT: v_mov_b32_e32 v5, 1 -; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v3, -1, v3 -; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5 -; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, -1 +; VI-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v4, v5, v4 +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0 ; VI-NEXT: v_xor_b32_e32 v4, -1, v2 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 @@ -887,13 +887,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; VI-LABEL: v_fshr_v3i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; VI-NEXT: v_mov_b32_e32 v8, 1 -; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 -; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8 -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v7, 1 +; VI-NEXT: v_mov_b32_e32 v8, -1 +; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v7, v8, v7 +; VI-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_xor_b32_e32 v7, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 @@ -910,13 +910,13 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2 ; GFX9-LABEL: v_fshr_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, 1 -; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, -1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v8, v7 +; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1 @@ -1019,18 +1019,18 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; VI-LABEL: v_fshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; VI-NEXT: v_mov_b32_e32 v8, 1 -; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v6, -1, v6 -; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9 -; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_xor_b32_e32 v7, -1, v7 -; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; VI-NEXT: v_mov_b32_e32 v7, 1 +; VI-NEXT: v_mov_b32_e32 v9, -1 +; VI-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v8, v10, v8 +; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; VI-NEXT: v_xor_b32_e32 v8, -1, v5 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 @@ -1040,7 +1040,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; VI-NEXT: v_xor_b32_e32 v3, -1, v4 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1049,18 +1048,18 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, 
<4 x i16> %src1, <4 x i16> %src2 ; GFX9-LABEL: v_fshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, 1 -; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 -; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v9, -1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, v10, v8 +; GFX9-NEXT: v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, v9, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1 ; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1 @@ -1070,7 +1069,6 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2 -; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 ; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index d4ff845e1edf3a0..7ee31bf4dce7cd2 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -637,6 +637,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_movk_i32 s4, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -644,19 +645,18 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: 
v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v3, v4, s[2:3] +; GFX9-NEXT: global_store_short v5, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -667,25 +667,25 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v6, v4, v1 ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX10-NEXT: v_trunc_f32_e32 v6, v6 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v6, v0, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 -; GFX10-NEXT: global_store_short v3, v4, s[2:3] +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v6, s0 +; GFX10-NEXT: global_store_short v5, v4, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -748,30 +748,28 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 +; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; 
GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 1, v3 -; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 -; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_short v5, v3, s[2:3] +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2 +; GFX9-NEXT: global_store_short v5, v4, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -782,26 +780,26 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 -; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_cvt_f32_u32_sdwa v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: global_store_short v5, v3, s[2:3] +; GFX10-NEXT: v_sub_nc_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX10-NEXT: global_store_short v5, v4, s[2:3] +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 9a1de74034cd835..0b131ea74f1abbc 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -4713,29 +4713,24 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] ; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NODL-NEXT: global_load_dword v3, v2, s[6:7] ; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v3 -; GFX9-NODL-NEXT: v_bfe_u32 v6, v3, 16, 8 -; GFX9-NODL-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v7, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v6, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v5, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v7, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v8, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v9, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_add3_u32 v3, v7, s0, v9 -; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v4, v6 -; GFX9-NODL-NEXT: v_add3_u32 v0, v8, v3, v0 -; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v5, v1 +; GFX9-NODL-NEXT: v_add3_u32 v3, v4, s0, v6 +; GFX9-NODL-NEXT: v_add3_u32 v3, v3, v7, v9 +; GFX9-NODL-NEXT: v_add3_u32 v0, v5, v3, v0 +; GFX9-NODL-NEXT: v_add3_u32 v0, v0, v8, v1 ; GFX9-NODL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index a2e30603b6afcd4..663b00911628695 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -659,28 +659,30 @@ define amdgpu_kernel void @fmuladd_v2f16( ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: s_mov_b32 s12, s2 -; VI-FLUSH-NEXT: s_mov_b32 s13, s3 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5 -; VI-FLUSH-NEXT: s_mov_b32 s18, s10 -; VI-FLUSH-NEXT: s_mov_b32 s19, s11 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11 -; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0 -; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 +; VI-FLUSH-NEXT: s_mov_b32 s18, s10 +; VI-FLUSH-NEXT: s_mov_b32 s19, s11 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; VI-FLUSH-NEXT: s_mov_b32 s8, 
s0 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(2) +; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(1) -; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2 -; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3 +; VI-FLUSH-NEXT: v_mac_f16_e32 v0, v2, v1 +; VI-FLUSH-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-FLUSH-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 14742c5827c1e4c..b9fef0834cb2456 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -183,11 +183,10 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX8-SDAG-LABEL: test_frexp_v2f16_v2i32: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-SDAG-NEXT: v_frexp_mant_f16_e32 v1, v0 -; GFX8-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v1, v2 +; GFX8-SDAG-NEXT: v_frexp_mant_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v1, v2 +; GFX8-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX8-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX8-SDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 @@ -197,11 +196,10 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v1 -; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v3, v0 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v3, v2 -; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v1, v1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v0 +; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v2, v1 +; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 ; GFX9-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16 ; GFX9-SDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 @@ -246,27 +244,25 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) { ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX8-GISEL: ; %bb.0: ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0 -; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 -; GFX8-GISEL-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX8-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v2, v2 -; GFX8-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX8-GISEL-NEXT: v_frexp_mant_f16_sdwa 
v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX8-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v3, v4 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0 -; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v0, v0 -; GFX9-GISEL-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v0, v2 -; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v2, v2 -; GFX9-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v0 +; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0 +; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX9-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16 +; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x half>, <2 x i32> } @llvm.frexp.v2f16.v2i32(<2 x half> %a) ret { <2 x half>, <2 x i32> } %result diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 6672568b98a2037..8861ee380be031e 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -147,11 +147,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_add_i32 s48, s49, 1 ; CHECK-NEXT: s_add_i32 s5, s49, 5 ; CHECK-NEXT: v_or3_b32 v57, s4, v43, s48 -; CHECK-NEXT: ds_read_u8 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: ds_read_u8 v56, v0 ; CHECK-NEXT: v_mov_b32_e32 v58, s48 ; CHECK-NEXT: s_mov_b32 s52, exec_lo -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v56, 0xff, v0 ; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42 ; CHECK-NEXT: s_cbranch_execz .LBB0_17 ; CHECK-NEXT: ; %bb.6: ; %.preheader2 @@ -175,10 +174,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v59, s54, v46 ; CHECK-NEXT: v_add_nc_u32_e32 v58, s54, v57 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v59 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -200,9 +199,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:1 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_12 ; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 
Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -225,9 +224,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:2 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_14 ; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -250,9 +249,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ds_read_u8 v0, v59 offset:3 -; CHECK-NEXT: s_mov_b32 s55, exec_lo ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s55, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_7 ; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 @@ -300,10 +299,10 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58 -; CHECK-NEXT: s_mov_b32 s53, exec_lo ; CHECK-NEXT: ds_read_u8 v0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cmpx_eq_u16_e64 v56, v0 +; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD +; CHECK-NEXT: s_and_saveexec_b32 s53, s4 ; CHECK-NEXT: s_cbranch_execz .LBB0_19 ; CHECK-NEXT: ; %bb.21: ; in Loop: Header=BB0_20 Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 7ca9ae359a49928..352c1ecf8ece4a6 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -697,18 +697,16 @@ define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, p ; GFX9-LABEL: add: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v7, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v1, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u16_sdwa v3, v7, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -808,17 +806,16 @@ define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 % ; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: v_and_b32_sdwa v1, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_or_b32_e32 v1, v0, v1 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v9 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 @@ -868,23 +865,22 @@ define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v9, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9 -; GFX9-NEXT: v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; 
GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -942,22 +938,20 @@ define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x10705 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 -; GFX9-NEXT: v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1391,22 +1385,20 @@ define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: global_load_dword v9, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x2000504 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mul_lo_u16_e32 v2, v9, v4 -; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_mul_lo_u16_e32 v0, v4, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v4, v9 +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1503,67 +1495,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_bfe_i32 v0, v4, 0, 8 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 24, v9 -; GFX10-NEXT: v_bfe_i32 v3, v4, 8, 8 -; GFX10-NEXT: v_bfe_i32 v1, v9, 16, 8 -; GFX10-NEXT: v_bfe_i32 v10, v4, 16, 8 -; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 24, v4 -; GFX10-NEXT: v_xor_b32_e32 v15, v2, v3 -; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX10-NEXT: v_xor_b32_e32 v12, v1, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v13 -; GFX10-NEXT: v_cvt_f32_i32_e32 v14, v1 -; GFX10-NEXT: v_xor_b32_e32 v1, v1, v10 -; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v3 -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v10 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX10-NEXT: v_mul_f32_e32 v16, v14, v16 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 -; GFX10-NEXT: v_ashrrev_i32_e32 v15, 30, v15 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
+; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 +; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 +; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 +; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_trunc_f32_e32 v15, v15 ; GFX10-NEXT: v_trunc_f32_e32 v16, v16 -; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_or_b32_e32 v15, 1, v15 -; GFX10-NEXT: v_mul_f32_e32 v18, v14, v18 +; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_mad_f32 v20, -v16, v13, v14 -; GFX10-NEXT: v_mul_f32_e32 v19, v13, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2 +; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 +; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mad_f32 v2, -v17, v3, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v13| -; GFX10-NEXT: v_trunc_f32_e32 v19, v19 -; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_mad_f32 v14, -v18, v10, v14 -; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v3| -; GFX10-NEXT: v_mad_f32 v21, -v19, v11, v13 +; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| +; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 +; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| +; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 ; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v15, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, |v10| -; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v16, v12 -; GFX10-NEXT: v_add_nc_u32_sdwa v2, v17, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v11| -; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: 
v_add_nc_u32_e32 v1, v18, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX10-NEXT: v_add_nc_u32_sdwa v0, v19, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| +; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0 +; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo +; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 -; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1581,67 +1567,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x60706 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_i32 v1, v4, 0, 8 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 -; GFX9-NEXT: v_bfe_i32 v2, v9, 16, 8 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 24, v9 -; GFX9-NEXT: v_bfe_i32 v9, v4, 8, 8 -; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v1 -; GFX9-NEXT: v_bfe_i32 v10, v4, 16, 8 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 24, v4 -; GFX9-NEXT: v_xor_b32_e32 v14, v3, v9 -; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v9 -; GFX9-NEXT: v_xor_b32_e32 v11, v2, v1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v13, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v10 -; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX9-NEXT: v_xor_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v10 +; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 -; GFX9-NEXT: v_mul_f32_e32 v15, v13, v15 -; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 +; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v11 -; GFX9-NEXT: v_mul_f32_e32 v17, v13, v17 -; GFX9-NEXT: v_mul_f32_e32 v18, v12, v18 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 +; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18 ; GFX9-NEXT: v_trunc_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v19, -v15, v12, v13 -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX9-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_mad_f32 v3, -v16, v9, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v12| -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2| +; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9 +; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 ; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 ; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 -; GFX9-NEXT: v_mad_f32 v13, -v17, v10, v13 +; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v12 +; GFX9-NEXT: v_mad_f32 v20, -v18, v4, v2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v9| -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v10| -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v15, v11 -; GFX9-NEXT: v_add_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v2, v17, v2 -; GFX9-NEXT: v_add_u32_sdwa v1, v18, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| +; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v20|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 +; GFX9-NEXT: v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v1, off ; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1876,73 +1856,67 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_bfe_i32 v1, v4, 0, 8 -; GFX10-NEXT: v_bfe_i32 v2, v4, 16, 8 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_ashrrev_i32_e32 v10, 24, v9 -; GFX10-NEXT: v_bfe_i32 v11, v4, 8, 8 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 24, v4 -; GFX10-NEXT: v_bfe_i32 v13, v9, 16, 8 -; GFX10-NEXT: v_xor_b32_e32 v14, v2, v1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX10-NEXT: v_xor_b32_e32 v16, v10, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX10-NEXT: v_cvt_f32_i32_e32 v15, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v10, v10 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v17, v12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v11 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 +; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 -; GFX10-NEXT: v_xor_b32_e32 v2, v12, v2 -; GFX10-NEXT: v_xor_b32_e32 v12, v13, v12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v21, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX10-NEXT: v_cvt_f32_i32_e32 v13, v13 -; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 -; GFX10-NEXT: v_mul_f32_e32 v18, v15, v18 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX10-NEXT: v_mul_f32_e32 v19, v10, v19 -; GFX10-NEXT: v_mul_f32_e32 v20, v17, v20 -; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17 +; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18 +; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_trunc_f32_e32 v17, v17 ; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mul_f32_e32 v21, v13, v21 +; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20 ; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, 
v14 +; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3 +; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12 +; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 ; GFX10-NEXT: v_trunc_f32_e32 v20, v20 +; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2| +; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 +; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21 +; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13| ; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 -; GFX10-NEXT: v_mad_f32 v22, -v18, v1, v15 -; GFX10-NEXT: v_trunc_f32_e32 v21, v21 -; GFX10-NEXT: v_mad_f32 v10, -v19, v11, v10 -; GFX10-NEXT: v_mad_f32 v23, -v20, v15, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v1| -; GFX10-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_mad_f32 v13, -v21, v17, v13 ; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v14, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v10|, |v11| ; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 ; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 -; GFX10-NEXT: v_cvt_i32_f32_e32 v21, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3| ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v16, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v15| -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v18, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v10, v19, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v13|, |v17| -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX10-NEXT: v_mul_lo_u32 v3, v10, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v20, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX10-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v21, v11 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15| +; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -1965,74 +1939,68 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x2070306 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 8 -; GFX9-NEXT: v_bfe_i32 v3, v4, 16, 
8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_ashrrev_i32_e32 v11, 24, v9 -; GFX9-NEXT: v_bfe_i32 v12, v4, 8, 8 -; GFX9-NEXT: v_xor_b32_e32 v16, v3, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 24, v4 -; GFX9-NEXT: v_xor_b32_e32 v18, v11, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v12, v12 -; GFX9-NEXT: v_cvt_f32_i32_e32 v17, v3 -; GFX9-NEXT: v_cvt_f32_i32_e32 v19, v13 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v2 -; GFX9-NEXT: v_bfe_i32 v15, v9, 16, 8 -; GFX9-NEXT: v_cvt_f32_i32_e32 v11, v11 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v12 -; GFX9-NEXT: v_xor_b32_e32 v3, v13, v3 -; GFX9-NEXT: v_xor_b32_e32 v13, v15, v13 -; GFX9-NEXT: v_cvt_f32_i32_e32 v15, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v17 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v19 -; GFX9-NEXT: v_mul_f32_e32 v20, v17, v20 -; GFX9-NEXT: v_mul_f32_e32 v21, v11, v21 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v20, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v21, v14 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v22, v10 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v23, v16 +; GFX9-NEXT: v_mul_f32_e32 v20, v10, v20 +; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_f32_e32 v21, v13, v21 ; GFX9-NEXT: v_trunc_f32_e32 v20, v20 -; GFX9-NEXT: v_ashrrev_i32_e32 v16, 30, v16 -; GFX9-NEXT: v_mul_f32_e32 v22, v19, v22 -; GFX9-NEXT: v_mul_f32_e32 v23, v15, v23 +; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX9-NEXT: v_mul_f32_e32 v22, v16, v22 +; GFX9-NEXT: v_mul_f32_e32 v23, v19, v23 ; GFX9-NEXT: v_trunc_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v24, -v20, v2, v17 -; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 -; GFX9-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_mad_f32 v24, -v20, v3, v10 +; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12 +; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v22, v22 ; GFX9-NEXT: v_trunc_f32_e32 v23, v23 -; GFX9-NEXT: v_mad_f32 v11, -v21, v12, v11 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v2| -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v3 -; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_mad_f32 v13, -v21, v14, v13 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v24|, |v3| +; GFX9-NEXT: v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15 +; GFX9-NEXT: v_or_b32_e32 v12, 1, v12 ; GFX9-NEXT: v_cvt_i32_f32_e32 v20, v20 ; GFX9-NEXT: v_cvt_i32_f32_e32 v21, v21 -; GFX9-NEXT: v_mad_f32 v25, -v22, v17, v19 +; GFX9-NEXT: v_mad_f32 v25, -v22, v10, v16 ; GFX9-NEXT: v_cvt_i32_f32_e32 v22, v22 -; GFX9-NEXT: v_mad_f32 v15, -v23, v19, v15 +; GFX9-NEXT: v_mad_f32 v19, -v23, v16, v19 ; GFX9-NEXT: 
v_cvt_i32_f32_e32 v23, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 30, v13 -; GFX9-NEXT: v_or_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v18, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v17| -; GFX9-NEXT: v_or_b32_e32 v13, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, |v19| -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| +; GFX9-NEXT: v_ashrrev_i32_e32 v18, 30, v18 +; GFX9-NEXT: v_or_b32_e32 v15, 1, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v12, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v25|, |v10| +; GFX9-NEXT: v_or_b32_e32 v18, 1, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v15, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16| +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v18, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4 ; GFX9-NEXT: v_add_u32_e32 v2, v20, v2 -; GFX9-NEXT: v_add_u32_e32 v11, v21, v11 -; GFX9-NEXT: v_add_u32_e32 v3, v22, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v21, v3 +; GFX9-NEXT: v_add_u32_e32 v10, v22, v10 ; GFX9-NEXT: v_add_u32_e32 v12, v23, v12 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v11, v10 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v14 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, v11 +; GFX9-NEXT: v_mul_lo_u32 v4, v10, v0 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v17 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_e32 v3, v14, v3 +; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_e32 v3, v17, v4 ; GFX9-NEXT: v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -2090,27 +2058,24 @@ define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off ; GFX9-NEXT: s_mov_b32 s4, 0x6070007 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX9-NEXT: v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v4, v2, v0, s4 -; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; 
GFX9-NEXT: v_sub_u16_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u16_e32 v1, v3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 +; GFX9-NEXT: v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 77e1694dbe7e19f..470b17f5cc560e8 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -1840,10 +1840,9 @@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX8-LABEL: v_mul_sub_x_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_mul_lo_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v0, v1, v0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index e2854df2468b390..8d374e19bafad07 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=GCN %s @@ -468,3 +469,8 @@ body: | S_ENDPGM 0, implicit %7 ... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# GCN: {{.*}} +# GFX89: {{.*}} +# GFX9: {{.*}} +# VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index c9dbadcbd23157b..c2930313271a979 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1557,7 +1557,8 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX89-NEXT: s_waitcnt vmcnt(1) ; GFX89-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_mac_f16_sdwa v4, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX89-NEXT: v_mac_f16_sdwa v4, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX89-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX89-NEXT: v_mac_f16_e32 v2, v3, v2 ; GFX89-NEXT: v_or_b32_e32 v2, v2, v4 @@ -1718,18 +1719,16 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] -; GFX89-NEXT: flat_load_dword v3, v[0:1] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 ; GFX89-NEXT: v_mov_b32_e32 v1, s5 -; GFX89-NEXT: s_waitcnt vmcnt(1) -; GFX89-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_mul_lo_u16_e32 v5, v3, v2 -; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-NEXT: v_mul_lo_u16_e32 v2, v5, v2 -; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX89-NEXT: v_mul_lo_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2 +; GFX89-NEXT: v_mul_lo_u16_e32 v4, v4, v2 +; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX89-NEXT: flat_store_dword v[0:1], v2 ; GFX89-NEXT: s_endpgm ; diff --git a/update.bat b/update.bat new file mode 100644 index 000000000000000..b7cd2c4ba9d9729 --- /dev/null +++ b/update.bat @@ -0,0 +1,21 @@ +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/fract-match.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idiv-licm.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idot4u.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll 
+llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/permute_i8.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/v_mac_f16.ll From 31c940f41b83a70bf04515d06e0ef70254dc51b5 Mon Sep 17 00:00:00 2001 From: Brian Favela Date: Fri, 7 Jun 2024 14:56:59 -0400 Subject: [PATCH 3/6] Updating tests and removing superfluous check --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 9 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 9 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 9 +- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 3 +- .../CodeGen/AMDGPU/sdwa-peephole-instr.mir | 6 -- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 88 +++++++++++++++++++ llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll | 6 +- update.bat | 21 ----- 8 files changed, 101 insertions(+), 50 deletions(-) delete mode 100644 update.bat diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 43348a0f68b137b..082aeeea2c7cc3f 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -45,7 +45,7 @@ class SDWADstOperand; using SDWAOperandsVector = SmallVector; // helper typedef to make code cleaner -typedef std::unordered_map SDWAOperandsMap; +typedef MapVector SDWAOperandsMap; class SIPeepholeSDWA : public MachineFunctionPass { private: @@ -53,7 +53,7 @@ class SIPeepholeSDWA : public MachineFunctionPass { const SIRegisterInfo *TRI; const SIInstrInfo *TII; - std::unordered_map> SDWAOperands; + MapVector> SDWAOperands; SDWAOperandsMap PotentialMatches; SmallVector ConvertedInstructions; @@ -356,11 +356,6 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) { return nullptr; } - - // Not handling the obscure case where the same use is in multiple operands - if (PotentialMatches->find(UseMO.getParent()) != PotentialMatches->end()) { - return nullptr; - } } // Now that it's guaranteed all uses are legal, iterate over the uses again // to add them for later conversion. 
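The SIPeepholeSDWA.cpp hunk above leans on the two-pass shape of SDWASrcOperand::potentialToConvert: first verify that every use of the defining register is convertible to SDWA, and only then record each use in PotentialMatches for later conversion, which is why the separate duplicate-use early-out becomes superfluous. Below is a minimal standalone sketch of that check-all-then-record pattern; UseInst, OperandsMap, and recordIfAllUsesConvertible are hypothetical stand-ins for the real MachineInstr/SDWAOperandsMap machinery, not LLVM API.

#include <iostream>
#include <map>
#include <vector>

struct UseInst { int Id; bool ConvertibleToSDWA; };   // stand-in for a using MachineInstr
using OperandsMap = std::map<int, std::vector<int>>;  // use id -> operand ids to fold

// Record OperandId against every use of a value, but only when all uses can
// accept an SDWA source selector; otherwise leave the map untouched.
bool recordIfAllUsesConvertible(const std::vector<UseInst> &Uses, int OperandId,
                                OperandsMap &PotentialMatches) {
  for (const UseInst &U : Uses)   // pass 1: legality check only, no mutation
    if (!U.ConvertibleToSDWA)
      return false;
  for (const UseInst &U : Uses)   // pass 2: every use is legal, record them all
    PotentialMatches[U.Id].push_back(OperandId);
  return true;
}

int main() {
  // Two convertible uses of the same def, as in the v_and_b32 pair from the
  // commit message: both uses end up with the folded operand recorded.
  std::vector<UseInst> Uses = {{0, true}, {1, true}};
  OperandsMap Matches;
  std::cout << recordIfAllUsesConvertible(Uses, 7, Matches) << ' '
            << Matches.size() << '\n';   // prints "1 2"
}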
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index 04da7b24156dcd3..168e6dfa5f147d2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -298,9 +298,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -607,12 +606,10 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_add_u16_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xff ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 8, v2 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index d7b1e37a81a1b97..2572f8581f0edf6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -279,11 +279,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 8, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0xff -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -300,9 +298,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 663b00911628695..cfdb4f7a07d024e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -679,7 +679,8 @@ define amdgpu_kernel void @fmuladd_v2f16( ; VI-FLUSH-NEXT: s_waitcnt vmcnt(1) ; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v5, v4 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-FLUSH-NEXT: v_mac_f16_e32 v0, v2, v1 ; VI-FLUSH-NEXT: v_or_b32_e32 v0, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir index 8d374e19bafad07..e2854df2468b390 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -1,4 +1,3 @@ -# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=GCN %s @@ -469,8 +468,3 @@ body: | S_ENDPGM 0, implicit %7 ... -## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -# GCN: {{.*}} -# GFX89: {{.*}} -# GFX9: {{.*}} -# VI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index c2930313271a979..bd0808e7e359db5 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -2204,6 +2204,94 @@ bb2: br label %bb0 } +define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addrspace(1) %ina, ptr addrspace(1) %inb) #0 { +; NOSDWA-LABEL: mac_v2half_same_srcop: +; NOSDWA: ; %bb.0: ; %entry +; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) +; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 +; NOSDWA-NEXT: flat_load_dword v2, v[2:3] +; NOSDWA-NEXT: flat_load_dword v3, v[0:1] +; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 +; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 +; NOSDWA-NEXT: s_waitcnt vmcnt(1) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: s_waitcnt vmcnt(0) +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; NOSDWA-NEXT: v_mac_f16_e32 v5, v4, v4 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; NOSDWA-NEXT: v_mac_f16_e32 v3, v2, v2 +; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v4 +; NOSDWA-NEXT: flat_store_dword v[0:1], v2 +; NOSDWA-NEXT: s_endpgm +; +; GFX89-LABEL: mac_v2half_same_srcop: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v1, s7 +; GFX89-NEXT: v_mov_b32_e32 v2, s0 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: flat_load_dword v4, v[0:1] +; GFX89-NEXT: flat_load_dword v2, v[2:3] +; GFX89-NEXT: v_mov_b32_e32 v0, s4 +; GFX89-NEXT: v_mov_b32_e32 v1, s5 +; GFX89-NEXT: s_waitcnt vmcnt(1) +; GFX89-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_mac_f16_sdwa v3, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX89-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX89-NEXT: v_mac_f16_e32 v4, v2, v2 +; GFX89-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX89-NEXT: flat_store_dword v[0:1], v2 +; GFX89-NEXT: s_endpgm +; +; GFX9-LABEL: mac_v2half_same_srcop: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: mac_v2half_same_srcop: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v1, v1, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: s_endpgm +entry: + %a = load <2 x half>, ptr addrspace(1) %ina, align 4 + %b = load <2 x half>, ptr addrspace(1) %inb, align 4 + %mul = fmul <2 x half> %b, %b + %mac = fadd <2 x half> %mul, %a + store <2 x half> %mac, ptr addrspace(1) %out, align 4 + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index d7a6be511069177..f8c9827ecf7a995 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -32,12 +32,12 @@ define amdgpu_kernel void @s_abs_v2i16(ptr addrspace(1) %out, <2 x i16> %val) #0 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 op_sel_hi:[1,0] +; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, ; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_sub_u16_sdwa v{{[0-9]+}}, [[ZERO]], v{{[0-9]+}} ; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32 diff --git a/update.bat b/update.bat deleted file mode 100644 index b7cd2c4ba9d9729..000000000000000 --- a/update.bat +++ /dev/null @@ -1,21 +0,0 @@ -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll -llvm\utils\update_llc_test_checks.py 
llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/fract-match.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idiv-licm.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/idot4u.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/llvm.frexp.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/permute_i8.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll -llvm\utils\update_llc_test_checks.py llvm/test/CodeGen/AMDGPU/v_mac_f16.ll From b5e2d06ee9b85b697e640a3bfa198db51eed1132 Mon Sep 17 00:00:00 2001 From: Brian Favela Date: Fri, 7 Jun 2024 16:59:50 -0400 Subject: [PATCH 4/6] Addressing review concerns --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 082aeeea2c7cc3f..64b7e2d3a4147ef 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -43,9 +43,7 @@ class SDWAOperand; class SDWADstOperand; using SDWAOperandsVector = SmallVector; - -// helper typedef to make code cleaner -typedef MapVector SDWAOperandsMap; +using SDWAOperandsMap = MapVector; class SIPeepholeSDWA : public MachineFunctionPass { private: @@ -347,13 +345,14 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, return nullptr; } - for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + + for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) { // If there exist use of subreg of Reg then return nullptr if (!isSameReg(UseMO, *Reg)) return nullptr; // Check that all instructions the use Reg can be converted - if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) { + if (!isConvertibleToSDWA(UseMI, ST, TII)) { return nullptr; } } From 0ef2512497d8c132e964a20dfff5dfa3d22646a1 Mon Sep 17 00:00:00 2001 From: Brian Favela Date: Fri, 7 Jun 2024 17:23:07 -0400 Subject: [PATCH 5/6] Address review and update tests --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 11 ++++----- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 25 +++++++++----------- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 3 +-- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 64b7e2d3a4147ef..8f56793d7bcc632 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -345,20 +345,19 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, return nullptr; } - for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) { - // If there exist use of subreg of Reg then return nullptr - if (!isSameReg(UseMO, *Reg)) - return nullptr; - - // Check that all instructions the 
use Reg can be converted + // Check that all instructions that use Reg can be converted if (!isConvertibleToSDWA(UseMI, ST, TII)) { return nullptr; } } + // Now that it's guaranteed all uses are legal, iterate over the uses again // to add them for later conversion. for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { + // Should not get a subregister here + assert(isSameReg(UseMO, *Reg)); + SDWAOperandsMap& potentialMatchesMap = *PotentialMatches; MachineInstr* UseMI = UseMO.getParent(); potentialMatchesMap[UseMI].push_back(this); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index cfdb4f7a07d024e..a2e30603b6afcd4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -659,31 +659,28 @@ define amdgpu_kernel void @fmuladd_v2f16( ; VI-FLUSH-NEXT: s_mov_b32 s14, s10 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s12, s2 +; VI-FLUSH-NEXT: s_mov_b32 s13, s3 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5 +; VI-FLUSH-NEXT: s_mov_b32 s18, s10 +; VI-FLUSH-NEXT: s_mov_b32 s19, s11 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11 -; VI-FLUSH-NEXT: s_mov_b32 s12, s2 -; VI-FLUSH-NEXT: s_mov_b32 s13, s3 -; VI-FLUSH-NEXT: s_mov_b32 s18, s10 -; VI-FLUSH-NEXT: s_mov_b32 s19, s11 -; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 -; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1 -; VI-FLUSH-NEXT: s_waitcnt vmcnt(2) -; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(1) -; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v5, v4 +; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-FLUSH-NEXT: v_mac_f16_e32 v0, v2, v1 -; VI-FLUSH-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2 +; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3 ; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-FLUSH-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index bd0808e7e359db5..0f2eedb1923d636 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1557,8 +1557,7 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX89-NEXT: s_waitcnt vmcnt(1) ; GFX89-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX89-NEXT: v_mac_f16_sdwa v4, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX89-NEXT: v_mac_f16_sdwa v4, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX89-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX89-NEXT: v_mac_f16_e32 v2, v3, v2 ; GFX89-NEXT: v_or_b32_e32 v2, v2, v4 From dcde769952628eeff37a7198bb05a323f9810542 Mon Sep 17 00:00:00 2001 
From: Brian Favela Date: Wed, 12 Jun 2024 16:47:36 -0400 Subject: [PATCH 6/6] Formatting changes --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 8f56793d7bcc632..f47731bf6aac3f9 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -337,20 +337,16 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, const GCNSubtarget &ST, SDWAOperandsMap *PotentialMatches) { - // If PotentialMatches is not null, then fill out the map for all uses, - // if all can be converted if (PotentialMatches != nullptr) { + // Fill out the map for all uses if all can be converted MachineOperand *Reg = getReplacedOperand(); - if (!Reg->isReg() || !Reg->isDef()) { + if (!Reg->isReg() || !Reg->isDef()) return nullptr; - } - for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) { + for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg())) // Check that all instructions that use Reg can be converted - if (!isConvertibleToSDWA(UseMI, ST, TII)) { + if (!isConvertibleToSDWA(UseMI, ST, TII)) return nullptr; - } - } // Now that it's guaranteed all uses are legal, iterate over the uses again // to add them for later conversion. @@ -358,8 +354,8 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII, // Should not get a subregister here assert(isSameReg(UseMO, *Reg)); - SDWAOperandsMap& potentialMatchesMap = *PotentialMatches; - MachineInstr* UseMI = UseMO.getParent(); + SDWAOperandsMap &potentialMatchesMap = *PotentialMatches; + MachineInstr *UseMI = UseMO.getParent(); potentialMatchesMap[UseMI].push_back(this); } return nullptr;