-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Adding multiple use analysis to SIPeepholeSDWA #94800
[AMDGPU] Adding multiple use analysis to SIPeepholeSDWA #94800
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be notified. If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write permissions for the repository. If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR. If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums. |
@llvm/pr-subscribers-llvm-globalisel Author: Brian Favela (bfavela) ChangesAllow for multiple uses of an operand where each instruction can be promoted to SDWA. For instance: ; v_and_b32 v2, lit(0x0000ffff), v2 Can be folded to: Patch is 188.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94800.diff 19 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1fadd8ce45b1f..082aeeea2c7cc 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -37,20 +37,24 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
+bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
+ const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;
-class SIPeepholeSDWA : public MachineFunctionPass {
-public:
- using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+// helper typedef to make code cleaner
+typedef MapVector<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
- MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+ SDWAOperandsMap PotentialMatches;
SmallVector<MachineInstr *, 8> ConvertedInstructions;
std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +69,6 @@ class SIPeepholeSDWA : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +96,9 @@ class SDWAOperand {
virtual ~SDWAOperand() = default;
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +131,9 @@ class SDWASrcOperand : public SDWAOperand {
: SDWAOperand(TargetOp, ReplacedOp),
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +160,9 @@ class SDWADstOperand : public SDWAOperand {
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +336,37 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
-MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
+ // If PotentialMatches is not null, then fill out the map for all uses,
+ // if all can be converted
+ if (PotentialMatches != nullptr) {
+ MachineOperand *Reg = getReplacedOperand();
+ if (!Reg->isReg() || !Reg->isDef()) {
+ return nullptr;
+ }
+
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ // If there exists a use of a subreg of Reg, then return nullptr
+ if (!isSameReg(UseMO, *Reg))
+ return nullptr;
+
+ // Check that all instructions that use Reg can be converted
+ if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
+ return nullptr;
+ }
+ }
+ // Now that it's guaranteed all uses are legal, iterate over the uses again
+ // to add them for later conversion.
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ SDWAOperandsMap& potentialMatchesMap = *PotentialMatches;
+ MachineInstr* UseMI = UseMO.getParent();
+ potentialMatchesMap[UseMI].push_back(this);
+ }
+ return nullptr;
+ }
+
// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +459,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}
-MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
// For SDWA dst operand potential instruction is one that defines register
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +960,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
- const GCNSubtarget &ST) const {
+namespace {
+bool isConvertibleToSDWA(MachineInstr &MI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -980,6 +1023,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return true;
}
+} // namespace
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1259,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1272,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 02781e763f44a..eb20178f9f4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -771,7 +771,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v6, 8
+; VI-NEXT: v_mov_b32_e32 v6, 9
+; VI-NEXT: v_mov_b32_e32 v7, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -779,28 +780,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v2, 9
+; VI-NEXT: v_mov_b32_e32 v2, 0xff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT: v_add_u16_e32 v9, 9, v1
-; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_add_u16_e32 v7, 9, v7
+; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_add_u16_e32 v8, 9, v8
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
+; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 06930388901b0..4df5fa18e2942 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1271,46 +1271,45 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX8-LABEL: v_fshl_v4i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_not_b32_e32 v7, v2
+; GFX8-NEXT: v_mov_b32_e32 v9, 1
+; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX8-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
-; GFX8-NEXT: v_mov_b32_e32 v10, 1
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v11
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX8-NEXT: v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v5
; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_mov_b32_e32 v9, 0xff
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3
+; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, 0xff
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 7, v6
-; GFX8-NEXT: v_not_b32_e32 v5, v6
-; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6
-; GFX8-NEXT: v_not_b32_e32 v6, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, 7
+; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
@@ -1321,47 +1320,46 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX9-LABEL: v_fshl_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_not_b32_e32 v7, v2
+; GFX9-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX9-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
-; GFX9-NEXT: v_mov_b32_e32 v10, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0
-; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX9-NEXT: v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v5
; GFX9-NEXT: v_not_b32_e32 v5, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_mov_b32_e32 v9, 0xff
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3
+; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3
; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4
+; GFX9-NEXT: v_mov_b32_e32 v8, 0xff
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 7, v6
-; GFX9-NEXT: v_not_b32_e32 v5, v6
-; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6
-; GFX9-NEXT: v_not_b32_e32 v6, v7
-; GFX9-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 7
+; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
+; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
@@ -1370,42 +1368,41 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-LABEL: v_fshl_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2
-; GFX10-NEXT: v_and_b32_e32 v10, 7, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX10-NEXT: v_and_b32_e32 v9, 7, v2
+; GFX10-NEXT: v_and_b32_e...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Brian Favela (bfavela) ChangesAllow for multiple uses of an operand where each instruction can be promoted to SDWA. For instance: ; v_and_b32 v2, lit(0x0000ffff), v2 Can be folded to: Patch is 188.70 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94800.diff 19 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 1fadd8ce45b1f..082aeeea2c7cc 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -37,20 +37,24 @@ STATISTIC(NumSDWAInstructionsPeepholed,
namespace {
+bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
+ const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;
-class SIPeepholeSDWA : public MachineFunctionPass {
-public:
- using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
+// helper typedef to make code cleaner
+typedef MapVector<MachineInstr *, SDWAOperandsVector> SDWAOperandsMap;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
- MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+ SDWAOperandsMap PotentialMatches;
SmallVector<MachineInstr *, 8> ConvertedInstructions;
std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
@@ -65,7 +69,6 @@ class SIPeepholeSDWA : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
- bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
@@ -93,7 +96,9 @@ class SDWAOperand {
virtual ~SDWAOperand() = default;
- virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
MachineOperand *getTargetOperand() const { return Target; }
@@ -126,7 +131,9 @@ class SDWASrcOperand : public SDWAOperand {
: SDWAOperand(TargetOp, ReplacedOp),
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getSrcSel() const { return SrcSel; }
@@ -153,7 +160,9 @@ class SDWADstOperand : public SDWAOperand {
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
- MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ MachineInstr *potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
SdwaSel getDstSel() const { return DstSel; }
@@ -327,7 +336,37 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}
-MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
+ // If PotentialMatches is not null, then fill out the map for all uses,
+ // if all can be converted
+ if (PotentialMatches != nullptr) {
+ MachineOperand *Reg = getReplacedOperand();
+ if (!Reg->isReg() || !Reg->isDef()) {
+ return nullptr;
+ }
+
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ // If there exists a use of a subreg of Reg, then return nullptr
+ if (!isSameReg(UseMO, *Reg))
+ return nullptr;
+
+ // Check that all instructions that use Reg can be converted
+ if (!isConvertibleToSDWA(*(UseMO.getParent()), ST, TII)) {
+ return nullptr;
+ }
+ }
+ // Now that it's guaranteed all uses are legal, iterate over the uses again
+ // to add them for later conversion.
+ for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
+ SDWAOperandsMap& potentialMatchesMap = *PotentialMatches;
+ MachineInstr* UseMI = UseMO.getParent();
+ potentialMatchesMap[UseMI].push_back(this);
+ }
+ return nullptr;
+ }
+
// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
@@ -420,7 +459,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}
-MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
+ const GCNSubtarget &ST,
+ SDWAOperandsMap *PotentialMatches) {
// For SDWA dst operand potential instruction is one that defines register
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
@@ -919,8 +960,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
-bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
- const GCNSubtarget &ST) const {
+namespace {
+bool isConvertibleToSDWA(MachineInstr &MI,
+ const GCNSubtarget &ST,
+ const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
@@ -980,6 +1023,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
return true;
}
+} // namespace
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
@@ -1215,7 +1259,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
@@ -1228,8 +1272,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
- MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
- if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 02781e763f44a..eb20178f9f4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -771,7 +771,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v6, 8
+; VI-NEXT: v_mov_b32_e32 v6, 9
+; VI-NEXT: v_mov_b32_e32 v7, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -779,28 +780,28 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v2, 9
+; VI-NEXT: v_mov_b32_e32 v2, 0xff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
-; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
+; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT: v_add_u16_e32 v9, 9, v1
-; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT: v_add_u16_e32 v7, 9, v7
+; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_add_u16_e32 v8, 9, v8
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
-; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_and_b32_e32 v6, 0xff, v6
+; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
+; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 06930388901b0..4df5fa18e2942 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1271,46 +1271,45 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX8-LABEL: v_fshl_v4i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_not_b32_e32 v7, v2
+; GFX8-NEXT: v_mov_b32_e32 v9, 1
+; GFX8-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX8-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX8-NEXT: v_not_b32_e32 v2, v2
-; GFX8-NEXT: v_mov_b32_e32 v10, 1
-; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v11
+; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX8-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX8-NEXT: v_and_b32_e32 v8, 7, v5
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v5
; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_mov_b32_e32 v9, 0xff
-; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3
+; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3
; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, 0xff
; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 7, v6
-; GFX8-NEXT: v_not_b32_e32 v5, v6
-; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6
-; GFX8-NEXT: v_not_b32_e32 v6, v7
-; GFX8-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX8-NEXT: v_and_b32_e32 v5, 7, v7
-; GFX8-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, 7
+; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX8-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8
+; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8
+; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 8
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v5
+; GFX8-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
@@ -1321,47 +1320,46 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX9-LABEL: v_fshl_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_not_b32_e32 v7, v2
+; GFX9-NEXT: v_mov_b32_e32 v9, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2
-; GFX9-NEXT: v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT: v_not_b32_e32 v2, v2
-; GFX9-NEXT: v_mov_b32_e32 v10, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0
-; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1
-; GFX9-NEXT: v_or_b32_e32 v2, v8, v2
-; GFX9-NEXT: v_and_b32_e32 v8, 7, v5
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v5
; GFX9-NEXT: v_not_b32_e32 v5, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_mov_b32_e32 v9, 0xff
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3
+; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3
; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4
+; GFX9-NEXT: v_mov_b32_e32 v8, 0xff
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT: v_and_b32_e32 v4, 7, v6
-; GFX9-NEXT: v_not_b32_e32 v5, v6
-; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6
-; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6
-; GFX9-NEXT: v_not_b32_e32 v6, v7
-; GFX9-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX9-NEXT: v_and_b32_e32 v5, 7, v7
-; GFX9-NEXT: v_and_b32_e32 v6, 7, v6
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 7
+; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT: v_and_b32_e32 v7, 7, v7
+; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10
+; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10
+; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 8
; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT: v_and_or_b32 v1, v2, v9, v1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0
@@ -1370,42 +1368,41 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-LABEL: v_fshl_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2
-; GFX10-NEXT: v_and_b32_e32 v10, 7, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2
+; GFX10-NEXT: v_and_b32_e32 v9, 7, v2
+; GFX10-NEXT: v_and_b32_e...
[truncated]
|
} | ||
// Now that it's guaranteed all uses are legal, iterate over the uses again | ||
// to add them for later conversion. | ||
for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems more important to only do this once for multiple-use operands in an instruction.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The "convertToSDWA" function iterates over every operand that can be converted. If an instruction has multiple uses of the same operand, it won't catch them. I added a test for this as well.
…e if it can be folded ; v_and_b32 v2, lit(0x0000ffff), v2 ; v_and_b32 v3, 6, v2 ; v_and_b32 v2, 1, v2 Can be folded to: ; v_and_b32 v3, 6, sel_lo(v2) ; v_and_b32 v2, 1, sel_lo(v2)
06c8c5b
to
0ef2512
Compare
@bfavela Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested Please check whether problems have been caused by your change specifically, as How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. If you don't get any reports, no action is required from you. Your changes are working as expected, well done! |
Allow for multiple uses of an operand where each using instruction can be promoted to SDWA.
For instance:
; v_and_b32 v2, lit(0x0000ffff), v2
; v_and_b32 v3, 6, v2
; v_and_b32 v2, 1, v2
Can be folded to:
; v_and_b32 v3, 6, sel_lo(v2)
; v_and_b32 v2, 1, sel_lo(v2)