Skip to content

Commit

Permalink
[AMDGPU] Adding multiple use analysis to SIPeepholeSDWA (#94800)
Browse files Browse the repository at this point in the history
Allow for multiple uses of an operand where each instruction can be
promoted to SDWA.

For instance:

; v_and_b32 v2, lit(0x0000ffff), v2
; v_and_b32 v3, 6, v2
; v_and_b32 v2, 1, v2

Can be folded to:
; v_and_b32 v3, 6, sel_lo(v2)
; v_and_b32 v2, 1, sel_lo(v2)
  • Loading branch information
bfavela authored Jun 14, 2024
1 parent b1932b8 commit e7e90dd
Show file tree
Hide file tree
Showing 18 changed files with 1,250 additions and 1,192 deletions.
68 changes: 53 additions & 15 deletions llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,22 @@ STATISTIC(NumSDWAInstructionsPeepholed,

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
const SIInstrInfo *TII);
class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
MachineRegisterInfo *MRI;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;

MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
SDWAOperandsMap PotentialMatches;
SmallVector<MachineInstr *, 8> ConvertedInstructions;

std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
Expand All @@ -65,7 +67,6 @@ class SIPeepholeSDWA : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &MF) override;
void matchSDWAOperands(MachineBasicBlock &MBB);
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
void pseudoOpConvertToVOP2(MachineInstr &MI,
const GCNSubtarget &ST) const;
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
Expand Down Expand Up @@ -93,7 +94,9 @@ class SDWAOperand {

virtual ~SDWAOperand() = default;

virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches = nullptr) = 0;
virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

MachineOperand *getTargetOperand() const { return Target; }
Expand Down Expand Up @@ -126,7 +129,9 @@ class SDWASrcOperand : public SDWAOperand {
: SDWAOperand(TargetOp, ReplacedOp),
SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
MachineInstr *potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

SdwaSel getSrcSel() const { return SrcSel; }
Expand All @@ -153,7 +158,9 @@ class SDWADstOperand : public SDWAOperand {
SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
: SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
MachineInstr *potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches = nullptr) override;
bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

SdwaSel getDstSel() const { return DstSel; }
Expand Down Expand Up @@ -327,7 +334,33 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
return Mods;
}

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches) {
if (PotentialMatches != nullptr) {
// Fill out the map for all uses if all can be converted
MachineOperand *Reg = getReplacedOperand();
if (!Reg->isReg() || !Reg->isDef())
return nullptr;

for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
// Check that all instructions that use Reg can be converted
if (!isConvertibleToSDWA(UseMI, ST, TII))
return nullptr;

// Now that it's guaranteed all uses are legal, iterate over the uses again
// to add them for later conversion.
for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
// Should not get a subregister here
assert(isSameReg(UseMO, *Reg));

SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
MachineInstr *UseMI = UseMO.getParent();
potentialMatchesMap[UseMI].push_back(this);
}
return nullptr;
}

// For SDWA src operand potential instruction is one that use register
// defined by parent instruction
MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
Expand Down Expand Up @@ -420,7 +453,9 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
const GCNSubtarget &ST,
SDWAOperandsMap *PotentialMatches) {
// For SDWA dst operand potential instruction is one that defines register
// that this operand uses
MachineRegisterInfo *MRI = getMRI();
Expand Down Expand Up @@ -919,8 +954,10 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST) const {
namespace {
bool isConvertibleToSDWA(MachineInstr &MI,
const GCNSubtarget &ST,
const SIInstrInfo* TII) {
// Check if this is already an SDWA instruction
unsigned Opc = MI.getOpcode();
if (TII->isSDWA(Opc))
Expand Down Expand Up @@ -980,6 +1017,7 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,

return true;
}
} // namespace

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
const SDWAOperandsVector &SDWAOperands) {
Expand Down Expand Up @@ -1215,7 +1253,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
matchSDWAOperands(MBB);
for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
if (PotentialMI &&
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
Expand All @@ -1228,8 +1266,8 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {

for (const auto &OperandPair : SDWAOperands) {
const auto &Operand = OperandPair.second;
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST, &PotentialMatches);
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
PotentialMatches[PotentialMI].push_back(Operand.get());
}
}
Expand Down
27 changes: 14 additions & 13 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
Original file line number Diff line number Diff line change
Expand Up @@ -771,36 +771,37 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v6, 8
; VI-NEXT: v_mov_b32_e32 v6, 9
; VI-NEXT: v_mov_b32_e32 v7, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 9
; VI-NEXT: v_mov_b32_e32 v2, 0xff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT: v_add_u16_e32 v9, 9, v1
; VI-NEXT: v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v7, 9, v7
; VI-NEXT: v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT: v_add_u16_e32 v8, 9, v8
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_and_b32_e32 v10, 0xff, v10
; VI-NEXT: v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_and_b32_e32 v1, 0xff, v8
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_and_b32_e32 v6, 0xff, v6
; VI-NEXT: v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v10
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v10
; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s2
Expand Down
Loading

1 comment on commit e7e90dd

@jrbyrnes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is causing an expensive check failure that is exposed by ded9564

See #97135

Please sign in to comment.