Skip to content

Commit

Permalink
[CodeGen][MachineLICM] Use RegUnits in HoistRegionPostRA (llvm#94608)
Browse files Browse the repository at this point in the history
Those BitVectors get expensive on targets like AMDGPU with thousands of
registers, and RegAliasIterator is also expensive.

We can move all liveness calculations to use RegUnits instead to speed
it up for targets where RegAliasIterator is expensive, like AMDGPU.
On targets where RegAliasIterator is cheap, this alternative can be a little more expensive, but I believe the tradeoff is worth it.
  • Loading branch information
Pierre-vh authored Jun 11, 2024
1 parent 546c816 commit d4b8b72
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 47 deletions.
143 changes: 98 additions & 45 deletions llvm/lib/CodeGen/MachineLICM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,8 @@ namespace {
void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
MachineBasicBlock *CurPreheader);

void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
SmallSet<int, 32> &StoredFIs,
SmallVectorImpl<CandidateInfo> &Candidates,
MachineLoop *CurLoop);

Expand Down Expand Up @@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
return false;
}

static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
BitVector &RUs,
const uint32_t *Mask) {
// Iterate over the RegMask raw to avoid constructing a BitVector, which is
// expensive as it implies dynamically allocating memory.
//
// We also work backwards.
const unsigned NumRegs = TRI.getNumRegs();
const unsigned MaskWords = (NumRegs + 31) / 32;
for (unsigned K = 0; K < MaskWords; ++K) {
// We want to set the bits that aren't in RegMask, so flip it.
uint32_t Word = ~Mask[K];

// Iterate all set bits, starting from the right.
while (Word) {
const unsigned SetBitIdx = countr_zero(Word);

// The bits are numbered from the LSB in each word.
const unsigned PhysReg = (K * 32) + SetBitIdx;

// Clear the bit at SetBitIdx. Doing it this way appears to generate less
// instructions on x86. This works because negating a number will flip all
// the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
// faster.
Word ^= Word & -Word;

if (PhysReg == NumRegs)
return;

if (PhysReg) {
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
RUs.set(*RUI);
}
}
}
}

/// Examine the instruction for potentai LICM candidate. Also
/// gather register def and frame object update information.
void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
BitVector &PhysRegClobbers,
void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
BitVector &RUClobbers,
SmallSet<int, 32> &StoredFIs,
SmallVectorImpl<CandidateInfo> &Candidates,
MachineLoop *CurLoop) {
Expand All @@ -448,7 +485,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// We can't hoist an instruction defining a physreg that is clobbered in
// the loop.
if (MO.isRegMask()) {
PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
continue;
}

Expand All @@ -460,16 +497,22 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
assert(Reg.isPhysical() && "Not expecting virtual register!");

if (!MO.isDef()) {
if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
// If it's using a non-loop-invariant register, then it's obviously not
// safe to hoist.
HasNonInvariantUse = true;
if (!HasNonInvariantUse) {
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
HasNonInvariantUse = true;
break;
}
}
}
continue;
}

if (MO.isImplicit()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
PhysRegClobbers.set(*AI);
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
RUClobbers.set(*RUI);
if (!MO.isDead())
// Non-dead implicit def? This cannot be hoisted.
RuledOut = true;
Expand All @@ -488,19 +531,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// If we have already seen another instruction that defines the same
// register, then this is not safe. Two defs is indicated by setting a
// PhysRegClobbers bit.
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
if (PhysRegDefs.test(*AS))
PhysRegClobbers.set(*AS);
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
if (RUDefs.test(*RUI)) {
RUClobbers.set(*RUI);
RuledOut = true;
} else if (RUClobbers.test(*RUI)) {
// MI defined register is seen defined by another instruction in
// the loop, it cannot be a LICM candidate.
RuledOut = true;
}

RUDefs.set(*RUI);
}
// Need a second loop because MCRegAliasIterator can visit the same
// register twice.
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
PhysRegDefs.set(*AS);

if (PhysRegClobbers.test(Reg))
// MI defined register is seen defined by another instruction in
// the loop, it cannot be a LICM candidate.
RuledOut = true;
}

// Only consider reloads for now and remats which do not have register
Expand All @@ -521,9 +563,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
if (!Preheader)
return;

unsigned NumRegs = TRI->getNumRegs();
BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
unsigned NumRegUnits = TRI->getNumRegUnits();
BitVector RUDefs(NumRegUnits); // RUs defined once in the loop.
BitVector RUClobbers(NumRegUnits); // RUs defined more than once.

SmallVector<CandidateInfo, 32> Candidates;
SmallSet<int, 32> StoredFIs;
Expand All @@ -540,22 +582,21 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
// FIXME: That means a reload that're reused in successor block(s) will not
// be LICM'ed.
for (const auto &LI : BB->liveins()) {
for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
PhysRegDefs.set(*AI);
for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
RUDefs.set(*RUI);
}

// Funclet entry blocks will clobber all registers
if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
PhysRegClobbers.setBitsNotInMask(Mask);
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);

SpeculationState = SpeculateUnknown;
for (MachineInstr &MI : *BB)
ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
CurLoop);
ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
}

// Gather the registers read / clobbered by the terminator.
BitVector TermRegs(NumRegs);
BitVector TermRUs(NumRegUnits);
MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
if (TI != Preheader->end()) {
for (const MachineOperand &MO : TI->operands()) {
Expand All @@ -564,8 +605,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
TermRegs.set(*AI);
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
TermRUs.set(*RUI);
}
}

Expand All @@ -583,24 +624,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
continue;

unsigned Def = Candidate.Def;
if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
bool Safe = true;
MachineInstr *MI = Candidate.MI;
for (const MachineOperand &MO : MI->all_uses()) {
if (!MO.getReg())
continue;
Register Reg = MO.getReg();
if (PhysRegDefs.test(Reg) ||
PhysRegClobbers.test(Reg)) {
bool Safe = true;
for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
Safe = false;
break;
}
}

if (!Safe)
continue;

MachineInstr *MI = Candidate.MI;
for (const MachineOperand &MO : MI->all_uses()) {
if (!MO.getReg())
continue;
for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
// If it's using a non-loop-invariant register, then it's obviously
// not safe to hoist.
Safe = false;
break;
}
}
if (Safe)
HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);

if (!Safe)
break;
}

if (Safe)
HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
}
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/indirect-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: v_writelane_b32 v40, s62, 30
; GCN-NEXT: v_writelane_b32 v40, s63, 31
; GCN-NEXT: s_mov_b64 s[6:7], exec
; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GCN-NEXT: s_movk_i32 s4, 0x7b
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
Expand Down Expand Up @@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
; GISEL-NEXT: s_mov_b64 s[6:7], exec
; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GISEL-NEXT: s_movk_i32 s4, 0x7b
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
Expand Down

0 comments on commit d4b8b72

Please sign in to comment.