Skip to content

Commit

Permalink
[ARM][LowOverheadLoops] Remove dead loop update instructions.
Browse files Browse the repository at this point in the history
After creating a low-overhead loop, the loop update instruction was still
lingering around hurting performance. This removes dead loop update
instructions, which in our case are mostly SUBS instructions.

To support this, some helper functions were added to MachineLoopUtils and
ReachingDefAnalysis to analyse live-ins of loop exit blocks and find uses
before a particular loop instruction, respectively.

This is a first version that removes a SUBS instruction when there are no other
uses inside and outside the loop block, but there are some more interesting
cases in test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll which
shows that there is room for improvement. For example, we can't handle this
case yet:

    ..
    dlstp.32  lr, r2
  .LBB0_1:
    mov r3, r2
    subs  r2, #4
    vldrh.u32 q2, [r1], #8
    vmov  q1, q0
    vmla.u32  q0, q2, r0
    letp  lr, .LBB0_1
  @ %bb.2:
    vctp.32 r3
    ..

which is a lot more tricky because r2 is not only used by the subs, but also by
the mov to r3, which is used outside the low-overhead loop by the vctp
instruction, and that requires a bit of a different approach, and I will follow
up on this.

Differential Revision: https://reviews.llvm.org/D71007
  • Loading branch information
Sjoerd Meijer committed Dec 11, 2019
1 parent bd0f271 commit d97cf1f
Show file tree
Hide file tree
Showing 11 changed files with 638 additions and 9 deletions.
5 changes: 5 additions & 0 deletions llvm/include/llvm/CodeGen/MachineLoopUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H

namespace llvm {
class MachineLoop;
class MachineBasicBlock;
class MachineRegisterInfo;
class TargetInstrInfo;
Expand All @@ -36,6 +37,10 @@ MachineBasicBlock *PeelSingleBlockLoop(LoopPeelDirection Direction,
MachineRegisterInfo &MRI,
const TargetInstrInfo *TII);

/// Return true if PhysReg is live outside the loop, i.e. determine if it
/// is live in the loop exit blocks, and false otherwise.
bool isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg);

} // namespace llvm

#endif // LLVM_LIB_CODEGEN_MACHINELOOPUTILS_H
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/ReachingDefAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,13 @@ class ReachingDefAnalysis : public MachineFunctionPass {
/// use or a live out.
bool isRegUsedAfter(MachineInstr *MI, int PhysReg);

/// Provides the first instruction before MI that uses PhysReg
MachineInstr *getInstWithUseBefore(MachineInstr *MI, int PhysReg);

/// Provides all instructions before MI that uses PhysReg
void getAllInstWithUseBefore(MachineInstr *MI, int PhysReg,
SmallVectorImpl<MachineInstr*> &Uses);

/// Provides the clearance - the number of instructions since the closest
/// reaching def instuction of PhysReg that reaches MI.
int getClearance(MachineInstr *MI, MCPhysReg PhysReg);
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/CodeGen/MachineLoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
Expand Down Expand Up @@ -130,3 +131,14 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction,

return NewBB;
}

bool llvm::isRegLiveInExitBlocks(MachineLoop *Loop, int PhysReg) {
SmallVector<MachineBasicBlock *, 4> ExitBlocks;
Loop->getExitBlocks(ExitBlocks);

for (auto *MBB : ExitBlocks)
if (MBB->isLiveIn(PhysReg))
return true;

return false;
}
26 changes: 25 additions & 1 deletion llvm/lib/CodeGen/ReachingDefAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) {
}

void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg,
SmallVectorImpl<MachineInstr*> &Uses) {
SmallVectorImpl<MachineInstr*> &Uses) {
MachineBasicBlock *MBB = Def->getParent();
MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
while (++MI != MBB->end()) {
Expand Down Expand Up @@ -272,3 +272,27 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) {
return false;
}

MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI,
int PhysReg) {
auto I = MachineBasicBlock::reverse_iterator(MI);
auto E = MI->getParent()->rend();
I++;

for ( ; I != E; I++)
for (auto &MO : I->operands())
if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg)
return &*I;

return nullptr;
}

void ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI,
int PhysReg, SmallVectorImpl<MachineInstr*> &Uses) {
MachineInstr *Use = nullptr;
MachineInstr *Pos = MI;

while ((Use = getInstWithUseBefore(Pos, PhysReg))) {
Uses.push_back(Use);
Pos = Use;
}
}
75 changes: 73 additions & 2 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"
Expand Down Expand Up @@ -163,6 +164,7 @@ namespace {
ReachingDefAnalysis *RDA = nullptr;
const ARMBaseInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
const TargetRegisterInfo *TRI = nullptr;
std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;

public:
Expand Down Expand Up @@ -200,6 +202,8 @@ namespace {

void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;

void RemoveLoopUpdate(LowOverheadLoop &LoLoop);

void RemoveVPTBlocks(LowOverheadLoop &LoLoop);

MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
Expand Down Expand Up @@ -383,6 +387,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
MRI = &MF->getRegInfo();
TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
BBUtils->computeAllBlockSizes();
BBUtils->adjustBBOffsetsAfter(&MF->front());
Expand Down Expand Up @@ -511,7 +516,7 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
MIB.addImm(0);
MIB.addImm(ARMCC::AL);
MIB.addReg(ARM::NoRegister);

MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
ARM::tBcc : ARM::t2Bcc;
Expand Down Expand Up @@ -631,6 +636,70 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
return &*MIB;
}

// Goal is to optimise and clean-up these loops:
//
// vector.body:
// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
// ..
// $lr = MVE_DLSTP_32 renamable $r3
//
// The SUB is the old update of the loop iteration count expression, which
// is no longer needed. This sub is removed when the element count, which is in
// r3 in this example, is defined by an instruction in the loop, and it has
// no uses.
//
void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();

LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");

if (LoLoop.ML->getNumBlocks() != 1) {
LLVM_DEBUG(dbgs() << "ARM Loops: single block loop expected\n");
return;
}

LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing MO: ";
LoLoop.VCTP->getOperand(1).dump());

// Find the definition we are interested in removing, if there is one.
MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
if (!Def)
return;

// Bail if we define CPSR and it is not dead
if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
return;
}

// Bail if elemcount is used in exit blocks, i.e. if it is live-in.
if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
return;
}

// Bail if there are uses after this Def in the block.
SmallVector<MachineInstr*, 4> Uses;
RDA->getReachingLocalUses(Def, ElemCount, Uses);
if (Uses.size()) {
LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
return;
}

Uses.clear();
RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);

// Remove Def if there are no uses, or if the only use is the VCTP
// instruction.
if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
Def->dump());
Def->eraseFromParent();
}
}

void ARMLowOverheadLoops::RemoveVPTBlocks(LowOverheadLoop &LoLoop) {
LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
LoLoop.VCTP->eraseFromParent();
Expand Down Expand Up @@ -703,8 +772,10 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
RemoveDeadBranch(LoLoop.Start);
LoLoop.End = ExpandLoopEnd(LoLoop);
RemoveDeadBranch(LoLoop.End);
if (LoLoop.IsTailPredicationLegal())
if (LoLoop.IsTailPredicationLegal()) {
RemoveLoopUpdate(LoLoop);
RemoveVPTBlocks(LoLoop);
}
}
}

Expand Down
171 changes: 171 additions & 0 deletions llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s

# There are 2 SUBS, and the 2nd one is identified as the def.
# Thus, the 1st is a use, and we shouldn't optimise away the SUBS.

# CHECK: bb.1.vector.body:
# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
# CHECK: $lr = MVE_LETP renamable $lr, %bb.1

--- |
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-unknown-eabi"

define dso_local void @use_before_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
%1 = lshr i32 %0, 2
%2 = shl nuw i32 %1, 2
%3 = add i32 %2, -4
%4 = lshr i32 %3, 2
%5 = add nuw nsw i32 %4, 1
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
call void @llvm.set.loop.iterations.i32(i32 %5)
br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%6 = phi i32 [ %5, %vector.ph ], [ %11, %vector.body ]
%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
%8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
%9 = sub i32 %7, 4
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef), !tbaa !3
%10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8), !tbaa !3
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
%12 = icmp ne i32 %11, 0
br i1 %12, label %vector.body, label %for.cond.cleanup, !llvm.loop !7

for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
declare void @llvm.set.loop.iterations.i32(i32) #1
declare <4 x i1> @llvm.arm.vctp32(i32) #2
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #3
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #4
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3
declare void @llvm.stackprotector(i8*, i8**) #5

attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #1 = { noduplicate nounwind }
attributes #2 = { nounwind readnone }
attributes #3 = { argmemonly nounwind willreturn }
attributes #4 = { argmemonly nounwind readonly willreturn }
attributes #5 = { nounwind }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 1, !"min_enum_size", i32 4}
!2 = !{!"clang version 10.0.0 (http://github.com/llvm/llvm-project 2589b6d9edda73280fe1dc1d944ee34e22fe9a6f)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"int", !5, i64 0}
!5 = !{!"omnipotent char", !6, i64 0}
!6 = !{!"Simple C++ TBAA"}
!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.isvectorized", i32 1}

...
---
name: use_before_def
alignment: 2
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
failedISel: false
tracksRegLiveness: true
hasWinCFI: false
registers: []
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
- { reg: '$r2', virtual-reg: '' }
- { reg: '$r3', virtual-reg: '' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 8
offsetAdjustment: 0
maxAlignment: 4
adjustsStack: false
hasCalls: false
stackProtector: ''
maxCallFrameSize: 0
cvBytesOfCalleeSavedRegisters: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
localFrameSize: 0
savePoint: ''
restorePoint: ''
fixedStack: []
stack:
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
callSites: []
constants: []
machineFunctionInfo: {}
body: |
bb.0.entry:
successors: %bb.1(0x80000000)
liveins: $r0, $r1, $r2, $r3, $lr
frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
frame-setup CFI_INSTRUCTION def_cfa_offset 8
frame-setup CFI_INSTRUCTION offset $lr, -4
frame-setup CFI_INSTRUCTION offset $r7, -8
$r7 = frame-setup tMOVr $sp, 14, $noreg
frame-setup CFI_INSTRUCTION def_cfa_register $r7
tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
t2IT 11, 8, implicit-def $itstate
tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
renamable $lr = t2MOVi 1, 14, $noreg, $noreg
renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
t2DoLoopStart renamable $lr
bb.1.vector.body:
successors: %bb.1(0x7c000000), %bb.2(0x04000000)
liveins: $lr, $r0, $r1, $r2, $r3
renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
MVE_VPST 4, implicit $vpr
renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4, !tbaa !3)
renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4, !tbaa !3)
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0
MVE_VPST 8, implicit $vpr
renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1719, align 4, !tbaa !3)
renamable $lr = t2LoopDec killed renamable $lr, 1
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr
tB %bb.2, 14, $noreg
bb.2.for.cond.cleanup:
tPOP_RET 14, $noreg, def $r7, def $pc
...
Loading

0 comments on commit d97cf1f

Please sign in to comment.