Skip to content

Commit

Permalink
[LoopVectorize] Refine runtime memory check costs when there is an outer loop
Browse files Browse the repository at this point in the history

When we generate runtime memory checks for an inner loop it's
possible that these checks are invariant in the outer loop and
so will get hoisted out. In such cases, the effective cost of
the checks should reduce to reflect the outer loop trip count.

This fixes a 25% performance regression introduced by commit
49b0e6d when building the SPEC2017 x264 benchmark with PGO, where we
decided the inner loop trip count wasn't high enough to warrant
the (incorrect) high cost of the runtime checks. Also, when
runtime memory checks consist entirely of diff checks these are
likely to be outer loop invariant.
  • Loading branch information
david-arm committed Dec 20, 2023
1 parent a4caa47 commit a152314
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 10 deletions.
37 changes: 33 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2070,7 +2070,7 @@ class GeneratedRTChecks {
}
}

InstructionCost getCost() {
InstructionCost getCost(Loop *OuterLoop) {
if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

Expand All @@ -2091,16 +2091,45 @@ class GeneratedRTChecks {
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
if (MemCheckBlock)
if (MemCheckBlock) {
InstructionCost MemCheckCost = 0;
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
InstructionCost C =
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
MemCheckCost += C;
}

// If the runtime memory checks are being created inside an outer loop
// we should find out if these checks are outer loop invariant. If so,
// the checks will be hoisted out and so the effective cost will reduce
// according to the outer loop trip count.
if (OuterLoop) {
ScalarEvolution *SE = MemCheckExp.getSE();
const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
if (SE->isLoopInvariant(Cond, OuterLoop)) {
if (std::optional<unsigned> OuterTC =
getSmallBestKnownTC(*SE, OuterLoop))
MemCheckCost /= *OuterTC;
else {
// It seems reasonable to assume that we can reduce the effective
// cost of the checks even when we know nothing about the trip
// count. Here I've assumed that the outer loop executes at least
// twice.
MemCheckCost /= 2;
}

// Let's ensure the cost is always at least 1.
if (MemCheckCost == 0)
MemCheckCost = 1;
}
}

RTCheckCost += MemCheckCost;
}

if (SCEVCheckBlock || MemCheckBlock)
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
<< "\n");
Expand Down Expand Up @@ -9754,7 +9783,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
std::optional<unsigned> VScale, Loop *L,
ScalarEvolution &SE,
ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
if (!CheckCost.isValid())
return false;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks: 6
; CHECK: Total cost of runtime checks: 3
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
Expand Down Expand Up @@ -43,7 +43,7 @@ outer.exit:
define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks: 6
; CHECK: Total cost of runtime checks: 2
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
Expand Down Expand Up @@ -79,7 +79,7 @@ outer.exit:
define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks: 6
; CHECK: Total cost of runtime checks: 1
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
Expand Down Expand Up @@ -115,7 +115,7 @@ outer.exit:
define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks: 6
; CHECK: Total cost of runtime checks: 2
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
Expand Down Expand Up @@ -151,8 +151,8 @@ outer.exit:
define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks: 6
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
; CHECK: Total cost of runtime checks: 2
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
entry:
br label %outer.loop

Expand Down

0 comments on commit a152314

Please sign in to comment.