forked from llvm/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LoopVectorize] Refine runtime memory check costs when there is an ou…
…ter loop (llvm#76034) When we generate runtime memory checks for an inner loop it's possible that these checks are invariant in the outer loop and so will get hoisted out. In such cases, the effective cost of the checks should reduce to reflect the outer loop trip count. This fixes a 25% performance regression introduced by commit 49b0e6d when building the SPEC2017 x264 benchmark with PGO, where we decided the inner loop trip count wasn't high enough to warrant the (incorrect) high cost of the runtime checks. Also, when runtime memory checks consist entirely of diff checks these are likely to be outer loop invariant. (cherry picked from commit 962fbaf)
- Loading branch information
Showing
2 changed files
with
273 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
217 changes: 217 additions & 0 deletions
217
llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
; REQUIRES: asserts | ||
; RUN: opt -p loop-vectorize -debug-only=loop-vectorize -S -disable-output < %s 2>&1 | FileCheck %s | ||
|
||
target triple = "aarch64-unknown-linux-gnu" | ||
|
||
define void @no_outer_loop(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %off, i64 noundef %n) { | ||
; CHECK-LABEL: LV: Checking a loop in 'no_outer_loop' | ||
; CHECK: Calculating cost of runtime checks: | ||
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. | ||
; CHECK: Total cost of runtime checks: 4 | ||
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 | ||
entry: | ||
br label %inner.loop | ||
|
||
inner.loop: | ||
%inner.iv = phi i64 [ 0, %entry ], [ %inner.iv.next, %inner.loop ] | ||
%add.us = add nuw nsw i64 %inner.iv, %off | ||
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us | ||
%0 = load i8, ptr %arrayidx.us, align 1 | ||
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us | ||
%1 = load i8, ptr %arrayidx7.us, align 1 | ||
%add9.us = add i8 %1, %0 | ||
store i8 %add9.us, ptr %arrayidx7.us, align 1 | ||
%inner.iv.next = add nuw nsw i64 %inner.iv, 1 | ||
%exitcond.not = icmp eq i64 %inner.iv.next, %n | ||
br i1 %exitcond.not, label %inner.exit, label %inner.loop | ||
|
||
inner.exit: | ||
ret void | ||
} | ||
|
||
define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { | ||
; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc' | ||
; CHECK: Calculating cost of runtime checks: | ||
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3 | ||
; CHECK: Total cost of runtime checks: 3 | ||
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 | ||
entry: | ||
br label %outer.loop | ||
|
||
outer.loop: | ||
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] | ||
%mul.us = mul nsw i64 %outer.iv, %n | ||
br label %inner.loop | ||
|
||
inner.loop: | ||
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] | ||
%add.us = add nuw nsw i64 %inner.iv, %mul.us | ||
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us | ||
%0 = load i8, ptr %arrayidx.us, align 1 | ||
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us | ||
%1 = load i8, ptr %arrayidx7.us, align 1 | ||
%add9.us = add i8 %1, %0 | ||
store i8 %add9.us, ptr %arrayidx7.us, align 1 | ||
%inner.iv.next = add nuw nsw i64 %inner.iv, 1 | ||
%exitcond.not = icmp eq i64 %inner.iv.next, %n | ||
br i1 %exitcond.not, label %inner.exit, label %inner.loop | ||
|
||
inner.exit: | ||
%outer.iv.next = add nuw nsw i64 %outer.iv, 1 | ||
%exitcond27.not = icmp eq i64 %outer.iv.next, %m | ||
br i1 %exitcond27.not, label %outer.exit, label %outer.loop | ||
|
||
outer.exit: | ||
ret void | ||
} | ||
|
||
|
||
define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { | ||
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3' | ||
; CHECK: Calculating cost of runtime checks: | ||
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 | ||
; CHECK: Total cost of runtime checks: 2 | ||
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 | ||
entry: | ||
br label %outer.loop | ||
|
||
outer.loop: | ||
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] | ||
%mul.us = mul nsw i64 %outer.iv, %n | ||
br label %inner.loop | ||
|
||
inner.loop: | ||
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] | ||
%add.us = add nuw nsw i64 %inner.iv, %mul.us | ||
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us | ||
%0 = load i8, ptr %arrayidx.us, align 1 | ||
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us | ||
%1 = load i8, ptr %arrayidx7.us, align 1 | ||
%add9.us = add i8 %1, %0 | ||
store i8 %add9.us, ptr %arrayidx7.us, align 1 | ||
%inner.iv.next = add nuw nsw i64 %inner.iv, 1 | ||
%exitcond.not = icmp eq i64 %inner.iv.next, %n | ||
br i1 %exitcond.not, label %inner.exit, label %inner.loop | ||
|
||
inner.exit: | ||
%outer.iv.next = add nuw nsw i64 %outer.iv, 1 | ||
%exitcond26.not = icmp eq i64 %outer.iv.next, 3 | ||
br i1 %exitcond26.not, label %outer.exit, label %outer.loop | ||
|
||
outer.exit: | ||
ret void | ||
} | ||
|
||
|
||
define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) { | ||
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64' | ||
; CHECK: Calculating cost of runtime checks: | ||
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 1 | ||
; CHECK: Total cost of runtime checks: 1 | ||
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 | ||
entry: | ||
br label %outer.loop | ||
|
||
outer.loop: | ||
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] | ||
%mul.us = mul nsw i64 %outer.iv, %n | ||
br label %inner.loop | ||
|
||
inner.loop: | ||
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] | ||
%add.us = add nuw nsw i64 %inner.iv, %mul.us | ||
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us | ||
%0 = load i8, ptr %arrayidx.us, align 1 | ||
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us | ||
%1 = load i8, ptr %arrayidx7.us, align 1 | ||
%add9.us = add i8 %1, %0 | ||
store i8 %add9.us, ptr %arrayidx7.us, align 1 | ||
%inner.iv.next = add nuw nsw i64 %inner.iv, 1 | ||
%exitcond.not = icmp eq i64 %inner.iv.next, %n | ||
br i1 %exitcond.not, label %inner.exit, label %inner.loop | ||
|
||
inner.exit: | ||
%outer.iv.next = add nuw nsw i64 %outer.iv, 1 | ||
%exitcond26.not = icmp eq i64 %outer.iv.next, 64 | ||
br i1 %exitcond26.not, label %outer.exit, label %outer.loop | ||
|
||
outer.exit: | ||
ret void | ||
} | ||
|
||
|
||
define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) { | ||
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3' | ||
; CHECK: Calculating cost of runtime checks: | ||
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 | ||
; CHECK: Total cost of runtime checks: 2 | ||
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16 | ||
entry: | ||
br label %outer.loop | ||
|
||
outer.loop: | ||
%outer.iv = phi i64 [ %outer.iv.next, %inner.exit ], [ 0, %entry ] | ||
%mul.us = mul nsw i64 %outer.iv, %n | ||
br label %inner.loop | ||
|
||
inner.loop: | ||
%inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] | ||
%add.us = add nuw nsw i64 %inner.iv, %mul.us | ||
%arrayidx.us = getelementptr inbounds i8, ptr %b, i64 %add.us | ||
%0 = load i8, ptr %arrayidx.us, align 1 | ||
%arrayidx7.us = getelementptr inbounds i8, ptr %a, i64 %add.us | ||
%1 = load i8, ptr %arrayidx7.us, align 1 | ||
%add9.us = add i8 %1, %0 | ||
store i8 %add9.us, ptr %arrayidx7.us, align 1 | ||
%inner.iv.next = add nuw nsw i64 %inner.iv, 1 | ||
%exitcond.not = icmp eq i64 %inner.iv.next, %n | ||
br i1 %exitcond.not, label %inner.exit, label %inner.loop | ||
|
||
inner.exit: | ||
%outer.iv.next = add nuw nsw i64 %outer.iv, 1 | ||
%exitcond26.not = icmp eq i64 %outer.iv.next, %m | ||
br i1 %exitcond26.not, label %outer.exit, label %outer.loop, !prof !0 | ||
|
||
outer.exit: | ||
ret void | ||
} | ||
|
||
|
||
define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) { | ||
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks' | ||
; CHECK: Calculating cost of runtime checks: | ||
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 2 | ||
; CHECK: Total cost of runtime checks: 2 | ||
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4 | ||
entry: | ||
br label %outer.loop | ||
|
||
outer.loop: | ||
%outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] | ||
%0 = mul nsw i64 %outer.iv, %n | ||
br label %inner.loop | ||
|
||
inner.loop: | ||
%iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] | ||
%1 = add nuw nsw i64 %iv.inner, %0 | ||
%arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %1 | ||
%2 = load i32, ptr %arrayidx.us, align 4 | ||
%arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %1 | ||
%3 = load i32, ptr %arrayidx8.us, align 4 | ||
%add9.us = add nsw i32 %3, %2 | ||
store i32 %add9.us, ptr %arrayidx8.us, align 4 | ||
%iv.inner.next = add nuw nsw i64 %iv.inner, 1 | ||
%inner.exit.cond = icmp eq i64 %iv.inner.next, %n | ||
br i1 %inner.exit.cond, label %inner.exit, label %inner.loop | ||
|
||
inner.exit: | ||
%outer.iv.next = add nuw nsw i64 %outer.iv, 1 | ||
%outer.exit.cond = icmp eq i64 %outer.iv.next, 3 | ||
br i1 %outer.exit.cond, label %outer.exit, label %outer.loop | ||
|
||
outer.exit: | ||
ret void | ||
} | ||
|
||
|
||
!0 = !{!"branch_weights", i32 10, i32 20} |