diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1f8cbf7f340876..20305416079775 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold(
              "value are vectorized only if no scalar iteration overheads "
              "are incurred."));
 
+// Indicates that an epilogue is undesired, predication is preferred.
+// This means that the vectorizer will try to fold the loop-tail (epilogue)
+// into the loop and predicate the loop body accordingly.
+static cl::opt<bool> PreferPredicateOverEpilog(
+    "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
+    cl::desc("Indicate that an epilogue is undesired, predication should be "
+             "used instead."));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -906,7 +914,7 @@ enum ScalarEpilogueLowering {
   CM_ScalarEpilogueNotAllowedLowTripLoop,
 
   // Loop hint predicate indicating an epilogue is undesired.
-  CM_ScalarEpilogueNotNeededPredicatePragma
+  CM_ScalarEpilogueNotNeededUsePredicate
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -4804,9 +4812,9 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(TC);
-  case CM_ScalarEpilogueNotNeededPredicatePragma:
+  case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
-        dbgs() << "LV: vector predicate hint found.\n"
+        dbgs() << "LV: vector predicate hint/switch found.\n"
                << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
@@ -7298,8 +7306,8 @@ getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
       (F->hasOptSize() ||
        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
     SEL = CM_ScalarEpilogueNotAllowedOptSize;
-  else if (Hints.getPredicate())
-    SEL = CM_ScalarEpilogueNotNeededPredicatePragma;
+  else if (PreferPredicateOverEpilog || Hints.getPredicate())
+    SEL = CM_ScalarEpilogueNotNeededUsePredicate;
 
   return SEL;
 }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
index d7767385c78229..eb0b499f512eff 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck -check-prefix=PREDFLAG %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -13,7 +14,15 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noal
 ; CHECK: %index.next = add i64 %index, 8
 ; CHECK: %12 = icmp eq i64 %index.next, 432
 ; CHECK: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0
-
+; PREDFLAG-LABEL: tail_folding_enabled(
+; PREDFLAG: vector.body:
+; PREDFLAG: %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
+; PREDFLAG: %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
+; PREDFLAG: %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
+; PREDFLAG: call void @llvm.masked.store.v8i32.p0v8i32(
+; PREDFLAG: %index.next = add i64 %index, 8
+; PREDFLAG: %12 = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0
 entry:
   br label %for.body
 
@@ -40,6 +49,15 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
 ; CHECK-NOT: @llvm.masked.load.v8i32.p0v8i32(
 ; CHECK-NOT: @llvm.masked.store.v8i32.p0v8i32(
 ; CHECK: br i1 %44, label {{.*}}, label %vector.body
+; PREDFLAG-LABEL: tail_folding_disabled(
+; PREDFLAG: vector.body:
+; PREDFLAG: %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
+; PREDFLAG: %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
+; PREDFLAG: %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load
+; PREDFLAG: call void @llvm.masked.store.v8i32.p0v8i32(
+; PREDFLAG: %index.next = add i64 %index, 8
+; PREDFLAG: %12 = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !4
 entry:
   br label %for.body
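
For reference, below is a minimal standalone reproducer for the new switch. This is a sketch, not part of the patch: the function @foo and its exact loop shape are illustrative, while the opt invocation mirrors the new RUN line above. The trip count of 430 is not a multiple of the VF of 8, so without the switch (or the loop hint) the vectorizer would emit a scalar epilogue; with it, the tail should be folded into a predicated vector body using masked loads and stores.

; Hypothetical reproducer (sketch). Run with:
;   opt -loop-vectorize -prefer-predicate-over-epilog -S < input.ll
; and expect a predicated vector body (llvm.masked.load/llvm.masked.store)
; with no scalar remainder loop.
define void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B,
                 i32* noalias nocapture readonly %C) {
entry:
  br label %for.body

for.body:
  ; A[i] = B[i] + C[i], for i = 0 .. 429
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
  %pb = getelementptr inbounds i32, i32* %B, i64 %i
  %b = load i32, i32* %pb, align 4
  %pc = getelementptr inbounds i32, i32* %C, i64 %i
  %c = load i32, i32* %pc, align 4
  %sum = add nsw i32 %c, %b
  %pa = getelementptr inbounds i32, i32* %A, i64 %i
  store i32 %sum, i32* %pa, align 4
  %i.next = add nuw nsw i64 %i, 1
  %cmp = icmp eq i64 %i.next, 430
  br i1 %cmp, label %for.end, label %for.body

for.end:
  ret void
}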