Skip to content

Commit

Permalink
Recommit #2: "[LV] Induction Variable does not remain scalar under ta…
Browse files Browse the repository at this point in the history
…il-folding."

This was reverted because of a miscompilation. At closer inspection, the
problem was actually visible in a changed llvm regression test too. This
one-line follow up fix/recommit will splat the IV, which is what we are trying
to avoid if unnecessary in general, if tail-folding is requested even if all
users are scalar instructions after vectorisation. Because with tail-folding,
the splat IV will be used by the predicate of the masked loads/stores
instructions. The previous version omitted this, which caused the
miscompilation. The original commit message was:

If tail-folding of the scalar remainder loop is applied, the primary induction
variable is splat to a vector and used by the masked load/store vector
instructions, thus the IV does not remain scalar. Because we now mark
that the IV does not remain scalar for these cases, we don't emit the vector IV
if it is not used. Thus, the vectoriser produces less dead code.

Thanks to Ayal Zaks for the direction how to fix this.
  • Loading branch information
Sjoerd Meijer committed May 13, 2020
1 parent 897d8ee commit 9529597
Show file tree
Hide file tree
Showing 29 changed files with 3,213 additions and 1,364 deletions.
14 changes: 10 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1909,11 +1909,12 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
return;
}

// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
// TODO: Don't do it unless the vectorized IV is really required.
// All IV users are scalar instructions, so only emit a scalar IV, not a
// vectorised IV. Except when we tail-fold, then the splat IV feeds the
// predicate used by the masked loads/stores.
Value *ScalarIV = CreateScalarIV(Step);
CreateSplatIV(ScalarIV, Step);
if (!Cost->isScalarEpilogueAllowed())
CreateSplatIV(ScalarIV, Step);
buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}

Expand Down Expand Up @@ -4594,6 +4595,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
continue;

// If tail-folding is applied, the primary induction variable will be used
// to feed a vector compare.
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;

// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
; FORCED-NEXT: %0 = add i32 %index, 0
; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 0
Expand Down Expand Up @@ -68,9 +65,6 @@ declare float @pow(float, float) readnone nounwind

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
; FORCED-NEXT: %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
; FORCED-NEXT: %0 = add i32 %index, 0
; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 0
Expand Down
6 changes: 0 additions & 6 deletions llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,9 @@ define void @_Z1dv() local_unnamed_addr #0 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]]
; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP18]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT6]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
Expand Down
3 changes: 0 additions & 3 deletions llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,6 @@ define i32 @test(float* nocapture readonly %x) {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ target triple = "hexagon"

; CHECK-LABEL: @test1
; CHECK: vector.body:
; CHECK: %induction = add <64 x i32>
; CHECK: icmp ule <64 x i32> %induction
; CHECK: icmp ule <64 x i32> %vec.ind
; CHECK-NOT: load <{{.*}} x i32>


Expand Down
3 changes: 0 additions & 3 deletions llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ define dso_local void @test(i32* %Arr, i32 signext %Len) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[TMP1]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ define void @func_21() {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ <i32 undef, i32 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_STORE_CONTINUE4]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[INDUCTION]], <i64 4, i64 4>
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], <i64 4, i64 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
Expand Down Expand Up @@ -61,6 +59,7 @@ define void @func_21() {
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
; CHECK: pred.store.continue4:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
Expand Down
19 changes: 19 additions & 0 deletions llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,25 @@ define void @f1() {
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ]
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ]
; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64
; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]]
; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16*
; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64
; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]]
; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]]
; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop !2
; CHECK: bb3:
; CHECK-NEXT: ret void
;

bb1:
br label %bb2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,6 @@ define double @sumIfVector(double* nocapture readonly %arr) {
; AVX: vector.body:
; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; AVX-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
Expand Down
Loading

0 comments on commit 9529597

Please sign in to comment.