[LoopVectorize] Extract the last lane from a uniform store
Changes VPReplicateRecipe to extract the last lane from an unconditional,
uniform store instruction. collectLoopUniforms will also add stores to
the list of uniform instructions where Legal->isUniformMemOp is true.
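
For illustration, a store is a uniform memory op when it is unconditional
and its address is loop-invariant, as in this reduced sketch of the
inv_store_i16 test below (names abbreviated):

  for.body:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
    %gep = getelementptr inbounds i16, i16* %src, i64 %iv
    %val = load i16, i16* %gep, align 2
    store i16 %val, i16* %dst, align 2   ; %dst is the same on every iteration
    %iv.next = add nuw nsw i64 %iv, 1
    %cmp = icmp ult i64 %iv.next, %N
    br i1 %cmp, label %for.body, label %for.end

Since later iterations overwrite earlier ones, only the lane corresponding
to the final scalar iteration of each vector iteration is observable.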

setCostBasedWideningDecision now sets the widening decision for
all uniform memory ops to Scalarize, whereas previously GatherScatter
could be chosen for scalable stores.
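
Per the existing comment in setCostBasedWideningDecision, the scalarized
cost is modelled roughly as:

  cost(uniform load)  = cost(scalar load)  + cost(broadcast)
  cost(uniform store) = cost(scalar store)
                        + (stored value loop-invariant ? 0 : cost(extract))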

This fixes an assert ("Cannot yet scalarize uniform stores") in
setCostBasedWideningDecision when we have a loop containing a
uniform i1 store and a scalable VF, which we cannot create a scatter for.
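
The codegen difference shows up in the sve-inv-store.ll test below: for a
<vscale x 4 x i16> VF, the store to the invariant address was previously a
scatter to a splat of the pointer and is now an extract of the runtime last
lane, roughly (%wide.load, %splat.ptrs and %mask abbreviate the names in
the test):

  ; before: scatter each lane to the same address
  call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %wide.load, <vscale x 4 x i16*> %splat.ptrs, i32 2, <vscale x 4 x i1> %mask)

  ; after: extract lane (vscale * 4 - 1) and store it once
  %vs   = call i32 @llvm.vscale.i32()
  %n    = mul i32 %vs, 4
  %last = sub i32 %n, 1
  %elt  = extractelement <vscale x 4 x i16> %wide.load, i32 %last
  store i16 %elt, i16* %dst, align 2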

Reviewed By: sdesmalen, david-arm, fhahn

Differential Revision: https://reviews.llvm.org/D112725
kmclaughlin-arm committed Nov 9, 2021
1 parent 092cee5 commit 0d748b4
Showing 9 changed files with 189 additions and 135 deletions.
50 changes: 28 additions & 22 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1770,6 +1770,7 @@ class LoopVectorizationCostModel {
DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

/// Holds the instructions known to be uniform after vectorization.
/// Entries in Uniforms may demand either the first or last lane.
/// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

@@ -5409,9 +5410,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");

// A uniform memory op is itself uniform. We exclude uniform stores
// here as they demand the last lane, not the first one.
if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
// A uniform memory op is itself uniform.
if (Legal->isUniformMemOp(*I)) {
assert(WideningDecision == CM_Scalarize);
return true;
}
@@ -5436,7 +5436,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
SetVector<Value *> HasUniformUse;

// Scan the loop for instructions which are either a) known to have only
// lane 0 demanded or b) are uses which demand only lane 0 of their operand.
// lane 0 or the last lane demanded or b) are uses which demand only
// lane 0 of their operand.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
@@ -5468,10 +5469,15 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (!Ptr)
continue;

// A uniform memory op is itself uniform. We exclude uniform stores
// here as they demand the last lane, not the first one.
if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
addToWorklistIfAllowed(&I);
// A uniform memory op is itself uniform. Load instructions are added
// to the worklist as they demand the first lane. Since store instructions
// demand the last lane, we instead add these to Uniforms only.
if (Legal->isUniformMemOp(I)) {
if (isa<LoadInst>(I))
addToWorklistIfAllowed(&I);
else if (!isOutOfScope(&I) && !isScalarWithPredication(&I))
Uniforms[VF].insert(&I);
}

if (isUniformDecision(&I, VF)) {
assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
@@ -7490,17 +7496,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// relying on instcombine to remove them.
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost;
if (isa<StoreInst>(&I) && VF.isScalable() &&
isLegalGatherOrScatter(&I)) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
"Cannot yet scalarize uniform stores");
Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
}
InstructionCost Cost = getUniformMemOpCost(&I, VF);
setWideningDecision(&I, VF, CM_Scalarize, Cost);
continue;
}

@@ -9858,6 +9855,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
return;
}

// If the instruction is a store to a uniform address, we only need to
// generate the last lane for the last UF part.
Instruction *I = getUnderlyingInstr();
if (State.VF.isVector() && IsUniform && isa<StoreInst>(I)) {
VPLane Lane = VPLane::getLastLaneForVF(State.VF);
State.ILV->scalarizeInstruction(
I, this, *this, VPIteration(State.UF - 1, Lane), IsPredicated, State);
return;
}

// Generate scalar instances for all VF lanes of all UF parts, unless the
// instruction is uniform in which case generate only the first lane for each
// of the UF parts.
@@ -9866,9 +9873,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
"Can't scalarize a scalable vector");
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
VPIteration(Part, Lane), IsPredicated,
State);
State.ILV->scalarizeInstruction(I, this, *this, VPIteration(Part, Lane),
IsPredicated, State);
}

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
126 changes: 122 additions & 4 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -4,12 +4,35 @@ target triple = "aarch64-unknown-linux-gnu"

define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
; CHECK-LABEL: @inv_store_i16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK: %[[TMP1:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i16*> %[[TMP1]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK: %[[VECLOAD:.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* %{{.*}}, align 2
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %[[VECLOAD]], <vscale x 4 x i16*> %[[SPLAT_PTRS]], i32 2
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP8]], 4
; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[TMP9]], 1
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
; CHECK-NEXT: store i16 [[TMP11]], i16* [[DST:%.*]], align 2
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
;
entry:
br label %for.body14

@@ -59,6 +82,98 @@ for.end: ; preds = %for.inc, %entry
ret void
}

define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) #0 {
; CHECK-LABEL: @uniform_store_i1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[START:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64*> poison, i64* [[START]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64*> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64*> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP6]]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[START]], <vscale x 2 x i64> [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i64, i64* [[START]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i64, i64* [[START]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, i64* [[NEXT_GEP2]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <vscale x 2 x i64>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, <vscale x 2 x i64*> [[NEXT_GEP]], i64 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 2 x i64*> [[TMP13]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 2
; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 [[TMP17]]
; CHECK-NEXT: store i1 [[TMP18]], i1* [[DST:%.*]], align 1
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
;
entry:
br label %for.body

for.body:
%first.sroa = phi i64* [ %incdec.ptr, %for.body ], [ %start, %entry ]
%iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
%iv.next = add i64 %iv, 1
%0 = load i64, i64* %first.sroa
%incdec.ptr = getelementptr inbounds i64, i64* %first.sroa, i64 1
%cmp.not = icmp eq i64* %incdec.ptr, %start
store i1 %cmp.not, i1* %dst
%cmp = icmp ult i64 %iv, %N
br i1 %cmp, label %for.body, label %end, !llvm.loop !6

end:
ret void
}

; Ensure conditional i1 stores do not vectorize
define void @cond_store_i1(i1* noalias %dst, i8* noalias %start, i32 %cond, i64 %N) #0 {
; CHECK-LABEL: @cond_store_i1(
; CHECK-NOT: vector.body
;
entry:
br label %for.body

for.body:
%first.sroa = phi i8* [ %incdec.ptr, %if.end ], [ null, %entry ]
%incdec.ptr = getelementptr inbounds i8, i8* %first.sroa, i64 1
%0 = load i8, i8* %incdec.ptr
%tobool.not = icmp eq i8 %0, 10
br i1 %tobool.not, label %if.end, label %if.then

if.then:
%cmp.store = icmp eq i8* %start, %incdec.ptr
store i1 %cmp.store, i1* %dst
br label %if.end

if.end:
%cmp.not = icmp eq i8* %incdec.ptr, %start
br i1 %cmp.not, label %for.end, label %for.body

for.end:
ret void
}

attributes #0 = { "target-features"="+neon,+sve" vscale_range(0, 16) }

!0 = distinct !{!0, !1, !2, !3, !4, !5}
@@ -68,3 +183,6 @@ attributes #0 = { "target-features"="+neon,+sve" vscale_range(0, 16) }
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
!5 = !{!"llvm.loop.interleave.count", i32 1}

!6 = distinct !{!6, !1, !7, !3, !4, !5}
!7 = !{!"llvm.loop.vectorize.width", i32 2}

4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll
@@ -1,5 +1,5 @@
; RUN: opt -S -loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
; RUN: opt -S -loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 -force-vector-width=4 < %s | FileCheck %s
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 -force-vector-width=4 < %s | FileCheck %s

define i32 @main(i32 %arg, i8** nocapture readnone %arg1) #0 {
;CHECK: vector.body:
28 changes: 7 additions & 21 deletions llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s

; This is a bugpoint reduction of a test from PR43582:
; https://bugs.llvm.org/show_bug.cgi?id=43582
@@ -62,25 +62,11 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP22]], zeroinitializer
; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP23]], zeroinitializer
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3
; CHECK-NEXT: store i32 [[TMP26]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1
; CHECK-NEXT: store i32 [[TMP27]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2
; CHECK-NEXT: store i32 [[TMP28]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3
; CHECK-NEXT: store i32 [[TMP29]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP25]], i32 0
; CHECK-NEXT: store i32 [[TMP30]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP25]], i32 1
; CHECK-NEXT: store i32 [[TMP31]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP25]], i32 2
; CHECK-NEXT: store i32 [[TMP32]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP25]], i32 3
; CHECK-NEXT: store i32 [[TMP33]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
@@ -91,11 +77,11 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 {
; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32
; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
; CHECK-NEXT: [[TMP35:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP35]] to i32
; CHECK-NEXT: [[TMP28:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP28]] to i32
; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
; CHECK-NEXT: [[TMP36:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
; CHECK-NEXT: [[CONV81:%.*]] = zext i8 undef to i32
@@ -84,17 +84,11 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
; CHECK-NEXT: store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
; CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
; CHECK-NEXT: store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
