[VPlan] Try to narrow wide and replicating recipes to uniform recipes.
Use the existing VPlan-based analysis to identify recipes that only have
their first lane demanded and transform them to uniform replicate
recipes. This simplifies the generated code in some places and prepares
for fixing llvm#122496.
fhahn committed Jan 12, 2025
1 parent 16aa400 commit 1afba19
Showing 4 changed files with 35 additions and 18 deletions.
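The test updates below show the practical effect of the narrowing: when only the first lane of a wide recipe is demanded, the lane-0 scalar computation replaces the full-width vector one. A minimal before/after sketch in LLVM IR, with hypothetical value names and constant, modeled on the scalable-assume.ll change further down:

; Before narrowing: the compare runs on the whole scalable vector and only
; lane 0 of the result feeds the assume.
%cmp.vec = fcmp ogt <vscale x 2 x float> %wide.load, splat (float 1.000000e+00)
%cmp.l0 = extractelement <vscale x 2 x i1> %cmp.vec, i32 0
tail call void @llvm.assume(i1 %cmp.l0)

; After narrowing: lane 0 is extracted first and the compare is emitted as a
; single scalar instruction, since no other lane is demanded.
%val.l0 = extractelement <vscale x 2 x float> %wide.load, i32 0
%cmp.scalar = fcmp ogt float %val.l0, 1.000000e+00
tail call void @llvm.assume(i1 %cmp.scalar)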
27 changes: 26 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -596,11 +596,36 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
if (!PhiR)
continue;

// Try to narrow wide and replicating recipes to uniform recipes, based on
// VPlan analysis.
// TODO: Apply to all recipes in the future, to replace legacy uniformity
// analysis.
auto Users = collectUsersRecursively(PhiR);
for (VPUser *U : reverse(Users)) {
auto *Def = dyn_cast<VPSingleDefRecipe>(U);
auto *RepR = dyn_cast<VPReplicateRecipe>(U);
// Skip recipes that shouldn't be narrowed.
if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
(RepR && (RepR->isUniform() || RepR->isPredicated())))
continue;

// Skip recipes that may have other lanes than their first used.
if (!vputils::isUniformAfterVectorization(Def) &&
!vputils::onlyFirstLaneUsed(Def))
continue;

auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
Def->operands(), /*IsUniform*/ true);
Clone->insertAfter(Def);
Def->replaceAllUsesWith(Clone);
}

// Check if any uniform VPReplicateRecipes using the phi recipe are used by
// ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to
// ensure the final value is available.
// TODO: Remove once uniformity analysis is done on VPlan.
for (VPUser *U : collectUsersRecursively(PhiR)) {
for (VPUser *U : Users) {
auto *ExitIRI = dyn_cast<VPIRInstruction>(U);
VPValue *Op;
if (!ExitIRI || !match(ExitIRI->getOperand(0),
@@ -132,8 +132,6 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[CONV6]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -142,9 +140,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP22:%.*]] = icmp ule <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP23:%.*]] = select <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i1> [[TMP22]], <vscale x 2 x i1> zeroinitializer
; CHECK-NEXT: [[TMP24:%.*]] = select <vscale x 2 x i1> [[TMP23]], <vscale x 2 x i64> [[BROADCAST_SPLAT2]], <vscale x 2 x i64> splat (i64 1)
; CHECK-NEXT: [[TMP25:%.*]] = sdiv <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = extractelement <vscale x 2 x i64> [[TMP25]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <vscale x 2 x i1> [[TMP23]], i32 0
; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP24]], i64 [[CONV6]], i64 1
; CHECK-NEXT: [[TMP26:%.*]] = sdiv i64 [[M]], [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP26]], [[CONV61]]
; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[TMP21]], [[TMP28]]
@@ -12,18 +12,12 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) {
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10)
; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
12 changes: 6 additions & 6 deletions llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -3,12 +3,12 @@
define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; CHECK-LABEL: @test1(
; CHECK: vector.body:
; CHECK: [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]])
; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]])
; CHECK: [[E1:%.*]] = extractelement <vscale x 2 x float> {{.+}}, i32 0
; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[E1]]
; CHECK-NEXT: [[E2:%.*]] = extractelement <vscale x 2 x float> {{.+}}, i32 0
; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt float [[E2]]
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2]])
entry:
br label %for.body

