[SLP]Represent externally used values as original scalars, if profitable. #100904
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Currently the SLP vectorizer tries to keep only GEPs as scalar if they are vectorized but used externally. The same approach can be used for all scalar values. This patch tries to keep the original scalars if all of their operands remain scalar or externally used, if the cost of the original scalar is lower than the cost of the extractelement instruction, or if the number of externally used scalars in the same entry is not a power of 2. The last criterion allows better revectorization for multiply used scalars.

Patch is 211.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100904.diff

51 Files Affected:
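To make the intent concrete before the diff, here is a hand-written IR sketch (not taken from the patch or its tests; the function and value names are illustrative): two adds are profitable to vectorize, but %a also has a scalar user. Previously the external use would be rewritten to an extractelement from the vector; with this change, when the original scalar add costs no more than the extract and its operands stay scalar, SLP clones the scalar add instead.

; Sketch: the two stores vectorize into one <2 x i32> store, while the
; return keeps using %a. Under this patch, SLP may keep a scalar clone
; of %a rather than extracting lane 0 from the vectorized values.
define i32 @ext_use(i32 %x, i32 %y, ptr %p) {
  %a = add nsw i32 %x, 1
  %b = add nsw i32 %y, 1
  store i32 %a, ptr %p, align 4
  %q = getelementptr inbounds i32, ptr %p, i64 1
  store i32 %b, ptr %q, align 4
  ret i32 %a
}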
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 6501a14d87789..1279f02bd7ddd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1233,7 +1233,7 @@ class BoUpSLP {
NonScheduledFirst.clear();
EntryToLastInstruction.clear();
ExternalUses.clear();
- ExternalUsesAsGEPs.clear();
+ ExternalUsesAsOriginalScalar.clear();
for (auto &Iter : BlocksSchedules) {
BlockScheduling *BS = Iter.second.get();
BS->clear();
@@ -3434,7 +3434,7 @@ class BoUpSLP {
/// A list of GEPs which can be reaplced by scalar GEPs instead of
/// extractelement instructions.
- SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
+ SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
/// Values used only by @llvm.assume calls.
SmallPtrSet<const Value *, 32> EphValues;
@@ -10503,6 +10503,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
SmallDenseSet<Value *, 4> UsedInserts;
DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
+ DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -10611,52 +10612,90 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
}
}
}
- // Leave the GEPs as is, they are free in most cases and better to keep them
- // as GEPs.
+
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
+ InstructionCost ExtraCost = TTI::TCC_Free;
+ auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
+ const TreeEntry *Entry = getTreeEntry(EU.Scalar);
+ auto It = MinBWs.find(Entry);
+ if (It != MinBWs.end()) {
+ auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
+ unsigned Extend =
+ It->second.second ? Instruction::SExt : Instruction::ZExt;
+ VecTy = getWidenedType(MinTy, BundleWidth);
+ ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
+ } else {
+ ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ CostKind, EU.Lane);
+ }
+ // Leave the scalar instructions as is if they are cheaper than extracts.
+ if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
+ Entry->getOpcode() == Instruction::Load) {
if (!ValueToExtUses) {
ValueToExtUses.emplace();
for_each(enumerate(ExternalUses), [&](const auto &P) {
+ // Ignore phis in loops.
+ if (auto *Phi = dyn_cast_if_present<PHINode>(P.value().User)) {
+ auto *I = cast<Instruction>(P.value().Scalar);
+ const Loop *L = LI->getLoopFor(Phi->getParent());
+ if (L && (Phi->getParent() == I->getParent() ||
+ L == LI->getLoopFor(I->getParent())))
+ return;
+ }
+
ValueToExtUses->try_emplace(P.value().Scalar, P.index());
});
}
- // Can use original GEP, if no operands vectorized or they are marked as
- // externally used already.
- bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
- if (!getTreeEntry(V))
- return true;
- auto It = ValueToExtUses->find(V);
- if (It != ValueToExtUses->end()) {
- // Replace all uses to avoid compiler crash.
- ExternalUses[It->second].User = nullptr;
+ // Can use original instruction, if no operands vectorized or they are
+ // marked as externally used already.
+ auto *Inst = cast<Instruction>(EU.Scalar);
+ bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
+ if (!getTreeEntry(V)) {
+ // Some extractelements might be not vectorized, but
+ // transformed into shuffle and removed from the function,
+ // consider it here.
+ if (auto *EE = dyn_cast<ExtractElementInst>(V))
+ return !EE->hasOneUse() || !MustGather.contains(EE);
return true;
}
- return false;
+ return ValueToExtUses->contains(V);
});
- if (CanBeUsedAsGEP) {
- ExtractCost += TTI->getInstructionCost(GEP, CostKind);
- ExternalUsesAsGEPs.insert(EU.Scalar);
- continue;
+ if (CanBeUsedAsScalar) {
+ InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
+ bool KeepScalar = ScalarCost <= ExtraCost;
+ if (KeepScalar && ScalarCost != TTI::TCC_Free &&
+ ExtraCost - ScalarCost <= TTI::TCC_Basic) {
+ unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
+ return ValueToExtUses->contains(V);
+ });
+ auto It = ExtractsCount.find(Entry);
+ if (It != ExtractsCount.end())
+ ScalarUsesCount -= It->getSecond().size();
+ // Keep original scalar if number of externally used instructions in
+ // the same entry is not power of 2. It may help to do some extra
+ // vectorization for now.
+ KeepScalar = ScalarUsesCount <= 1 || !isPowerOf2_32(ScalarUsesCount);
+ }
+ if (KeepScalar) {
+ ExternalUsesAsOriginalScalar.insert(EU.Scalar);
+ for_each(Inst->operands(), [&](Value *V) {
+ auto It = ValueToExtUses->find(V);
+ if (It != ValueToExtUses->end()) {
+ // Replace all uses to avoid compiler crash.
+ ExternalUses[It->second].User = nullptr;
+ }
+ });
+ ExtraCost = ScalarCost;
+ ExtractsCount[Entry].insert(Inst);
+ }
}
}
- // If we plan to rewrite the tree in a smaller type, we will need to sign
- // extend the extracted value back to the original type. Here, we account
- // for the extract and the added cost of the sign extend if needed.
- auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
- auto It = MinBWs.find(getTreeEntry(EU.Scalar));
- if (It != MinBWs.end()) {
- auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
- unsigned Extend =
- It->second.second ? Instruction::SExt : Instruction::ZExt;
- VecTy = getWidenedType(MinTy, BundleWidth);
- ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
- VecTy, EU.Lane);
- } else {
- ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- CostKind, EU.Lane);
- }
+ ExtractCost += ExtraCost;
}
// Add reduced value cost, if resized.
if (!VectorizedVals.empty()) {
@@ -13856,8 +13895,7 @@ Value *BoUpSLP::vectorizeTree(
DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
// Maps extract Scalar to the corresponding extractelement instruction in the
// basic block. Only one extractelement per block should be emitted.
- DenseMap<Value *,
- DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
+ DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
ScalarToEEs;
SmallDenseSet<Value *, 4> UsedInserts;
DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
@@ -13887,45 +13925,44 @@ Value *BoUpSLP::vectorizeTree(
if (Scalar->getType() != Vec->getType()) {
Value *Ex = nullptr;
Value *ExV = nullptr;
- auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
- bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
+ auto *Inst = dyn_cast<Instruction>(Scalar);
+ bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
auto It = ScalarToEEs.find(Scalar);
if (It != ScalarToEEs.end()) {
// No need to emit many extracts, just move the only one in the
// current block.
- auto EEIt = It->second.find(Builder.GetInsertBlock());
+ auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
+ : Builder.GetInsertBlock());
if (EEIt != It->second.end()) {
- Instruction *I = EEIt->second.first;
- if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
+ Value *PrevV = EEIt->second.first;
+ if (auto *I = dyn_cast<Instruction>(PrevV);
+ I && !ReplaceInst &&
+ Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
Builder.GetInsertPoint()->comesBefore(I)) {
I->moveBefore(*Builder.GetInsertPoint()->getParent(),
Builder.GetInsertPoint());
- if (auto *CI = EEIt->second.second)
+ if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
CI->moveAfter(I);
}
- Ex = I;
+ Ex = PrevV;
ExV = EEIt->second.second ? EEIt->second.second : Ex;
}
}
if (!Ex) {
// "Reuse" the existing extract to improve final codegen.
- if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ if (ReplaceInst) {
+ // Leave the instruction as is, if it cheaper extracts and all
+ // operands are scalar.
+ auto *CloneInst = Inst->clone();
+ CloneInst->insertBefore(Inst);
+ if (Inst->hasName())
+ CloneInst->takeName(Inst);
+ Ex = CloneInst;
+ } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
Value *V = ES->getVectorOperand();
if (const TreeEntry *ETE = getTreeEntry(V))
V = ETE->VectorizedValue;
Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
- } else if (ReplaceGEP) {
- // Leave the GEPs as is, they are free in most cases and better to
- // keep them as GEPs.
- auto *CloneGEP = GEP->clone();
- if (isa<Instruction>(Vec))
- CloneGEP->insertBefore(*Builder.GetInsertBlock(),
- Builder.GetInsertPoint());
- else
- CloneGEP->insertBefore(GEP);
- if (GEP->hasName())
- CloneGEP->takeName(GEP);
- Ex = CloneGEP;
} else {
Ex = Builder.CreateExtractElement(Vec, Lane);
}
@@ -13935,14 +13972,15 @@ Value *BoUpSLP::vectorizeTree(
if (Scalar->getType() != Ex->getType())
ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
MinBWs.find(E)->second.second);
- if (auto *I = dyn_cast<Instruction>(Ex))
- ScalarToEEs[Scalar].try_emplace(
- Builder.GetInsertBlock(),
- std::make_pair(I, cast<Instruction>(ExV)));
+ auto *I = dyn_cast<Instruction>(Ex);
+ ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
+ : &F->getEntryBlock(),
+ std::make_pair(Ex, ExV));
}
// The then branch of the previous if may produce constants, since 0
// operand might be a constant.
- if (auto *ExI = dyn_cast<Instruction>(Ex)) {
+ if (auto *ExI = dyn_cast<Instruction>(Ex);
+ ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
GatherShuffleExtractSeq.insert(ExI);
CSEBlocks.insert(ExI->getParent());
}
@@ -13963,9 +14001,10 @@ Value *BoUpSLP::vectorizeTree(
continue;
assert((ExternallyUsedValues.count(Scalar) ||
Scalar->hasNUsesOrMore(UsesLimit) ||
+ ExternalUsesAsOriginalScalar.contains(Scalar) ||
any_of(Scalar->users(),
[&](llvm::User *U) {
- if (ExternalUsesAsGEPs.contains(U))
+ if (ExternalUsesAsOriginalScalar.contains(U))
return true;
TreeEntry *UseEntry = getTreeEntry(U);
return UseEntry &&
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
index d4e3fb3e24853..0d6eb7b5e08aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-non-inst-use.ll
@@ -4,8 +4,9 @@
define i16 @foo(ptr %p1, ptr %p2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store i32 0, ptr [[P1:%.*]], align 1
-; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 0, 0
+; CHECK-NEXT: [[CONST:%.*]] = bitcast i32 0 to i32
+; CHECK-NEXT: store i32 [[CONST]], ptr [[P1:%.*]], align 1
+; CHECK-NEXT: [[CONST_MAT:%.*]] = or i32 [[CONST]], 0
; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr [[P2:%.*]], align 1
; CHECK-NEXT: ret i16 0
;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 70cdd08548b2d..8f6d5d8f2d7ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -1236,20 +1236,20 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
; CHECK: bb22:
; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
+; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT: store float [[TMP4]], ptr [[T25]], align 4
+; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4
; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], <float 2.000000e+01, float 2.000000e+01>
; CHECK-NEXT: br label [[BB30]]
; CHECK: bb30:
-; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP4]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[BB36:%.*]]
; CHECK: bb36:
-; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], <float 3.000000e+00, float 3.000000e+00>
-; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], <float 3.000000e+00, float 3.000000e+00>
+; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[ARG_3]], align 4
; CHECK-NEXT: br label [[BB41:%.*]]
; CHECK: bb41:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index f85f658fed4d5..d89d628670360 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -29,151 +29,215 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,...
[truncated]
Ping!

Ping!
; FORCED: bb5:
; FORCED-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ [[TMP4]], [[BB4]] ]
; FORCED-NEXT: ret void
;
bb:
br label %bb1
revert this change? it looks NFC
; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32>
; CHECK-NEXT: store <4 x i32> [[TMP21]], ptr [[ARRAYIDX1]], align 4
NFC?
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]])
; CHECK-NEXT: [[OP_RDX16:%.*]] = add i64 [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[OP_RDX16:%.*]] = add i64 [[TMP7]], [[TMP8]]
NFC?
Created using spr 1.3.5
Ping!
LGTM
Hi @alexey-bataev,

Under the latest commit of LLVM 19.1.3, I bisected a performance regression in some SPEC CPU tests to the introduction of commit 32994cc; from local testing, the regression was later fixed by this pull request. Would it be possible to backport this pull request (b10ecfa) and its dependent changes to LLVM 19, on the grounds of improved performance? I am asking for your help since I am not personally familiar with the SLP vectorizer code. To provide motivation, I identified the following series of commits, which allow this pull request to be cleanly picked into LLVM 19 at ab51ecc:

Using the above set of commits, all LIT tests pass, and the results demonstrate a measurable improvement, back to LLVM 18 levels, for the two affected SPEC CPU tests under LLVM 19. If any additional information or testing is desired, please let me know; I'd be happy to provide it. Thank you!
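For readers unfamiliar with the flow being proposed here, a backport of this shape would look roughly as follows. This is a sketch, not the exact sequence used: the dependent-commit hashes are placeholders for the series referenced above (which is not reproduced here), and only ab51ecc and b10ecfa are named in this thread.

# Rough sketch of the proposed backport; <dep-N> are placeholders for
# the dependent commits referenced above. ab51ecc is the LLVM 19 base
# commit, b10ecfa is this pull request's commit.
git checkout -b slp-backport-pr100904 ab51ecc
git cherry-pick <dep-1> <dep-2> <dep-3> b10ecfa

# Re-run the SLP vectorizer LIT tests to confirm the picks are clean.
build/bin/llvm-lit -sv llvm/test/Transforms/SLPVectorizer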
Hi, I rather doubt it is a good idea to push so many commits to fix a single regression. Better to ask the release maintainers whether it is worth doing for the 19.x release or better to wait for 20.
Hi @alexey-bataev,

That is understandable; thank you for your timely response and suggestion. I will ping the current release manager here, who, according to https://llvm.org/docs/HowToReleaseLLVM.html#official-testing, should be Tobias Hieta for LLVM 19.

Hi @tru,

Per Alexey's response above, could you please share your opinion on whether it would be worth cherry-picking the commits I've listed into LLVM 19 to resolve a performance regression in particular SPEC CPU tests? We at Sony Interactive Entertainment understand there are quite a few users of the SPEC benchmark who may also notice this regression in their internal testing; however, from our point of view, we are by no means pushing for this, we just think it would be overall beneficial. Please let me know what you think. Thank you!
Hi @tru,

Just following up on your thoughts on #100904 (comment). Thank you!
Currently the SLP vectorizer tries to keep only GEPs as scalar if they are
vectorized but used externally. The same approach can be used for all scalar
values. This patch tries to keep the original scalars if all of their operands
remain scalar or externally used, if the cost of the original scalar is
lower than the cost of the extractelement instruction, or if the number
of externally used scalars in the same entry is not a power of 2. The last
criterion allows better revectorization for multiply used scalars.
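Restating the cost heuristic above as a standalone C++ sketch (simplified and renamed by hand; the actual logic lives in BoUpSLP::getTreeCost and uses TTI cost kinds rather than plain integers):

#include <cstdint>

// Returns true when N is a power of 2 (0 is not).
static bool isPowerOf2(uint32_t N) { return N != 0 && (N & (N - 1)) == 0; }

// Simplified model of the keep-scalar decision: the scalar is kept when
// re-emitting it is no more expensive than extracting its lane from the
// vectorized tree. When the saving is marginal (at most one basic op)
// and the scalar is not free, the scalar is still extracted if the
// remaining externally used scalars of the same tree entry form a
// power-of-2 group, since such a group of extracts can itself be
// revectorized later.
bool shouldKeepOriginalScalar(int ScalarCost, int ExtractCost,
                              uint32_t RemainingExternalUses) {
  bool KeepScalar = ScalarCost <= ExtractCost;
  if (KeepScalar && ScalarCost != 0 && ExtractCost - ScalarCost <= 1)
    KeepScalar =
        RemainingExternalUses <= 1 || !isPowerOf2(RemainingExternalUses);
  return KeepScalar;
}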