From 4ac2721e51131b3a160fee5ae0fcbd695d090e86 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 9 Apr 2024 16:36:08 +0100 Subject: [PATCH] [AArch64] Add costs for ST3 and ST4 instructions, modelled as store(shuffle). (#87934) This tries to add some costs for the shuffle in an ST3/ST4 instruction, which is represented in LLVM IR as store(interleaving shuffle). In order to detect the store, it needs to add a CxtI context instruction to check the users of the shuffle. ST3 and ST4 are added; ST2 should be a zip1 shuffle, which will be added in another patch. It should help fix some of the regressions from #87510. --- .../llvm/Analysis/TargetTransformInfo.h | 26 +++++------ .../llvm/Analysis/TargetTransformInfoImpl.h | 44 +++++++++++-------- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 6 +-- .../AArch64/AArch64TargetTransformInfo.cpp | 28 ++++++++---- .../AArch64/AArch64TargetTransformInfo.h | 3 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 3 +- .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 +- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 3 +- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3 +- .../Hexagon/HexagonTargetTransformInfo.cpp | 3 +- .../Hexagon/HexagonTargetTransformInfo.h | 3 +- .../Target/PowerPC/PPCTargetTransformInfo.cpp | 3 +- .../Target/PowerPC/PPCTargetTransformInfo.h | 3 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 3 +- .../Target/RISCV/RISCVTargetTransformInfo.h | 3 +- .../SystemZ/SystemZTargetTransformInfo.cpp | 10 ++--- .../SystemZ/SystemZTargetTransformInfo.h | 3 +- .../lib/Target/X86/X86TargetTransformInfo.cpp | 10 ++--- llvm/lib/Target/X86/X86TargetTransformInfo.h | 3 +- .../Transforms/Vectorize/VectorCombine.cpp | 5 ++- .../CostModel/AArch64/shuffle-store.ll | 42 +++++++++--------- 22 files changed, 121 insertions(+), 92 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fa9392b86c15b9f..58c69ac939763a4 
100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1291,12 +1291,11 @@ class TargetTransformInfo { /// passed through \p Args, which helps improve the cost estimation in some /// cases, like in broadcast loads. /// NOTE: For subvector extractions Tp represents the source type. - InstructionCost - getShuffleCost(ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask = std::nullopt, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, - int Index = 0, VectorType *SubTp = nullptr, - ArrayRef Args = std::nullopt) const; + InstructionCost getShuffleCost( + ShuffleKind Kind, VectorType *Tp, ArrayRef Mask = std::nullopt, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, int Index = 0, + VectorType *SubTp = nullptr, ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr) const; /// Represents a hint about the context in which a cast is used. /// @@ -2008,11 +2007,10 @@ class TargetTransformInfo::Concept { const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0; - virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, - TTI::TargetCostKind CostKind, - int Index, VectorType *SubTp, - ArrayRef Args) = 0; + virtual InstructionCost + getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, + ArrayRef Args, const Instruction *CxtI) = 0; virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, TTI::TargetCostKind CostKind, @@ -2647,8 +2645,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args) override { - return Impl.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); + ArrayRef Args, + const Instruction *CxtI) override { + return Impl.getShuffleCost(Kind, Tp, Mask, 
CostKind, Index, SubTp, Args, + CxtI); } InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 63c2ef8912b29cf..5b40e49714069fc 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -579,10 +579,12 @@ class TargetTransformInfoImplBase { return InstructionCost::getInvalid(); } - InstructionCost - getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, - TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt) const { + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, + VectorType *SubTp, + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr) const { return 1; } @@ -1341,13 +1343,13 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { if (Shuffle->isExtractSubvectorMask(SubIndex)) return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy, Mask, CostKind, SubIndex, VecTy, - Operands); + Operands, Shuffle); if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) return TargetTTI->getShuffleCost( TTI::SK_InsertSubvector, VecTy, Mask, CostKind, SubIndex, FixedVectorType::get(VecTy->getScalarType(), NumSubElts), - Operands); + Operands, Shuffle); int ReplicationFactor, VF; if (Shuffle->isReplicationMask(ReplicationFactor, VF)) { @@ -1374,7 +1376,7 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { return TargetTTI->getShuffleCost( IsUnary ? 
TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, VecTy, - AdjustMask, CostKind, 0, nullptr); + AdjustMask, CostKind, 0, nullptr, {}, Shuffle); } // Narrowing shuffle - perform shuffle at original wider width and @@ -1383,13 +1385,13 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { InstructionCost ShuffleCost = TargetTTI->getShuffleCost( IsUnary ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, - VecSrcTy, AdjustMask, CostKind, 0, nullptr); + VecSrcTy, AdjustMask, CostKind, 0, nullptr, {}, Shuffle); SmallVector ExtractMask(Mask.size()); std::iota(ExtractMask.begin(), ExtractMask.end(), 0); - return ShuffleCost + TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, - VecSrcTy, ExtractMask, - CostKind, 0, VecTy); + return ShuffleCost + TargetTTI->getShuffleCost( + TTI::SK_ExtractSubvector, VecSrcTy, + ExtractMask, CostKind, 0, VecTy, {}, Shuffle); } if (Shuffle->isIdentity()) @@ -1397,35 +1399,39 @@ class TargetTransformInfoImplCRTPBase : public TargetTransformInfoImplBase { if (Shuffle->isReverse()) return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, Mask, CostKind, - 0, nullptr, Operands); + 0, nullptr, Operands, Shuffle); if (Shuffle->isSelect()) return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, Mask, CostKind, - 0, nullptr, Operands); + 0, nullptr, Operands, Shuffle); if (Shuffle->isTranspose()) return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, Mask, - CostKind, 0, nullptr, Operands); + CostKind, 0, nullptr, Operands, + Shuffle); if (Shuffle->isZeroEltSplat()) return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, Mask, - CostKind, 0, nullptr, Operands); + CostKind, 0, nullptr, Operands, + Shuffle); if (Shuffle->isSingleSource()) return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, - CostKind, 0, nullptr, Operands); + CostKind, 0, nullptr, Operands, + Shuffle); if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) return TargetTTI->getShuffleCost( TTI::SK_InsertSubvector, VecTy, Mask, 
CostKind, SubIndex, - FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands); + FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands, + Shuffle); if (Shuffle->isSplice(SubIndex)) return TargetTTI->getShuffleCost(TTI::SK_Splice, VecTy, Mask, CostKind, - SubIndex, nullptr, Operands); + SubIndex, nullptr, Operands, Shuffle); return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, Mask, - CostKind, 0, nullptr, Operands); + CostKind, 0, nullptr, Operands, Shuffle); } case Instruction::ExtractElement: { auto *EEI = dyn_cast(U); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 2a5638dd1d3c6c8..06a19c75cf873a3 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1018,7 +1018,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt) { + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr) { switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) { case TTI::SK_Broadcast: if (auto *FVT = dyn_cast(Tp)) diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 5f933b4587843cb..33c899fe8899907 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -916,9 +916,9 @@ InstructionCost TargetTransformInfo::getAltInstrCost( InstructionCost TargetTransformInfo::getShuffleCost( ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args) const { - InstructionCost Cost = - TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind, Index, SubTp, Args); + ArrayRef Args, const Instruction *CxtI) const { + InstructionCost Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind, + Index, SubTp, Args, CxtI); assert(Cost >= 0 && "TTI should not produce negative 
costs!"); return Cost; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index ee7137b92445bb1..fc48338628b3af5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3815,18 +3815,29 @@ InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { return LegalizationCost * LT.first; } -InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, - VectorType *Tp, - ArrayRef Mask, - TTI::TargetCostKind CostKind, - int Index, VectorType *SubTp, - ArrayRef Args) { +InstructionCost AArch64TTIImpl::getShuffleCost( + TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, + ArrayRef Args, const Instruction *CxtI) { std::pair LT = getTypeLegalizationCost(Tp); + // If we have a Mask, and the LT is being legalized somehow, split the Mask // into smaller vectors and sum the cost of each shuffle. if (!Mask.empty() && isa(Tp) && LT.second.isVector() && Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { + + // Check for ST3/ST4 instructions, which are represented in llvm IR as + // store(interleaving-shuffle). The shuffle cost could potentially be free, + // but we model it with a cost of LT.first so that ST3/ST4 have a higher + // cost than just the store. 
+ if (CxtI && CxtI->hasOneUse() && isa(*CxtI->user_begin()) && + (ShuffleVectorInst::isInterleaveMask( + Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) || + ShuffleVectorInst::isInterleaveMask( + Mask, 3, Tp->getElementCount().getKnownMinValue() * 2))) + return LT.first; + unsigned TpNumElts = Mask.size(); unsigned LTNumElts = LT.second.getVectorNumElements(); unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; @@ -3874,7 +3885,7 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (NumSources <= 2) Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, - NTp, NMask, CostKind, 0, nullptr, Args); + NTp, NMask, CostKind, 0, nullptr, Args, CxtI); else if (any_of(enumerate(NMask), [&](const auto &ME) { return ME.value() % LTNumElts == ME.index(); })) @@ -4055,7 +4066,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // Restore optimal kind. if (IsExtractSubvector) Kind = TTI::SK_ExtractSubvector; - return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args, + CxtI); } static bool containsDecreasingPointers(Loop *TheLoop, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index de39dea2be43e11..dba384481f6a349 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -393,7 +393,8 @@ class AArch64TTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 31077dbc0b2cc44..84320d296a037be 
100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1127,7 +1127,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args) { + ArrayRef Args, + const Instruction *CxtI) { Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp); // Treat extractsubvector as single op permutation. bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index cd8e9fd10bbf216..0dab3a982779435 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -234,7 +234,8 @@ class GCNTTIImpl final : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 3be894ad3bef2ce..ee87f7f0e555ef2 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1212,7 +1212,8 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args) { + ArrayRef Args, + const Instruction *CxtI) { Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); // Treat extractsubvector as single op permutation. 
bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index bb4b321b5300916..04b32194f806f65 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -220,7 +220,8 @@ class ARMTTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 458b8717256f240..f47fcff5d602596 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -230,7 +230,8 @@ InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, Type *SubTp, - ArrayRef Args) { + ArrayRef Args, + const Instruction *CxtI) { return 1; } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index fdb34f308e641e1..9689f2f5bb865c2 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -122,7 +122,8 @@ class HexagonTTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, Type *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 57e1019adb7410f..3fa35efc2d15916 100644 --- 
a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -607,7 +607,8 @@ InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, Type *SubTp, - ArrayRef Args) { + ArrayRef Args, + const Instruction *CxtI) { InstructionCost CostFactor = vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index c3ade9968c336a0..36006dd7df7396a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -112,7 +112,8 @@ class PPCTTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, Type *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index aeec06313c7535e..55637b8ea47f9c5 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -329,7 +329,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args) { + ArrayRef Args, + const Instruction *CxtI) { Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); std::pair LT = getTypeLegalizationCost(Tp); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index c0169ea1ad53767..e0c0e6517b6f1f9 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -146,7 +146,8 @@ class 
RISCVTTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 5bdbaf47064d6c0..17e534f405c082b 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -601,12 +601,10 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost( Args, CxtI); } -InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, - VectorType *Tp, - ArrayRef Mask, - TTI::TargetCostKind CostKind, - int Index, VectorType *SubTp, - ArrayRef Args) { +InstructionCost SystemZTTIImpl::getShuffleCost( + TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, + ArrayRef Args, const Instruction *CxtI) { Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp); if (ST->hasVector()) { unsigned NumVectors = getNumVectorRegs(Tp); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 2cccdf6d17dacf4..1d824d353d8fb1a 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -95,7 +95,8 @@ class SystemZTTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy); unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy); unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, diff --git 
a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 5d1810b5bc2c6f4..b466624e1334882 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1468,12 +1468,10 @@ X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, return InstructionCost::getInvalid(); } -InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, - VectorType *BaseTp, - ArrayRef Mask, - TTI::TargetCostKind CostKind, - int Index, VectorType *SubTp, - ArrayRef Args) { +InstructionCost X86TTIImpl::getShuffleCost( + TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, + ArrayRef Args, const Instruction *CxtI) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. std::pair LT = getTypeLegalizationCost(BaseTp); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 985b00438ce8783..8ef9b4f86ffd7c2 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -150,7 +150,8 @@ class X86TTIImpl : public BasicTTIImplBase { ArrayRef Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, - ArrayRef Args = std::nullopt); + ArrayRef Args = std::nullopt, + const Instruction *CxtI = nullptr); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 61e3f0ff55f7b2d..633b46e2dc8ba60 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1478,8 +1478,9 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) { TTI::CastContextHint::None, CostKind) + 
TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, TTI::CastContextHint::None, CostKind); - OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, - CastDstTy, Mask, CostKind); + OldCost += + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy, Mask, + CostKind, 0, nullptr, std::nullopt, &I); InstructionCost NewCost = TTI.getShuffleCost( TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, Mask, CostKind); diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll index ebf913ece3a9f83..12de334574f5cf8 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-store.ll @@ -85,33 +85,33 @@ define void @vst3(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <6 x i8> %v8i8, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <12 x i8> %v16i8, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <24 x i8> %v32i8, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <48 x i8> %v64i8, ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <6 x i16> %v8i16, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <12 x i16> %v16i16, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <24 x i16> %v32i16, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <48 x i16> %v64i16, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <6 x i32> %v8i32, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %v16i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <12 x i32> %v16i32, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <24 x i32> %v32i32, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <48 x i32> %v64i32, ptr %p, align 256 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <6 x i64> %v8i64, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <12 x i64> %v16i64, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %v32i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <24 x i64> %v32i64, ptr %p, align 256 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v64i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <48 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v64i64 = shufflevector <32 x i64> undef, <32 x i64> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: store <48 x i64> %v64i64, ptr %p, align 512 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -161,25 +161,25 @@ define void @vst4(ptr %p) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i8> %v8i8, ptr %p, align 8 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <16 x i8> %v16i8, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <32 x i8> %v32i8, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <64 x i8> %v64i8, ptr %p, align 64 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16 = 
shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x i16> %v8i16, ptr %p, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i16> %v16i16, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <32 x i16> %v32i16, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <64 x i16> %v64i16, ptr %p, align 128 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i32> %v8i32, ptr %p, align 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
store <16 x i32> %v16i32, ptr %p, align 64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: store <32 x i32> %v32i32, ptr %p, align 128 -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i32 = shufflevector <64 x i32> undef, <64 x i32> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <64 x i32> %v64i32, ptr %p, align 256 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <8 x i64> %v8i64, ptr %p, align 64