diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 69899fd8e66848..1192fbdc1c9d85 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -8457,7 +8457,8 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, case SVE::BI__builtin_sve_svlen_u64: { SVETypeFlags TF(Builtin->TypeModifier); auto VTy = cast(getSVEType(TF)); - auto NumEls = llvm::ConstantInt::get(Ty, VTy->getElementCount().Min); + auto *NumEls = + llvm::ConstantInt::get(Ty, VTy->getElementCount().getKnownMinValue()); Function *F = CGM.getIntrinsic(Intrinsic::vscale, Ty); return Builder.CreateMul(NumEls, Builder.CreateCall(F)); diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index d90cffb4bb95c4..8a85a24910e4e2 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -726,7 +726,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { { ASTContext::BuiltinVectorTypeInfo Info = CGM.getContext().getBuiltinVectorTypeInfo(BT); - unsigned NumElemsPerVG = (Info.EC.Min * Info.NumVectors) / 2; + unsigned NumElemsPerVG = (Info.EC.getKnownMinValue() * Info.NumVectors) / 2; // Debuggers can't extract 1bit from a vector, so will display a // bitpattern for svbool_t instead. diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 9c072d41607566..aede8a53ba9094 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -586,7 +586,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(cast(Ty)); return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), - Info.EC.Min * Info.NumVectors); + Info.EC.getKnownMinValue() * + Info.NumVectors); } case BuiltinType::Dependent: #define BUILTIN_TYPE(Id, SingletonId) diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index b07e9c3faf9d53..a9f326439a2a57 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -197,18 +197,58 @@ static syntax::NodeKind getOperatorNodeKind(const CXXOperatorCallExpr &E) { llvm_unreachable("Unknown OverloadedOperatorKind enum"); } +/// Get the start of the qualified name. In the examples below it gives the +/// location of the `^`: +/// `int ^a;` +/// `int *^a;` +/// `int ^a::S::f(){}` +static SourceLocation getQualifiedNameStart(NamedDecl *D) { + assert((isa(D)) && + "only DeclaratorDecl and TypedefNameDecl are supported."); + + auto DN = D->getDeclName(); + bool IsAnonymous = DN.isIdentifier() && !DN.getAsIdentifierInfo(); + if (IsAnonymous) + return SourceLocation(); + + if (const auto *DD = dyn_cast(D)) { + if (DD->getQualifierLoc()) { + return DD->getQualifierLoc().getBeginLoc(); + } + } + + return D->getLocation(); +} + +/// Gets the range of the initializer inside an init-declarator C++ [dcl.decl]. +/// `int a;` -> range of ``, +/// `int *a = nullptr` -> range of `= nullptr`. +/// `int a{}` -> range of `{}`. +/// `int a()` -> range of `()`. +static SourceRange getInitializerRange(Decl *D) { + if (auto *V = dyn_cast(D)) { + auto *I = V->getInit(); + // Initializers in range-based-for are not part of the declarator + if (I && !V->isCXXForRangeDecl()) + return I->getSourceRange(); + } + + return SourceRange(); +} + /// Gets the range of declarator as defined by the C++ grammar. E.g. 
/// `int a;` -> range of `a`, /// `int *a;` -> range of `*a`, /// `int a[10];` -> range of `a[10]`, /// `int a[1][2][3];` -> range of `a[1][2][3]`, /// `int *a = nullptr` -> range of `*a = nullptr`. -/// FIMXE: \p Name must be a source range, e.g. for `operator+`. +/// `int S::f(){}` -> range of `S::f()`. +/// FIXME: \p Name must be a source range. static SourceRange getDeclaratorRange(const SourceManager &SM, TypeLoc T, SourceLocation Name, SourceRange Initializer) { SourceLocation Start = GetStartLoc().Visit(T); - SourceLocation End = T.getSourceRange().getEnd(); + SourceLocation End = T.getEndLoc(); assert(End.isValid()); if (Name.isValid()) { if (Start.isInvalid()) @@ -378,11 +418,9 @@ class syntax::TreeBuilder { /// Returns true if \p D is the last declarator in a chain and is thus /// reponsible for creating SimpleDeclaration for the whole chain. - template - bool isResponsibleForCreatingDeclaration(const T *D) const { - static_assert((std::is_base_of::value || - std::is_base_of::value), - "only DeclaratorDecl and TypedefNameDecl are supported."); + bool isResponsibleForCreatingDeclaration(const Decl *D) const { + assert((isa(D)) && + "only DeclaratorDecl and TypedefNameDecl are supported."); const Decl *Next = D->getNextDeclInContext(); @@ -390,15 +428,14 @@ class syntax::TreeBuilder { if (Next == nullptr) { return true; } - const auto *NextT = dyn_cast(Next); // Next sibling is not the same type, this one is responsible. - if (NextT == nullptr) { + if (D->getKind() != Next->getKind()) { return true; } // Next sibling doesn't begin at the same loc, it must be a different // declaration, so this declarator is responsible. - if (NextT->getBeginLoc() != D->getBeginLoc()) { + if (Next->getBeginLoc() != D->getBeginLoc()) { return true; } @@ -1405,43 +1442,12 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } private: - template SourceLocation getQualifiedNameStart(T *D) { - static_assert((std::is_base_of::value || - std::is_base_of::value), - "only DeclaratorDecl and TypedefNameDecl are supported."); - - auto DN = D->getDeclName(); - bool IsAnonymous = DN.isIdentifier() && !DN.getAsIdentifierInfo(); - if (IsAnonymous) - return SourceLocation(); - - if (const auto *DD = dyn_cast(D)) { - if (DD->getQualifierLoc()) { - return DD->getQualifierLoc().getBeginLoc(); - } - } - - return D->getLocation(); - } - - SourceRange getInitializerRange(Decl *D) { - if (auto *V = dyn_cast(D)) { - auto *I = V->getInit(); - // Initializers in range-based-for are not part of the declarator - if (I && !V->isCXXForRangeDecl()) - return I->getSourceRange(); - } - - return SourceRange(); - } - /// Folds SimpleDeclarator node (if present) and in case this is the last /// declarator in the chain it also folds SimpleDeclaration node. template bool processDeclaratorAndDeclaration(T *D) { - SourceRange Initializer = getInitializerRange(D); - auto Range = getDeclaratorRange(Builder.sourceManager(), - D->getTypeSourceInfo()->getTypeLoc(), - getQualifiedNameStart(D), Initializer); + auto Range = getDeclaratorRange( + Builder.sourceManager(), D->getTypeSourceInfo()->getTypeLoc(), + getQualifiedNameStart(D), getInitializerRange(D)); // There doesn't have to be a declarator (e.g. `void foo(int)` only has // declaration, but no declarator). @@ -1464,10 +1470,8 @@ class BuildTreeVisitor : public RecursiveASTVisitor { auto ReturnedType = L.getReturnLoc(); // Build node for the declarator, if any. 
- auto ReturnDeclaratorRange = - getDeclaratorRange(this->Builder.sourceManager(), ReturnedType, - /*Name=*/SourceLocation(), - /*Initializer=*/SourceLocation()); + auto ReturnDeclaratorRange = SourceRange(GetStartLoc().Visit(ReturnedType), + ReturnedType.getEndLoc()); syntax::SimpleDeclarator *ReturnDeclarator = nullptr; if (ReturnDeclaratorRange.isValid()) { ReturnDeclarator = new (allocator()) syntax::SimpleDeclarator; diff --git a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp index a07187e22e930c..aab20008a49748 100644 --- a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp @@ -3123,6 +3123,35 @@ SimpleDeclaration )txt"})); } +TEST_P(SyntaxTreeTest, OutOfLineMemberFunctionDefinition) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct S { + void f(); +}; +[[void S::f(){}]] +)cpp", + {R"txt( +SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-NestedNameSpecifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'S' +| | `-'::' ListDelimiter +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen +`-CompoundStatement + |-'{' OpenParen + `-'}' CloseParen +)txt"})); +} + TEST_P(SyntaxTreeTest, ConversionMemberFunction) { if (!GetParam().isCXX()) { return; @@ -3792,6 +3821,53 @@ TranslationUnit Detached )txt")); } +TEST_P(SyntaxTreeTest, InitDeclarator_Brace) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +int a {}; +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'a' + | `-UnknownExpression + | `-UnknownExpression + | |-'{' + | `-'}' + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, InitDeclarator_Paren) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct S { + S(int); +}; +[[S s(1);]] +)cpp", + {R"txt( +SimpleDeclaration +|-'S' +|-SimpleDeclarator Declarator +| `-UnknownExpression +| |-'s' +| |-'(' +| |-IntegerLiteralExpression +| | `-'1' LiteralToken +| `-')' +`-';' +)txt"})); +} + TEST_P(SyntaxTreeTest, ArrayDeclarator_Simple) { EXPECT_TRUE(treeDumpEqual( R"cpp( diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index a3e624842700b5..ffbec74c61d029 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -130,8 +130,8 @@ class IntrinsicCostAttributes { unsigned Factor); IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, ElementCount Factor) - : IntrinsicCostAttributes(Id, CI, Factor.Min) { - assert(!Factor.Scalable); + : IntrinsicCostAttributes(Id, CI, Factor.getKnownMinValue()) { + assert(!Factor.isScalable()); } IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 527bba67b2579d..074960e7ced203 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -115,7 +115,7 @@ struct VFShape { Parameters.push_back( VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate})); - return {EC.Min, EC.Scalable, Parameters}; + return {EC.getKnownMinValue(), EC.isScalable(), Parameters}; } /// Sanity check on the Parameters in the VFShape. 
bool hasValidParameterList() const; diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index a8a436337e0756..1d2251dbededb6 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -202,6 +202,11 @@ class ReachingDefAnalysis : public MachineFunctionPass { void getGlobalUses(MachineInstr *MI, int PhysReg, InstSet &Uses) const; + /// Collect all possible definitions of the value stored in PhysReg, which is + /// used by MI. + void getGlobalReachingDefs(MachineInstr *MI, int PhysReg, + InstSet &Defs) const; + /// Return whether From can be moved forwards to just before To. bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const; diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h index db8161caf7d2e9..2e172bf422537b 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.h +++ b/llvm/include/llvm/CodeGen/ValueTypes.h @@ -304,7 +304,7 @@ namespace llvm { /// Given a vector type, return the minimum number of elements it contains. unsigned getVectorMinNumElements() const { - return getVectorElementCount().Min; + return getVectorElementCount().getKnownMinValue(); } /// Return the size of the specified value type in bits. @@ -383,7 +383,7 @@ namespace llvm { EVT getHalfNumVectorElementsVT(LLVMContext &Context) const { EVT EltVT = getVectorElementType(); auto EltCnt = getVectorElementCount(); - assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!"); + assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!"); return EVT::getVectorVT(Context, EltVT, EltCnt / 2); } @@ -398,7 +398,8 @@ namespace llvm { EVT getPow2VectorType(LLVMContext &Context) const { if (!isPow2VectorType()) { ElementCount NElts = getVectorElementCount(); - NElts.Min = 1 << Log2_32_Ceil(NElts.Min); + unsigned NewMinCount = 1 << Log2_32_Ceil(NElts.getKnownMinValue()); + NElts = ElementCount::get(NewMinCount, NElts.isScalable()); return EVT::getVectorVT(Context, getVectorElementType(), NElts); } else { diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 579275ab1f8222..a58f381300161f 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -696,9 +696,9 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const { case Type::ScalableVectorTyID: { VectorType *VTy = cast(Ty); auto EltCnt = VTy->getElementCount(); - uint64_t MinBits = EltCnt.Min * - getTypeSizeInBits(VTy->getElementType()).getFixedSize(); - return TypeSize(MinBits, EltCnt.Scalable); + uint64_t MinBits = EltCnt.getKnownMinValue() * + getTypeSizeInBits(VTy->getElementType()).getFixedSize(); + return TypeSize(MinBits, EltCnt.isScalable()); } default: llvm_unreachable("DataLayout::getTypeSizeInBits(): Unsupported type"); diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 1837f6808f24dc..25ece7a060fd45 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -426,16 +426,16 @@ class VectorType : public Type { unsigned getNumElements() const { ElementCount EC = getElementCount(); #ifdef STRICT_FIXED_SIZE_VECTORS - assert(!EC.Scalable && + assert(!EC.isScalable() && "Request for fixed number of elements from scalable vector"); - return EC.Min; + return EC.getKnownMinValue(); #else - if (EC.Scalable) + if (EC.isScalable()) WithColor::warning() << "The code that requested the fixed number of elements has made " "the assumption that this 
vector is not scalable. This assumption " "was not correct, and this may lead to broken code\n"; - return EC.Min; + return EC.getKnownMinValue(); #endif } @@ -512,8 +512,8 @@ class VectorType : public Type { /// input type and the same element type. static VectorType *getHalfElementsVectorType(VectorType *VTy) { auto EltCnt = VTy->getElementCount(); - assert ((EltCnt.Min & 1) == 0 && - "Cannot halve vector with odd number of elements."); + assert(EltCnt.isKnownEven() && + "Cannot halve vector with odd number of elements."); return VectorType::get(VTy->getElementType(), EltCnt/2); } @@ -521,7 +521,8 @@ class VectorType : public Type { /// input type and the same element type. static VectorType *getDoubleElementsVectorType(VectorType *VTy) { auto EltCnt = VTy->getElementCount(); - assert((EltCnt.Min * 2ull) <= UINT_MAX && "Too many elements in vector"); + assert((EltCnt.getKnownMinValue() * 2ull) <= UINT_MAX && + "Too many elements in vector"); return VectorType::get(VTy->getElementType(), EltCnt * 2); } diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index ac7ce75a9f310b..7b41dced564d4c 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -2046,8 +2046,9 @@ class ShuffleVectorInst : public Instruction { /// Examples: shufflevector <4 x n> A, <4 x n> B, <1,2,3> /// shufflevector <4 x n> A, <4 x n> B, <1,2,3,4,5> bool changesLength() const { - unsigned NumSourceElts = - cast(Op<0>()->getType())->getElementCount().Min; + unsigned NumSourceElts = cast(Op<0>()->getType()) + ->getElementCount() + .getKnownMinValue(); unsigned NumMaskElts = ShuffleMask.size(); return NumSourceElts != NumMaskElts; } diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h index 172d4fd8c27530..d88efcef731f7a 100644 --- a/llvm/include/llvm/Support/MachineValueType.h +++ b/llvm/include/llvm/Support/MachineValueType.h @@ -424,7 +424,7 @@ namespace llvm { MVT getHalfNumVectorElementsVT() const { MVT EltVT = getVectorElementType(); auto EltCnt = getVectorElementCount(); - assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!"); + assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!"); return getVectorVT(EltVT, EltCnt / 2); } @@ -742,7 +742,7 @@ namespace llvm { /// Given a vector type, return the minimum number of elements it contains. unsigned getVectorMinNumElements() const { - return getVectorElementCount().Min; + return getVectorElementCount().getKnownMinValue(); } /// Returns the size of the specified MVT in bits. @@ -1207,9 +1207,9 @@ namespace llvm { } static MVT getVectorVT(MVT VT, ElementCount EC) { - if (EC.Scalable) - return getScalableVectorVT(VT, EC.Min); - return getVectorVT(VT, EC.Min); + if (EC.isScalable()) + return getScalableVectorVT(VT, EC.getKnownMinValue()); + return getVectorVT(VT, EC.getKnownMinValue()); } /// Return the value type corresponding to the specified type. This returns diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 3f67e0cfc63ee8..fe564b634ec725 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -26,6 +26,11 @@ namespace llvm { template struct DenseMapInfo; class ElementCount { +private: + unsigned Min; // Minimum number of vector elements. + bool Scalable; // If true, NumElements is a multiple of 'Min' determined + // at runtime rather than compile time. 
+ public: /// Prevent code from using initializer-list contructors like /// ElementCount EC = {, }. The static `get*` @@ -35,10 +40,6 @@ class ElementCount { ElementCount(unsigned Min, bool Scalable) : Min(Min), Scalable(Scalable) {} public: - unsigned Min; // Minimum number of vector elements. - bool Scalable; // If true, NumElements is a multiple of 'Min' determined - // at runtime rather than compile time. - ElementCount() = default; ElementCount operator*(unsigned RHS) { @@ -58,6 +59,16 @@ class ElementCount { bool operator==(unsigned RHS) const { return Min == RHS && !Scalable; } bool operator!=(unsigned RHS) const { return !(*this == RHS); } + ElementCount &operator*=(unsigned RHS) { + Min *= RHS; + return *this; + } + + ElementCount &operator/=(unsigned RHS) { + Min /= RHS; + return *this; + } + ElementCount NextPowerOf2() const { return {(unsigned)llvm::NextPowerOf2(Min), Scalable}; } @@ -81,11 +92,21 @@ class ElementCount { /// ///@{ No elements.. bool isZero() const { return Min == 0; } + /// At least one element. + bool isNonZero() const { return Min != 0; } + /// A return value of true indicates we know at compile time that the number + /// of elements (vscale * Min) is definitely even. However, returning false + /// does not guarantee that the total number of elements is odd. + bool isKnownEven() const { return (Min & 0x1) == 0; } /// Exactly one element. bool isScalar() const { return !Scalable && Min == 1; } /// One or more elements. bool isVector() const { return (Scalable && Min != 0) || Min > 1; } ///@} + + unsigned getKnownMinValue() const { return Min; } + + bool isScalable() const { return Scalable; } }; /// Stream operator function for `ElementCount`. @@ -322,10 +343,11 @@ template <> struct DenseMapInfo { return ElementCount::getFixed(~0U - 1); } static unsigned getHashValue(const ElementCount& EltCnt) { - if (EltCnt.Scalable) - return (EltCnt.Min * 37U) - 1U; + unsigned HashVal = EltCnt.getKnownMinValue() * 37U; + if (EltCnt.isScalable()) + return (HashVal - 1U); - return EltCnt.Min * 37U; + return HashVal; } static bool isEqual(const ElementCount& LHS, const ElementCount& RHS) { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 3746bc66e426a4..eb41257bf5ad52 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4550,7 +4550,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, unsigned MaskNumElts = Mask.size(); ElementCount InVecEltCount = InVecTy->getElementCount(); - bool Scalable = InVecEltCount.Scalable; + bool Scalable = InVecEltCount.isScalable(); SmallVector Indices; Indices.assign(Mask.begin(), Mask.end()); @@ -4559,7 +4559,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, // replace that input vector with undef. 
if (!Scalable) { bool MaskSelects0 = false, MaskSelects1 = false; - unsigned InVecNumElts = InVecEltCount.Min; + unsigned InVecNumElts = InVecEltCount.getKnownMinValue(); for (unsigned i = 0; i != MaskNumElts; ++i) { if (Indices[i] == -1) continue; @@ -4588,7 +4588,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, // is not known at compile time for scalable vectors if (!Scalable && Op0Const && !Op1Const) { std::swap(Op0, Op1); - ShuffleVectorInst::commuteShuffleMask(Indices, InVecEltCount.Min); + ShuffleVectorInst::commuteShuffleMask(Indices, + InVecEltCount.getKnownMinValue()); } // A splat of an inserted scalar constant becomes a vector constant: diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp index 56155b28132eb6..4cdffa63135fb7 100644 --- a/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/llvm/lib/Analysis/VFABIDemangling.cpp @@ -442,7 +442,7 @@ Optional VFABI::tryDemangleForVFABI(StringRef MangledName, if (!F) return None; const ElementCount EC = getECFromSignature(F->getFunctionType()); - VF = EC.Min; + VF = EC.getKnownMinValue(); } // Sanity checks. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f8a5cecc16a816..b592412ed0b6f3 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4808,7 +4808,8 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) { auto *VTy = cast(Op->getOperand(0)->getType()); unsigned IdxOp = Op->getOpcode() == Instruction::InsertElement ? 2 : 1; auto *Idx = dyn_cast(Op->getOperand(IdxOp)); - if (!Idx || Idx->getZExtValue() >= VTy->getElementCount().Min) + if (!Idx || + Idx->getZExtValue() >= VTy->getElementCount().getKnownMinValue()) return true; return false; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 329cbe4020a3f4..f2630903dba1c0 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -970,7 +970,7 @@ void ModuleBitcodeWriter::writeTypeTable() { // VECTOR [numelts, eltty] or // [numelts, eltty, scalable] Code = bitc::TYPE_CODE_VECTOR; - TypeVals.push_back(VT->getElementCount().Min); + TypeVals.push_back(VT->getElementCount().getKnownMinValue()); TypeVals.push_back(VE.getTypeID(VT->getElementType())); if (isa(VT)) TypeVals.push_back(true); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 86b5d2055f5500..2034fd0730ebdb 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6957,10 +6957,10 @@ class VectorPromoteHelper { if (UseSplat) return ConstantVector::getSplat(EC, Val); - if (!EC.Scalable) { + if (!EC.isScalable()) { SmallVector ConstVec; UndefValue *UndefVal = UndefValue::get(Val->getType()); - for (unsigned Idx = 0; Idx != EC.Min; ++Idx) { + for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) { if (Idx == ExtractIdx) ConstVec.push_back(Val); else diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index cb53ea47e79fc9..5a4837079bed98 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -389,6 +389,19 @@ ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg, } } +void +ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, int PhysReg, + InstSet &Defs) const { + if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) { + Defs.insert(Def); + return; + } + + SmallPtrSet Visited; + for 
(auto *MBB : MI->getParent()->predecessors()) + getLiveOuts(MBB, PhysReg, Defs); +} + void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs) const { SmallPtrSet VisitedBBs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 59edd03b7ec82c..31ac5d92ffe63e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18994,7 +18994,7 @@ static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) { // check the other type in the cast to make sure this is really legal. EVT VT = N->getValueType(0); EVT SrcEltVT = SrcVT.getVectorElementType(); - unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands(); + ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands(); EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (CastOpcode) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 85dc150e146130..1a2c77974c2b93 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -428,10 +428,10 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // vector widening case (e.g. <2 x float> -> <4 x float>). Extract the // elements we want. if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) { - assert((PartEVT.getVectorElementCount().Min > - ValueVT.getVectorElementCount().Min) && - (PartEVT.getVectorElementCount().Scalable == - ValueVT.getVectorElementCount().Scalable) && + assert((PartEVT.getVectorElementCount().getKnownMinValue() > + ValueVT.getVectorElementCount().getKnownMinValue()) && + (PartEVT.getVectorElementCount().isScalable() == + ValueVT.getVectorElementCount().isScalable()) && "Cannot narrow, it would be a lossy transformation"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, DAG.getVectorIdxConstant(0, DL)); @@ -3751,7 +3751,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (IsVectorGEP && !N.getValueType().isVector()) { LLVMContext &Context = *DAG.getContext(); EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorElementCount); - if (VectorElementCount.Scalable) + if (VectorElementCount.isScalable()) N = DAG.getSplatVector(VT, dl, N); else N = DAG.getSplatBuildVector(VT, dl, N); @@ -3824,7 +3824,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (!IdxN.getValueType().isVector() && IsVectorGEP) { EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorElementCount); - if (VectorElementCount.Scalable) + if (VectorElementCount.isScalable()) IdxN = DAG.getSplatVector(VT, dl, IdxN); else IdxN = DAG.getSplatBuildVector(VT, dl, IdxN); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index c71bf7c74808e2..fe9feb5f116b6a 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -499,14 +499,14 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, // Return false if any operands requested for folding are not foldable (not // part of the stackmap's live values). 
for (unsigned Op : Ops) { - // Caller is expected to avoid passing in tied operands - assert(!MI.getOperand(Op).isTied()); if (Op < NumDefs) { assert(DefToFoldIdx == MI.getNumOperands() && "Folding multiple defs"); DefToFoldIdx = Op; } else if (Op < StartIdx) { return nullptr; } + if (MI.getOperand(Op).isTied()) + return nullptr; } MachineInstr *NewMI = diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 40bb45a584dbdb..958bb7939046b3 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -964,23 +964,24 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, // Scalable vectors cannot be scalarized, so splitting or widening is // required. - if (VT.isScalableVector() && !isPowerOf2_32(EC.Min)) + if (VT.isScalableVector() && !isPowerOf2_32(EC.getKnownMinValue())) llvm_unreachable( "Splitting or widening of non-power-of-2 MVTs is not implemented."); // FIXME: We don't support non-power-of-2-sized vectors for now. // Ideally we could break down into LHS/RHS like LegalizeDAG does. - if (!isPowerOf2_32(EC.Min)) { + if (!isPowerOf2_32(EC.getKnownMinValue())) { // Split EC to unit size (scalable property is preserved). - NumVectorRegs = EC.Min; - EC = EC / NumVectorRegs; + NumVectorRegs = EC.getKnownMinValue(); + EC = ElementCount::getFixed(1); } // Divide the input until we get to a supported size. This will // always end up with an EC that represent a scalar or a scalable // scalar. - while (EC.Min > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, EC))) { - EC.Min >>= 1; + while (EC.getKnownMinValue() > 1 && + !TLI->isTypeLegal(MVT::getVectorVT(EltTy, EC))) { + EC /= 2; NumVectorRegs <<= 1; } @@ -1315,13 +1316,15 @@ void TargetLoweringBase::computeRegisterProperties( } case TypeWidenVector: - if (isPowerOf2_32(EC.Min)) { + if (isPowerOf2_32(EC.getKnownMinValue())) { // Try to widen the vector. for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { MVT SVT = (MVT::SimpleValueType) nVT; if (SVT.getVectorElementType() == EltVT && SVT.isScalableVector() == IsScalable && - SVT.getVectorElementCount().Min > EC.Min && isTypeLegal(SVT)) { + SVT.getVectorElementCount().getKnownMinValue() > + EC.getKnownMinValue() && + isTypeLegal(SVT)) { TransformToType[i] = SVT; RegisterTypeForVT[i] = SVT; NumRegistersForVT[i] = 1; @@ -1365,10 +1368,10 @@ void TargetLoweringBase::computeRegisterProperties( ValueTypeActions.setTypeAction(VT, TypeScalarizeVector); else if (PreferredAction == TypeSplitVector) ValueTypeActions.setTypeAction(VT, TypeSplitVector); - else if (EC.Min > 1) + else if (EC.getKnownMinValue() > 1) ValueTypeActions.setTypeAction(VT, TypeSplitVector); else - ValueTypeActions.setTypeAction(VT, EC.Scalable + ValueTypeActions.setTypeAction(VT, EC.isScalable() ? TypeScalarizeScalableVector : TypeScalarizeVector); } else { @@ -1426,7 +1429,8 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT // This handles things like <2 x float> -> <4 x float> and // <4 x i1> -> <4 x i32>. 
LegalizeTypeAction TA = getTypeAction(Context, VT); - if (EltCnt.Min != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) { + if (EltCnt.getKnownMinValue() != 1 && + (TA == TypeWidenVector || TA == TypePromoteInteger)) { EVT RegisterEVT = getTypeToTransformTo(Context, VT); if (isTypeLegal(RegisterEVT)) { IntermediateVT = RegisterEVT; @@ -1443,7 +1447,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT // Scalable vectors cannot be scalarized, so handle the legalisation of the // types like done elsewhere in SelectionDAG. - if (VT.isScalableVector() && !isPowerOf2_32(EltCnt.Min)) { + if (VT.isScalableVector() && !isPowerOf2_32(EltCnt.getKnownMinValue())) { LegalizeKind LK; EVT PartVT = VT; do { @@ -1452,15 +1456,15 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT PartVT = LK.second; } while (LK.first != TypeLegal); - NumIntermediates = - VT.getVectorElementCount().Min / PartVT.getVectorElementCount().Min; + NumIntermediates = VT.getVectorElementCount().getKnownMinValue() / + PartVT.getVectorElementCount().getKnownMinValue(); // FIXME: This code needs to be extended to handle more complex vector // breakdowns, like nxv7i64 -> nxv8i64 -> 4 x nxv2i64. Currently the only // supported cases are vectors that are broken down into equal parts // such as nxv6i64 -> 3 x nxv2i64. - assert(NumIntermediates * PartVT.getVectorElementCount().Min == - VT.getVectorElementCount().Min && + assert((PartVT.getVectorElementCount() * NumIntermediates) == + VT.getVectorElementCount() && "Expected an integer multiple of PartVT"); IntermediateVT = PartVT; RegisterVT = getRegisterType(Context, IntermediateVT); @@ -1469,16 +1473,16 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally // we could break down into LHS/RHS like LegalizeDAG does. - if (!isPowerOf2_32(EltCnt.Min)) { - NumVectorRegs = EltCnt.Min; - EltCnt.Min = 1; + if (!isPowerOf2_32(EltCnt.getKnownMinValue())) { + NumVectorRegs = EltCnt.getKnownMinValue(); + EltCnt = ElementCount::getFixed(1); } // Divide the input until we get to a supported size. This will always // end with a scalar if the target doesn't support vectors. - while (EltCnt.Min > 1 && + while (EltCnt.getKnownMinValue() > 1 && !isTypeLegal(EVT::getVectorVT(Context, EltTy, EltCnt))) { - EltCnt.Min >>= 1; + EltCnt /= 2; NumVectorRegs <<= 1; } diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index c3974519bfc657..72eda95e72c861 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -122,13 +122,13 @@ EVT EVT::getExtendedVectorElementType() const { unsigned EVT::getExtendedVectorNumElements() const { assert(isExtended() && "Type is not extended!"); ElementCount EC = cast(LLVMTy)->getElementCount(); - if (EC.Scalable) { + if (EC.isScalable()) { WithColor::warning() << "The code that requested the fixed number of elements has made the " "assumption that this vector is not scalable. This assumption was " "not correct, and this may lead to broken code\n"; } - return EC.Min; + return EC.getKnownMinValue(); } ElementCount EVT::getExtendedVectorElementCount() const { @@ -150,9 +150,9 @@ std::string EVT::getEVTString() const { switch (V.SimpleTy) { default: if (isVector()) - return (isScalableVector() ? "nxv" : "v") - + utostr(getVectorElementCount().Min) - + getVectorElementType().getEVTString(); + return (isScalableVector() ? 
"nxv" : "v") + + utostr(getVectorElementCount().getKnownMinValue()) + + getVectorElementType().getEVTString(); if (isInteger()) return "i" + utostr(getSizeInBits()); if (isFloatingPoint()) diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 0b2ac8582a62b1..8cb1883da68e42 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -656,9 +656,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { VectorType *PTy = cast(Ty); ElementCount EC = PTy->getElementCount(); OS << "<"; - if (EC.Scalable) + if (EC.isScalable()) OS << "vscale x "; - OS << EC.Min << " x "; + OS << EC.getKnownMinValue() << " x "; print(PTy->getElementType(), OS); OS << '>'; return; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index e5c4250665e41b..468dce95a29ad0 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -931,7 +931,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, // If the mask is all zeros this is a splat, no need to go through all // elements. if (all_of(Mask, [](int Elt) { return Elt == 0; }) && - !MaskEltCount.Scalable) { + !MaskEltCount.isScalable()) { Type *Ty = IntegerType::get(V1->getContext(), 32); Constant *Elt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, 0)); @@ -942,7 +942,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, if (isa(V1VTy)) return nullptr; - unsigned SrcNumElts = V1VTy->getElementCount().Min; + unsigned SrcNumElts = V1VTy->getElementCount().getKnownMinValue(); // Loop over the shuffle mask, evaluating each element. SmallVector Result; @@ -2056,11 +2056,12 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, SmallVector ResElts; Type *Ty = IntegerType::get(C1->getContext(), 32); // Compare the elements, producing an i1 result or constant expr. - for (unsigned i = 0, e = C1VTy->getElementCount().Min; i != e; ++i) { + for (unsigned I = 0, E = C1VTy->getElementCount().getKnownMinValue(); + I != E; ++I) { Constant *C1E = - ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i)); + ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, I)); Constant *C2E = - ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i)); + ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, I)); ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E)); } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 8d960ea9a5faa6..d84c7bc2da9dbe 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1300,14 +1300,14 @@ Constant *ConstantVector::getImpl(ArrayRef V) { } Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { - if (!EC.Scalable) { + if (!EC.isScalable()) { // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. if ((isa(V) || isa(V)) && ConstantDataSequential::isElementTypeCompatible(V->getType())) - return ConstantDataVector::getSplat(EC.Min, V); + return ConstantDataVector::getSplat(EC.getKnownMinValue(), V); - SmallVector Elts(EC.Min, V); + SmallVector Elts(EC.getKnownMinValue(), V); return get(Elts); } @@ -1324,7 +1324,7 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { Constant *UndefV = UndefValue::get(VTy); V = ConstantExpr::getInsertElement(UndefV, V, ConstantInt::get(I32Ty, 0)); // Build shuffle mask to perform the splat. - SmallVector Zeros(EC.Min, 0); + SmallVector Zeros(EC.getKnownMinValue(), 0); // Splat. 
return ConstantExpr::getShuffleVector(V, UndefV, Zeros); } @@ -2264,7 +2264,7 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C, if (VectorType *VecTy = dyn_cast(Idx->getType())) EltCount = VecTy->getElementCount(); - if (EltCount.Min != 0) + if (EltCount.isNonZero()) ReqTy = VectorType::get(ReqTy, EltCount); if (OnlyIfReducedTy == ReqTy) @@ -2284,7 +2284,7 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C, if (GTI.isStruct() && Idx->getType()->isVectorTy()) { Idx = Idx->getSplatValue(); - } else if (GTI.isSequential() && EltCount.Min != 0 && + } else if (GTI.isSequential() && EltCount.isNonZero() && !Idx->getType()->isVectorTy()) { Idx = ConstantVector::getSplat(EltCount, Idx); } diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 71faa5002b9ffd..8598acc82804f6 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -781,7 +781,7 @@ unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) { } unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) { - return unwrap(VectorTy)->getElementCount().Min; + return unwrap(VectorTy)->getElementCount().getKnownMinValue(); } /*--.. Operations on other types ...........................................--*/ diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 31b227d4a68207..ffb3adcdbf8a07 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -630,7 +630,7 @@ Align DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, uint32_t BitWidth, // We're only calculating a natural alignment, so it doesn't have to be // based on the full size for scalable vectors. Using the minimum element // count should be enough here. - Alignment *= cast(Ty)->getElementCount().Min; + Alignment *= cast(Ty)->getElementCount().getKnownMinValue(); Alignment = PowerOf2Ceil(Alignment); return Align(Alignment); } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index b29a00c5fe460e..e701feae22562f 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -714,9 +714,10 @@ static std::string getMangledTypeStr(Type* Ty) { Result += "f"; } else if (VectorType* VTy = dyn_cast(Ty)) { ElementCount EC = VTy->getElementCount(); - if (EC.Scalable) + if (EC.isScalable()) Result += "nx"; - Result += "v" + utostr(EC.Min) + getMangledTypeStr(VTy->getElementType()); + Result += "v" + utostr(EC.getKnownMinValue()) + + getMangledTypeStr(VTy->getElementType()); } else if (Ty) { switch (Ty->getTypeID()) { default: llvm_unreachable("Unhandled type"); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 33a0f5b09d0b9d..d6eeffd44b3680 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1003,7 +1003,7 @@ Value *IRBuilderBase::CreateVectorSplat(unsigned NumElts, Value *V, Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V, const Twine &Name) { - assert(EC.Min > 0 && "Cannot splat to an empty vector!"); + assert(EC.isNonZero() && "Cannot splat to an empty vector!"); // First insert it into an undef vector so we can shuffle it. Type *I32Ty = getInt32Ty(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 48f416173dde1d..445fad8bcbf41a 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1967,7 +1967,8 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2, return false; // Make sure the mask elements make sense. 
- int V1Size = cast(V1->getType())->getElementCount().Min; + int V1Size = + cast(V1->getType())->getElementCount().getKnownMinValue(); for (int Elem : Mask) if (Elem != UndefMaskElem && Elem >= V1Size * 2) return false; @@ -2026,22 +2027,22 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask, ElementCount EC = cast(Mask->getType())->getElementCount(); if (isa(Mask)) { - Result.resize(EC.Min, 0); + Result.resize(EC.getKnownMinValue(), 0); return; } - Result.reserve(EC.Min); + Result.reserve(EC.getKnownMinValue()); - if (EC.Scalable) { + if (EC.isScalable()) { assert((isa(Mask) || isa(Mask)) && "Scalable vector shuffle mask must be undef or zeroinitializer"); int MaskVal = isa(Mask) ? -1 : 0; - for (unsigned I = 0; I < EC.Min; ++I) + for (unsigned I = 0; I < EC.getKnownMinValue(); ++I) Result.emplace_back(MaskVal); return; } - unsigned NumElts = EC.Min; + unsigned NumElts = EC.getKnownMinValue(); if (auto *CDS = dyn_cast(Mask)) { for (unsigned i = 0; i != NumElts; ++i) diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index c4e06cd979ed69..b6b036d0bbca2a 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -280,8 +280,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { // the operation. This function returns true when this is detected statically // in the IR. - // Check whether "W == vscale * EC.Min" - if (EC.Scalable) { + // Check whether "W == vscale * EC.getKnownMinValue()" + if (EC.isScalable()) { // Undig the DL auto ParMod = this->getModule(); if (!ParMod) @@ -291,8 +291,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { // Compare vscale patterns uint64_t VScaleFactor; if (match(VLParam, m_c_Mul(m_ConstantInt(VScaleFactor), m_VScale(DL)))) - return VScaleFactor >= EC.Min; - return (EC.Min == 1) && match(VLParam, m_VScale(DL)); + return VScaleFactor >= EC.getKnownMinValue(); + return (EC.getKnownMinValue() == 1) && match(VLParam, m_VScale(DL)); } // standard SIMD operation @@ -301,7 +301,7 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { return false; uint64_t VLNum = VLConst->getZExtValue(); - if (VLNum >= EC.Min) + if (VLNum >= EC.getKnownMinValue()) return true; return false; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 1cf0d1345ab31a..ce374faa5ced2f 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -128,7 +128,7 @@ TypeSize Type::getPrimitiveSizeInBits() const { ElementCount EC = VTy->getElementCount(); TypeSize ETS = VTy->getElementType()->getPrimitiveSizeInBits(); assert(!ETS.isScalable() && "Vector type should have fixed-width elements"); - return {ETS.getFixedSize() * EC.Min, EC.Scalable}; + return {ETS.getFixedSize() * EC.getKnownMinValue(), EC.isScalable()}; } default: return TypeSize::Fixed(0); } @@ -598,10 +598,10 @@ VectorType::VectorType(Type *ElType, unsigned EQ, Type::TypeID TID) } VectorType *VectorType::get(Type *ElementType, ElementCount EC) { - if (EC.Scalable) - return ScalableVectorType::get(ElementType, EC.Min); + if (EC.isScalable()) + return ScalableVectorType::get(ElementType, EC.getKnownMinValue()); else - return FixedVectorType::get(ElementType, EC.Min); + return FixedVectorType::get(ElementType, EC.getKnownMinValue()); } bool VectorType::isValidElementType(Type *ElemTy) { diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp index bbcf56cc0cec0e..912213517334c1 100644 --- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp +++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp @@ -95,8 +95,8 @@ Error 
DWARFYAML::emitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) { } Error DWARFYAML::emitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) { - uint64_t AbbrevCode = 0; for (const DWARFYAML::AbbrevTable &AbbrevTable : DI.DebugAbbrev) { + uint64_t AbbrevCode = 0; for (const DWARFYAML::Abbrev &AbbrevDecl : AbbrevTable.Table) { AbbrevCode = AbbrevDecl.Code ? (uint64_t)*AbbrevDecl.Code : AbbrevCode + 1; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 184458607c3cc4..fda514b2006c27 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4827,7 +4827,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, return EVT(); ElementCount EC = PredVT.getVectorElementCount(); - EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); + EVT ScalarVT = + EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue()); EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); return MemVT; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c904fc7f8c93cc..e68183dc46a2ca 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3532,8 +3532,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, // 256 bit non-temporal stores can be lowered to STNP. Do this as part of // the custom lowering, as there are no un-paired non-temporal stores and // legalization will break up 256 bit inputs. + ElementCount EC = MemVT.getVectorElementCount(); if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && - MemVT.getVectorElementCount().Min % 2u == 0 && + EC.isKnownEven() && ((MemVT.getScalarSizeInBits() == 8u || MemVT.getScalarSizeInBits() == 16u || MemVT.getScalarSizeInBits() == 32u || @@ -3542,11 +3543,11 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); - SDValue Hi = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, Dl, - MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - StoreNode->getValue(), - DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64)); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), + DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, @@ -10370,7 +10371,7 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; - assert(VT.getVectorElementCount().Min % N == 0 && + assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 && "invalid tuple vector type!"); EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), @@ -14443,7 +14444,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, uint64_t IdxConst = cast(Idx)->getZExtValue(); EVT ResVT = N->getValueType(0); - uint64_t NumLanes = ResVT.getVectorElementCount().Min; + uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue(); SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL); SDValue Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, 
ResVT, Src1, ExtIdx); @@ -14457,10 +14458,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, SDValue Vec = N->getOperand(4); EVT TupleVT = Tuple.getValueType(); - uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; + uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue(); uint64_t IdxConst = cast(Idx)->getZExtValue(); - uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; + uint64_t NumLanes = + Vec.getValueType().getVectorElementCount().getKnownMinValue(); if ((TupleLanes % NumLanes) != 0) report_fatal_error("invalid tuple vector!"); @@ -14696,7 +14698,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults( ElementCount ResEC = VT.getVectorElementCount(); - if (InVT.getVectorElementCount().Min != (ResEC.Min * 2)) + if (InVT.getVectorElementCount() != (ResEC * 2)) return; auto *CIndex = dyn_cast(N->getOperand(1)); @@ -14704,7 +14706,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults( return; unsigned Index = CIndex->getZExtValue(); - if ((Index != 0) && (Index != ResEC.Min)) + if ((Index != 0) && (Index != ResEC.getKnownMinValue())) return; unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 4d1ab88fe3b2c8..a98590fd79c685 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -201,17 +201,6 @@ namespace { PredicatedMI *getDivergent() const { return Divergent; } }; - struct Reduction { - MachineInstr *Init; - MachineInstr &Copy; - MachineInstr &Reduce; - MachineInstr &VPSEL; - - Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add, - MachineInstr *Sel) - : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { } - }; - struct LowOverheadLoop { MachineLoop &ML; @@ -232,7 +221,6 @@ namespace { SetVector CurrentPredicate; SmallVector VPTBlocks; SmallPtrSet ToRemove; - SmallVector, 1> Reductions; SmallPtrSet BlockMasksToRecompute; bool Revert = false; bool CannotTailPredicate = false; @@ -270,10 +258,6 @@ namespace { // of elements to the loop start instruction. bool ValidateTailPredicate(MachineInstr *StartInsertPt); - // See whether the live-out instructions are a reduction that we can fixup - // later. - bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers); - // Check that any values available outside of the loop will be the same // after tail predication conversion. bool ValidateLiveOuts(); @@ -365,8 +349,6 @@ namespace { void ConvertVPTBlocks(LowOverheadLoop &LoLoop); - void FixupReductions(LowOverheadLoop &LoLoop) const; - MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); void Expand(LowOverheadLoop &LoLoop); @@ -447,8 +429,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { } } - if (!ValidateLiveOuts()) + if (!ValidateLiveOuts()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n"); return false; + } // For tail predication, we need to provide the number of elements, instead // of the iteration count, to the loop start instruction. The number of @@ -636,7 +620,6 @@ static bool canGenerateNonZeros(const MachineInstr &MI) { return false; } - // Look at its register uses to see if it only can only receive zeros // into its false lanes which would then produce zeros. 
Also check that // the output register is also defined by an FalseLanesZero instruction @@ -649,120 +632,40 @@ static bool producesFalseLanesZero(MachineInstr &MI, if (canGenerateNonZeros(MI)) return false; + bool isPredicated = isVectorPredicated(&MI); + // Predicated loads will write zeros to the falsely predicated bytes of the + // destination register. + if (MI.mayLoad()) + return isPredicated; + + auto IsZeroInit = [](MachineInstr *Def) { + return !isVectorPredicated(Def) && + Def->getOpcode() == ARM::MVE_VMOVimmi32 && + Def->getOperand(1).getImm() == 0; + }; + bool AllowScalars = isHorizontalReduction(MI); for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg()) continue; if (!isRegInClass(MO, QPRs) && AllowScalars) continue; - if (auto *OpDef = RDA.getMIOperand(&MI, MO)) - if (FalseLanesZero.count(OpDef)) - continue; - return false; - } - LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); - return true; -} - -bool -LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) { - // Also check for reductions where the operation needs to be merging values - // from the last and previous loop iterations. This means an instruction - // producing a value and a vmov storing the value calculated in the previous - // iteration. So we can have two live-out regs, one produced by a vmov and - // both being consumed by a vpsel. - LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n"; - for (auto *MI : LiveMIs) - dbgs() << " - " << *MI); - - if (!Preheader) - return false; - // Expect a vmov, a vadd and a single vpsel user. - // TODO: This means we can't currently support multiple reductions in the - // loop. - if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1) - return false; - - MachineInstr *VPSEL = *LiveOutUsers.begin(); - if (VPSEL->getOpcode() != ARM::MVE_VPSEL) - return false; - - unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1; - MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx); - if (!Pred || Pred != VCTP) { - LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n"); - return false; - } - - MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1); - if (!Reduce) - return false; - - assert(LiveMIs.count(Reduce) && "Expected MI to be live-out"); - - // TODO: Support more operations than VADD. - switch (VCTP->getOpcode()) { - default: - return false; - case ARM::MVE_VCTP8: - if (Reduce->getOpcode() != ARM::MVE_VADDi8) - return false; - break; - case ARM::MVE_VCTP16: - if (Reduce->getOpcode() != ARM::MVE_VADDi16) - return false; - break; - case ARM::MVE_VCTP32: - if (Reduce->getOpcode() != ARM::MVE_VADDi32) + // Check that this instruction will produce zeros in its false lanes: + // - If it only consumes false lanes zero or constant 0 (vmov #0) + // - If it's predicated, it only matters that it's def register already has + // false lane zeros, so we can ignore the uses. + SmallPtrSet Defs; + RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs); + for (auto *Def : Defs) { + if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def)) + continue; + if (MO.isUse() && isPredicated) + continue; return false; - break; - } - - // Test that the reduce op is overwriting ones of its operands. - if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() && - Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) { - LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n"); - return false; - } - - // Check that the VORR is actually a VMOV. 
- MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2); - if (!Copy || Copy->getOpcode() != ARM::MVE_VORR || - !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() || - Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg()) - return false; - - assert(LiveMIs.count(Copy) && "Expected MI to be live-out"); - - // Check that the vadd and vmov are only used by each other and the vpsel. - SmallPtrSet CopyUsers; - RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers); - if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n"); - return false; - } - - SmallPtrSet ReduceUsers; - RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers); - if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n"); - return false; + } } - - // Then find whether there's an instruction initialising the register that - // is storing the reduction. - SmallPtrSet Incoming; - RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming); - if (Incoming.size() > 1) - return false; - - MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin(); - LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n" - << " - " << *Copy - << " - " << *Reduce - << " - " << *VPSEL); - Reductions.push_back(std::make_unique(Init, Copy, Reduce, VPSEL)); + LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); return true; } @@ -803,28 +706,20 @@ bool LowOverheadLoop::ValidateLiveOuts() { if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode())) continue; - // Predicated loads will write zeros to the falsely predicated bytes of the - // destination register. - if (isVectorPredicated(&MI)) { - if (MI.mayLoad()) - FalseLanesZero.insert(&MI); - Predicated.insert(&MI); - continue; - } + bool isPredicated = isVectorPredicated(&MI); + bool retainsOrReduces = + retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); - if (MI.getNumDefs() == 0) + if (isPredicated) + Predicated.insert(&MI); + if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) + FalseLanesZero.insert(&MI); + else if (MI.getNumDefs() == 0) continue; - - if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) { - // We require retaining and horizontal operations to operate upon zero'd - // false lanes to ensure the conversion doesn't change the output. - if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI)) - return false; - // Otherwise we need to evaluate this instruction later to see whether - // unknown false lanes will get masked away by their user(s). + else if (!isPredicated && retainsOrReduces) + return false; + else FalseLanesUnknown.insert(&MI); - } else if (!isHorizontalReduction(MI)) - FalseLanesZero.insert(&MI); } auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, @@ -853,48 +748,44 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); NonPredicated.insert(MI); - continue; + break; } } // Any unknown false lanes have been masked away by the user(s). 
- Predicated.insert(MI); + if (!NonPredicated.contains(MI)) + Predicated.insert(MI); } SmallPtrSet LiveOutMIs; - SmallPtrSet LiveOutUsers; SmallVector ExitBlocks; ML.getExitBlocks(ExitBlocks); assert(ML.getNumBlocks() == 1 && "Expected single block loop!"); assert(ExitBlocks.size() == 1 && "Expected a single exit block"); MachineBasicBlock *ExitBB = ExitBlocks.front(); for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { + // TODO: Instead of blocking predication, we could move the vctp to the exit + // block and calculate it's operand there in or the preheader. + if (RegMask.PhysReg == ARM::VPR) + return false; // Check Q-regs that are live in the exit blocks. We don't collect scalars // because they won't be affected by lane predication. - if (QPRs->contains(RegMask.PhysReg)) { + if (QPRs->contains(RegMask.PhysReg)) if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg)) LiveOutMIs.insert(MI); - RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers); - } } - // If we have any non-predicated live-outs, they need to be part of a - // reduction that we can fixup later. The reduction that the form of an - // operation that uses its previous values through a vmov and then a vpsel - // resides in the exit blocks to select the final bytes from n and n-1 - // iterations. - if (!NonPredicated.empty() && - !FindValidReduction(NonPredicated, LiveOutUsers)) - return false; - // We've already validated that any VPT predication within the loop will be // equivalent when we perform the predication transformation; so we know that // any VPT predicated instruction is predicated upon VCTP. Any live-out // instruction needs to be predicated, so check this here. The instructions // in NonPredicated have been found to be a reduction that we can ensure its // legality. 
- for (auto *MI : LiveOutMIs) - if (!isVectorPredicated(MI) && !NonPredicated.count(MI)) + for (auto *MI : LiveOutMIs) { + if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI); return false; + } + } return true; } @@ -1360,61 +1251,6 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { return &*MIB; } -void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const { - LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n"); - auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) { - MachineBasicBlock *MBB = InsertPt.getParent(); - MachineInstrBuilder MIB = - BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR)); - MIB.addDef(To); - MIB.addReg(From); - MIB.addReg(From); - MIB.addImm(0); - MIB.addReg(0); - MIB.addReg(To); - LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB); - }; - - for (auto &Reduction : LoLoop.Reductions) { - MachineInstr &Copy = Reduction->Copy; - MachineInstr &Reduce = Reduction->Reduce; - Register DestReg = Copy.getOperand(0).getReg(); - - // Change the initialiser if present - if (Reduction->Init) { - MachineInstr *Init = Reduction->Init; - - for (unsigned i = 0; i < Init->getNumOperands(); ++i) { - MachineOperand &MO = Init->getOperand(i); - if (MO.isReg() && MO.isUse() && MO.isTied() && - Init->findTiedOperandIdx(i) == 0) - Init->getOperand(i).setReg(DestReg); - } - Init->getOperand(0).setReg(DestReg); - LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init); - } else - BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg()); - - // Change the reducing op to write to the register that is used to copy - // its value on the next iteration. Also update the tied-def operand. - Reduce.getOperand(0).setReg(DestReg); - Reduce.getOperand(5).setReg(DestReg); - LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce); - - // Instead of a vpsel, just copy the register into the necessary one. - MachineInstr &VPSEL = Reduction->VPSEL; - if (VPSEL.getOperand(0).getReg() != DestReg) - BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg); - - // Remove the unnecessary instructions. 
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n" - << " - " << Copy - << " - " << VPSEL << "\n"); - Copy.eraseFromParent(); - VPSEL.eraseFromParent(); - } -} - void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); @@ -1568,10 +1404,8 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) { + if (LoLoop.IsTailPredicationLegal()) ConvertVPTBlocks(LoLoop); - FixupReductions(LoLoop); - } for (auto *I : LoLoop.ToRemove) { LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); I->eraseFromParent(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index a4b38e8082224b..0a289fd4e70ffb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -340,17 +340,17 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { auto *IndexC = dyn_cast(Index); if (IndexC) { ElementCount EC = EI.getVectorOperandType()->getElementCount(); - unsigned NumElts = EC.Min; + unsigned NumElts = EC.getKnownMinValue(); // InstSimplify should handle cases where the index is invalid. // For fixed-length vector, it's invalid to extract out-of-range element. - if (!EC.Scalable && IndexC->getValue().uge(NumElts)) + if (!EC.isScalable() && IndexC->getValue().uge(NumElts)) return nullptr; // This instruction only demands the single element from the input vector. // Skip for scalable type, the number of elements is unknown at // compile-time. - if (!EC.Scalable && NumElts != 1) { + if (!EC.isScalable() && NumElts != 1) { // If the input vector has a single use, simplify it based on this use // property. if (SrcVec->hasOneUse()) { diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 423c4baf262ad6..57befc9c3cfb36 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -1976,18 +1976,18 @@ bool llvm::runIPSCCP( // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove // them from both the function and callsites. 
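Most of the mechanical churn in this patch is the migration visible in the InstCombine hunk above: ElementCount's public Min/Scalable members give way to accessors. A minimal sketch of the new spelling (the header location is assumed for the LLVM version this patch targets and may differ in other releases):

  #include "llvm/Support/TypeSize.h"

  unsigned knownMinElts(llvm::ElementCount EC) {
    // Previously: EC.Min
    return EC.getKnownMinValue(); // exact for fixed vectors, a per-vscale
                                  // multiple for scalable ones
  }

  bool isFixedWidth(llvm::ElementCount EC) {
    // Previously: !EC.Scalable
    return !EC.isScalable();
  }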
if (ReplacedPointerArg) { - SmallVector AttributesToRemove = { - Attribute::ArgMemOnly, Attribute::InaccessibleMemOrArgMemOnly}; - for (auto Attr : AttributesToRemove) - F.removeFnAttr(Attr); + AttrBuilder AttributesToRemove; + AttributesToRemove.addAttribute(Attribute::ArgMemOnly); + AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); + F.removeAttributes(AttributeList::FunctionIndex, AttributesToRemove); for (User *U : F.users()) { auto *CB = dyn_cast(U); if (!CB || CB->getCalledFunction() != &F) continue; - for (auto Attr : AttributesToRemove) - CB->removeAttribute(AttributeList::FunctionIndex, Attr); + CB->removeAttributes(AttributeList::FunctionIndex, + AttributesToRemove); } } } diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index 101cb232d8aed1..dfab9369c7b72e 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -488,12 +488,13 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { case Type::ScalableVectorTyID: { auto *STyL = cast(TyL); auto *STyR = cast(TyR); - if (STyL->getElementCount().Scalable != STyR->getElementCount().Scalable) - return cmpNumbers(STyL->getElementCount().Scalable, - STyR->getElementCount().Scalable); - if (STyL->getElementCount().Min != STyR->getElementCount().Min) - return cmpNumbers(STyL->getElementCount().Min, - STyR->getElementCount().Min); + if (STyL->getElementCount().isScalable() != + STyR->getElementCount().isScalable()) + return cmpNumbers(STyL->getElementCount().isScalable(), + STyR->getElementCount().isScalable()); + if (STyL->getElementCount() != STyR->getElementCount()) + return cmpNumbers(STyL->getElementCount().getKnownMinValue(), + STyR->getElementCount().getKnownMinValue()); return cmpTypes(STyL->getElementType(), STyR->getElementType()); } } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f999c5af7f4755..1f82995588b094 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -342,7 +342,7 @@ static Type *getMemInstValueType(Value *I) { /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a vector. if (VF.isVector()) { @@ -899,8 +899,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa(Inst)) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1216,7 +1217,7 @@ class LoopVectorizationCostModel { /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. 
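The SCCP hunk above switches from removing attributes one at a time to collecting them in an AttrBuilder and stripping the whole set in a single call, on the function and on each direct call site. A condensed sketch of that pattern, using only the calls that appear in the hunk:

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  static void dropMemOnlyAttrs(Function &F) {
    AttrBuilder ToRemove;
    ToRemove.addAttribute(Attribute::ArgMemOnly);
    ToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
    // One call removes the whole set from the function; the same AttrBuilder
    // is then reused for each call site in the loop over F.users().
    F.removeAttributes(AttributeList::FunctionIndex, ToRemove);
  }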
InstWidening getWideningDecision(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); assert(VF.isVector() && "Expected VF >=2"); // Cost model is not run in the VPlan-native path - return conservative @@ -1837,7 +1838,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); + Value *ConstVF = + getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1845,7 +1847,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *SplatVF = isa(Mul) ? ConstantVector::getSplat(VF, cast(Mul)) : Builder.CreateVectorSplat(VF, Mul); @@ -1982,9 +1984,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, - ID.getInductionOpcode()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *EntryPart = + getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2093,7 +2096,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF.isVector() && "VF should be greater than one"); - assert(!VF.Scalable && + assert(!VF.isScalable() && "the code below assumes a fixed number of elements at compile time"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); @@ -2118,12 +2121,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, unsigned Lanes = Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 - : VF.Min; + : VF.getKnownMinValue(); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = - getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); + auto *StartIdx = getSignedIntOrFpConstant( + ScalarIVTy, VF.getKnownMinValue() * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2166,9 +2169,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. 
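buildScalarSteps above generates, for a fixed-width VF, one scalar induction value per unrolled part and lane: lane L of part P gets IV + (VF*P + L) * Step, and only lane 0 is produced when the value is uniform after vectorization. A plain sketch of just that index arithmetic, with no IRBuilder involved (the helper name is illustrative):

  #include <cstdint>
  #include <vector>

  std::vector<int64_t> scalarSteps(int64_t ScalarIV, int64_t Step, unsigned VF,
                                   unsigned UF, unsigned Lanes) {
    std::vector<int64_t> Out;
    for (unsigned Part = 0; Part < UF; ++Part)
      for (unsigned Lane = 0; Lane < Lanes; ++Lane)   // Lanes == 1 if uniform
        Out.push_back(ScalarIV + int64_t(VF * Part + Lane) * Step);
    return Out;
  }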
Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned LastLane = - Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; + assert(!VF.isScalable() && "scalable vectors not yet supported."); + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) + ? 0 + : VF.getKnownMinValue() - 1; auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2190,10 +2194,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF.Min; ++Lane) + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2257,10 +2261,10 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.Scalable && "Cannot reverse scalable vectors"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors"); SmallVector ShuffleMask; - for (unsigned i = 0; i < VF.Min; ++i) - ShuffleMask.push_back(VF.Min - i - 1); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ShuffleMask, "reverse"); @@ -2314,7 +2318,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. @@ -2331,10 +2335,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
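reverseVector above builds a shuffle mask that flips a fixed-width vector: result element i reads input element VF-1-i. The mask computed standalone, as a small illustration:

  #include <vector>

  std::vector<int> reverseMask(unsigned VF) {
    std::vector<int> Mask;
    for (unsigned i = 0; i < VF; ++i)
      Mask.push_back(int(VF - i - 1));
    return Mask;            // e.g. VF=4 -> {3, 2, 1, 0}
  }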
- assert(!VF.Scalable && + assert(!VF.isScalable() && "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF.Min - 1) * Group->getFactor(); + Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2369,8 +2373,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2387,10 +2391,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF.Min), + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, @@ -2417,15 +2421,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto StrideMask = + createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2440,7 +2445,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. @@ -2469,9 +2474,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. 
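The interleave-group lowering above leans on three fixed-width masks: a stride mask to pull member M out of a wide VF*Factor load, an interleave mask to weave member vectors back together for the store, and a replicated mask to widen the block mask to the group. Plain computations sketching the first two; the helper names in the hunk are the real LLVM utilities, these are only illustrative re-implementations:

  #include <vector>

  // Lanes of member `Start` inside a wide VF*Factor vector: Start, Start+Factor, ...
  std::vector<int> strideMask(unsigned Start, unsigned Factor, unsigned VF) {
    std::vector<int> Mask;
    for (unsigned i = 0; i < VF; ++i)
      Mask.push_back(int(Start + i * Factor));
    return Mask;
  }

  // Interleaving Factor member vectors of VF elements into one wide vector.
  std::vector<int> interleaveMask(unsigned VF, unsigned Factor) {
    std::vector<int> Mask;
    for (unsigned i = 0; i < VF; ++i)
      for (unsigned j = 0; j < Factor; ++j)
        Mask.push_back(int(j * VF + i));
    return Mask;
  }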
- assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), + WideVec, UndefVec, + createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; @@ -2480,7 +2486,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } @@ -2514,7 +2521,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); @@ -2550,16 +2557,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); + ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); } @@ -2756,8 +2763,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. - assert(!VF.Scalable && "scalable vectorization is not supported yet"); - Constant *Step = ConstantInt::get(Ty, VF.Min * UF); + assert(!VF.isScalable() && "scalable vectorization is not supported yet"); + Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2766,10 +2773,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. 
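For a consecutive-but-reversed access, vectorizeMemoryInstruction above offsets the pointer by -Part*VF and then by 1-VF so the wide access for each part covers the right elements in reverse. The same arithmetic on an ordinary pointer; float is chosen only for illustration:

  #include <cstddef>

  const float *reversePartPtr(const float *Ptr, unsigned Part, unsigned VF) {
    const float *PartPtr = Ptr - std::ptrdiff_t(Part) * VF; // GEP ... -Part*VF
    return PartPtr + (1 - std::ptrdiff_t(VF));              // GEP ... 1-VF
  }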
if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.Min * UF) && + assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), - "n.rnd.up"); + TC = Builder.CreateAdd( + TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2846,9 +2853,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); if (!Cost->foldTailByMasking()) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), + P, Count, + ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), "min.iters.check"); } // Create new preheader for vector loop. @@ -3303,8 +3311,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). - assert(!VF.Scalable && "scalable vectors not yet supported."); - Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3438,7 +3446,7 @@ static void cse(BasicBlock *BB) { unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, bool &NeedToScalarize) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3463,7 +3471,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // packing the return values to a vector. unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; + unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. @@ -3684,11 +3692,11 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. 
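When the tail is folded by masking, the trip-count logic above rounds the count up to a multiple of Step = VF * UF (asserted to be a power of two) by adding Step-1 and then stripping the remainder. The same computation in plain integers, as a sketch:

  #include <cstdint>

  uint64_t roundedTripCount(uint64_t TC, unsigned VF, unsigned UF) {
    uint64_t Step = uint64_t(VF) * UF;   // must be a power of two here
    uint64_t Rounded = TC + (Step - 1);  // "n.rnd.up"
    return Rounded - Rounded % Step;     // multiple of Step the vector loop covers
  }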
- assert(!VF.Scalable && + assert(!VF.isScalable() && "cannot use scalable ElementCount to determine unroll factor"); - setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), - LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.Min * UF); + setProfileInfoAfterUnrolling( + LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3769,10 +3777,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { auto *VectorInit = ScalarInit; if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF.Min - 1), "vector.recur.init"); + Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3813,11 +3821,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - assert(!VF.Scalable); - SmallVector ShuffleMask(VF.Min); - ShuffleMask[0] = VF.Min - 1; - for (unsigned I = 1; I < VF.Min; ++I) - ShuffleMask[I] = I + VF.Min - 1; + assert(!VF.isScalable()); + SmallVector ShuffleMask(VF.getKnownMinValue()); + ShuffleMask[0] = VF.getKnownMinValue() - 1; + for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) + ShuffleMask[I] = I + VF.getKnownMinValue() - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3846,7 +3854,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3856,7 +3865,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { Value *ExtractForPhiUsedOutsideLoop = nullptr; if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), + "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -4013,7 +4023,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // entire expression in the smaller type. 
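fixFirstOrderRecurrence above splices the previous iteration's last element in front of the current vector using the mask {VF-1, VF, ..., 2*VF-2} over the concatenated pair. Built standalone for a fixed-width VF:

  #include <vector>

  std::vector<int> recurrenceMask(unsigned VF) {
    std::vector<int> Mask(VF);
    Mask[0] = int(VF - 1);
    for (unsigned I = 1; I < VF; ++I)
      Mask[I] = int(I + VF - 1);
    return Mask;            // e.g. VF=4 -> {3, 4, 5, 6}
  }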
if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); @@ -4145,7 +4155,7 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } void InnerLoopVectorizer::fixLCSSAPHIs() { - assert(!VF.Scalable && "the code below assumes fixed width vectors"); + assert(!VF.isScalable() && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); @@ -4155,7 +4165,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { LastLane = Cost->isUniformAfterVectorization( cast(IncomingValue), VF) ? 0 - : VF.Min - 1; + : VF.getKnownMinValue() - 1; // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4338,7 +4348,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); PHINode *P = cast(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4403,11 +4413,12 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min; + unsigned Lanes = + Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = - ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); + Constant *Idx = ConstantInt::get(PtrInd->getType(), + Lane + Part * VF.getKnownMinValue()); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); @@ -4437,8 +4448,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, - ConstantInt::get(PhiType, VF.Min * UF)), + Builder.CreateMul( + ScalarStepValue, + ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4448,15 +4460,17 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF.Min; ++i) - Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + Indices.push_back( + ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); Constant *StartOffset = ConstantVector::get(Indices); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(StartOffset, - Builder.CreateVectorSplat(VF.Min, ScalarStepValue), - "vector.gep")); + Builder.CreateMul( + StartOffset, + Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + "vector.gep")); VectorLoopValueMap.setVectorValue(P, Part, GEP); } } @@ -4483,7 +4497,7 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VPTransformState &State) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4571,7 +4585,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, setDebugLocFromInst(Builder, CI); /// Vectorize casts. - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); Type *DestTy = (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4601,7 +4615,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4633,7 +4647,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; if (VF.isVector()) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); @@ -4872,7 +4886,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -5357,7 +5371,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i].Min; + MaxVF = VFs[i].getKnownMinValue(); break; } } @@ -5558,8 +5572,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // Clamp the interleave ranges to reasonable counts. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + unsigned MaxInterleaveCount = + TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); // Check if the user has overridden the max. 
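For a widened pointer induction, the code above feeds the vector GEP the lane offsets {Part*VF + 0, ..., Part*VF + VF-1}, each multiplied by the scalar step. The offsets as plain integers (helper name illustrative, fixed-width VF assumed as in the surrounding asserts):

  #include <cstdint>
  #include <vector>

  std::vector<int64_t> pointerIndOffsets(unsigned Part, unsigned VF,
                                         int64_t ScalarStep) {
    std::vector<int64_t> Offsets;
    for (unsigned i = 0; i < VF; ++i)
      Offsets.push_back(int64_t(i + Part * VF) * ScalarStep);
    return Offsets;
  }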
if (VF == 1) { @@ -5573,7 +5588,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount); + MaxInterleaveCount = + std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5745,8 +5761,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - assert(!VF.Scalable && "scalable vectors not yet supported."); - return std::max(1, VF.Min * TypeSize / WidestRegister); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + return std::max(1, VF.getKnownMinValue() * TypeSize / + WidestRegister); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5973,19 +5990,20 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); unsigned ScalarCost = - VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first; + VF.getKnownMinValue() * + getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.Min), true, false); - assert(!VF.Scalable && "scalable vectors not yet supported."); + APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += - VF.Min * + VF.getKnownMinValue() * TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } @@ -6000,10 +6018,10 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (canBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.Min), false, true); + APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); } } @@ -6021,7 +6039,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); VectorizationCostTy Cost; // For each block. 
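calculateRegisterUsage above estimates, per live value, how many vector registers a fixed-width VF needs: VF * TypeSize / WidestRegister with integer division, never reported as fewer than one. As a plain helper (token-typed values are filtered out before this point and cost nothing):

  #include <algorithm>

  unsigned regsForValue(unsigned VF, unsigned TypeSizeBits,
                        unsigned WidestRegisterBits) {
    return std::max(1u, VF * TypeSizeBits / WidestRegisterBits);
  }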
@@ -6104,7 +6122,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Scalarization cost of instruction implies vectorization."); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6117,12 +6135,13 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + unsigned Cost = + VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF.Min * + Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS, TTI::TCK_RecipThroughput); @@ -6190,9 +6209,10 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF.Min - 1)); + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF.getKnownMinValue() - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, @@ -6218,7 +6238,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. @@ -6266,7 +6286,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && + assert(!VF.isScalable() && "the cost model is not yet implemented for scalable vectorization"); // If we know that this instruction will remain uniform, check the cost of // the scalar version. 
@@ -6282,22 +6302,24 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, auto InstSet = ForcedScalar->second; if (InstSet.count(I)) return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min), + (getInstructionCost(I, ElementCount::getFixed(1)).first * + VF.getKnownMinValue()), false); } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && - TTI.getNumberOfParts(VectorTy) < VF.Min; + bool TypeNotScalarized = + VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); return VectorizationCostTy(C, TypeNotScalarized); } unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && + assert(!VF.isScalable() && "cannot compute scalarization overhead for scalable vectorization"); if (VF.isScalar()) return 0; @@ -6307,7 +6329,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnesValue(VF.Min), true, false); + cast(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), + true, false); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6323,13 +6346,12 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. - return Cost + - TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), - VF.Min); + return Cost + TTI.getOperandsScalarizationOverhead( + filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); if (VF.isScalar()) return; NumPredStores = 0; @@ -6466,14 +6488,15 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Scalarize a widened load of address. setWideningDecision( I, VF, CM_Scalarize, - (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + (VF.getKnownMinValue() * + getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) setWideningDecision( Member, VF, CM_Scalarize, - (VF.Min * + (VF.getKnownMinValue() * getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } @@ -6515,12 +6538,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. 
- assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return (TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * + VF.getKnownMinValue())); } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); @@ -6537,9 +6562,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast(VectorTy), VF.Min - 1, - FixedVectorType::get(RetTy, 1)); + return TTI.getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, cast(VectorTy), + VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -6568,11 +6593,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += - VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6611,15 +6637,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - assert(!VF.Scalable && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6633,7 +6659,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); CondTy = VectorType::get(CondTy, VF); } return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, @@ -6745,8 +6771,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - assert(!VF.Scalable && "VF is assumed to be non scalable"); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -6761,9 +6787,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF.Min * - TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6870,7 +6895,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.Scalable && "scalable vectors not yet supported"); + assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6892,10 +6917,11 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF.Min, VF.Min); + buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) @@ -6912,9 +6938,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(!UserVF.Scalable && "scalable vectorization not yet handled"); + assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); + Optional MaybeMaxVF = + CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 
return None; @@ -6934,12 +6961,14 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (!UserVF.isZero()) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && + "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); + buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), + UserVF.getKnownMinValue()); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } @@ -7228,7 +7257,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - assert(!VF.Scalable && "unexpected scalable ElementCount"); + assert(!VF.isScalable() && "unexpected scalable ElementCount"); if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = @@ -7762,7 +7791,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( ElementCount VF = ElementCount::getFixed(Range.Start); Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) { + for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7986,7 +8015,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { - assert(!State.VF.Scalable && "VF is assumed to be non scalable."); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Undef = UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); @@ -7999,7 +8028,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF.Min; + unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a616de6eb4f076..679445455a4542 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -300,8 +300,9 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - assert(!State->VF.Scalable && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.Min; Lane < VF; ++Lane) { + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. 
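buildVPlanWithVPRecipes above registers every power-of-two VF in the half-open range [Range.Start, Range.End) with the plan, now by multiplying the ElementCount itself rather than its Min field. The candidate set as a plain enumeration, fixed-width only to match the assertions elsewhere in the patch:

  #include <vector>

  std::vector<unsigned> candidateVFs(unsigned Start, unsigned End) {
    std::vector<unsigned> VFs;
    for (unsigned VF = Start; VF < End; VF *= 2) // Start is a nonzero power of two
      VFs.push_back(VF);
    return VFs;   // e.g. Start=2, End=16 -> {2, 4, 8}
  }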
for (VPBlockBase *Block : RPOT) { @@ -388,7 +389,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *ScalarTC = State.TripCount; auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.Min); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); @@ -840,14 +841,16 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; - assert(!VF.Scalable && "the code following assumes non scalables ECs"); - Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.Min, CanonicalIV, - "broadcast"); + assert(!VF.isScalable() && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() + ? CanonicalIV + : Builder.CreateVectorSplat(VF.getKnownMinValue(), + CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector Indices; - for (unsigned Lane = 0; Lane < VF.Min; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF.Min + Lane)); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + Indices.push_back( + ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6eed236fc1493e..078b2ba1c70ac3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -151,14 +151,15 @@ struct VectorizerValueMap { /// \return True if the map has a scalar entry for \p Key and \p Instance. bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.Min && "Queried Scalar Lane is too large."); - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(Instance.Lane < VF.getKnownMinValue() && + "Queried Scalar Lane is too large."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.Min && + assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -197,7 +198,7 @@ struct VectorizerValueMap { // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. 
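The VectorizerValueMap assertions above check that a scalar entry is addressed by (Part, Lane) with Part < UF and Lane < the fixed-width VF, and that the stored parts have exactly those dimensions. A toy model of that shape, with a vector-of-vectors standing in for ScalarParts and void* for the mapped Value:

  #include <cassert>
  #include <vector>

  struct ScalarEntryModel {
    std::vector<std::vector<void *>> Parts; // UF x VF, null until set

    ScalarEntryModel(unsigned UF, unsigned VF)
        : Parts(UF, std::vector<void *>(VF, nullptr)) {}

    bool hasScalarValue(unsigned Part, unsigned Lane) const {
      assert(Part < Parts.size() && Lane < Parts[Part].size());
      return Parts[Part][Lane] != nullptr;
    }
  };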
for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.Min, nullptr); + Entry[Part].resize(VF.getKnownMinValue(), nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir index 550972e4a4f458..37a7b7bd010ddc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -135,34 +135,27 @@ body: | ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14 /* CC::al */, $noreg - ; CHECK: renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14 /* CC::al */, $noreg - ; CHECK: renamable $r4 = t2BICri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14 /* CC::al */, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - ; CHECK: renamable $r12 = t2SUBri killed renamable $r4, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: $vpr = VMSR_P0 $r5, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r3, $r12 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr + ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) ; CHECK: MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir index 96652c5d76e935..712faa59fb7d5c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir @@ -118,27 +118,16 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0 ; CHECK: $r0 = VMOVRS killed $s3, 14 /* CC::al */, $noreg, implicit killed $q0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir index 15166cece6acb3..9eb95d7e8072c0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -115,27 +115,16 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed 
renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0 ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir new file mode 100644 index 00000000000000..f013cb2f861569 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir @@ -0,0 +1,930 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc i32 @mul_var_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) { + entry: + %cmp9.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i8* %lsr.iv to <4 x i8>* + %lsr.iv1416 = bitcast i8* %lsr.iv14 to <4 x i8>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* 
%lsr.iv13, i32 1, <4 x i1> %8, <4 x i8> undef) + %10 = zext <4 x i8> %wide.masked.load to <4 x i32> + %wide.masked.load12 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv1416, i32 1, <4 x i1> %8, <4 x i8> undef) + %11 = zext <4 x i8> %wide.masked.load12 to <4 x i32> + %12 = mul nuw nsw <4 x i32> %11, %10 + %13 = select <4 x i1> %8, <4 x i32> %12, <4 x i32> zeroinitializer + %14 = add <4 x i32> %vec.phi, %13 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 4 + %scevgep15 = getelementptr i8, i8* %lsr.iv14, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + + define dso_local arm_aapcs_vfpcc i32 @add_var_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) { + entry: + %cmp10.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv15 = phi i8* [ %scevgep16, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv14 = bitcast i8* %lsr.iv to <4 x i8>* + %lsr.iv1517 = bitcast i8* %lsr.iv15 to <4 x i8>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv14, i32 1, <4 x i1> %8, <4 x i8> undef) + %10 = zext <4 x i8> %wide.masked.load to <4 x i32> + %wide.masked.load13 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv1517, i32 1, <4 x i1> %8, <4 x i8> undef) + %11 = zext <4 x i8> %wide.masked.load13 to <4 x i32> + %12 = add <4 x i32> %vec.phi, %10 + %13 = add <4 x i32> %12, %11 + %14 = select <4 x i1> %8, <4 x i32> %13, <4 x i32> %vec.phi + %scevgep = getelementptr i8, i8* %lsr.iv, i32 4 + %scevgep16 = getelementptr i8, i8* %lsr.iv15, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + + define dso_local arm_aapcs_vfpcc i32 @mul_var_i16(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { + entry: + %cmp9.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv14 = phi i16* [ 
%scevgep15, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1416 = bitcast i16* %lsr.iv14 to <4 x i16>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv13, i32 2, <4 x i1> %8, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load12 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1416, i32 2, <4 x i1> %8, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load12 to <4 x i32> + %12 = mul nsw <4 x i32> %11, %10 + %13 = select <4 x i1> %8, <4 x i32> %12, <4 x i32> zeroinitializer + %14 = add <4 x i32> %vec.phi, %13 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep15 = getelementptr i16, i16* %lsr.iv14, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + + define dso_local arm_aapcs_vfpcc i32 @add_var_i16(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { + entry: + %cmp10.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv15 = phi i16* [ %scevgep16, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1517 = bitcast i16* %lsr.iv15 to <4 x i16>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv14, i32 2, <4 x i1> %8, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load13 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1517, i32 2, <4 x i1> %8, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load13 to <4 x i32> + %12 = add <4 x i32> %vec.phi, %10 + %13 = add <4 x i32> %12, %11 + %14 = select <4 x i1> %8, <4 x i32> %13, <4 x i32> %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep16 = getelementptr i16, i16* %lsr.iv15, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ 
%17, %middle.block ] + ret i32 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc i32 @mul_var_i32(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv13 = phi i32* [ %scevgep14, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv12 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1315 = bitcast i32* %lsr.iv13 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv12, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load11 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1315, i32 4, <4 x i1> %8, <4 x i32> undef) + %10 = mul nsw <4 x i32> %wide.masked.load11, %wide.masked.load + %11 = select <4 x i1> %8, <4 x i32> %10, <4 x i32> zeroinitializer + %12 = add <4 x i32> %vec.phi, %11 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep14 = getelementptr i32, i32* %lsr.iv13, i32 4 + %13 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %14 = icmp ne i32 %13, 0 + br i1 %14, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %15, %middle.block ] + ret i32 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc i32 @add_var_i32(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) + %10 = add <4 x i32> %wide.masked.load, %vec.phi + %11 = add <4 x i32> %10, 
%wide.masked.load12 + %12 = select <4 x i1> %8, <4 x i32> %11, <4 x i32> %vec.phi + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %13 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %14 = icmp ne i32 %13, 0 + br i1 %14, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %15, %middle.block ] + ret i32 %res.0.lcssa + } + + declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + +... +--- +name: mul_var_i8 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: mul_var_i8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 0, $noreg :: (load 4 from %ir.lsr.iv13, align 1) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 0, $noreg :: (load 4 from %ir.lsr.iv1416, align 1) + ; CHECK: renamable $q1 = nuw nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef 
renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv13, align 1) + renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv1416, align 1) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = nuw nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: add_var_i8 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: add_var_i8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 0, $noreg :: (load 4 from %ir.lsr.iv14, align 1) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 0, $noreg :: (load 4 from %ir.lsr.iv1517, align 1) + ; CHECK: renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + 
frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv14, align 1) + renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv1517, align 1) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... +--- +name: mul_var_i16 +alignment: 2 +exposesReturnsTwice: false +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: mul_var_i16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: 
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv13, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, $noreg :: (load 8 from %ir.lsr.iv1416, align 2) + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv13, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1416, align 2) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: add_var_i16 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: add_var_i16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv14, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, $noreg :: (load 8 from %ir.lsr.iv1517, align 2) + ; CHECK: renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg 
+ frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv14, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1517, align 2) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... +--- +name: mul_var_i32 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: mul_var_i32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 
0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv12, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1315, align 4) + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv12, align 4) + renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1315, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: add_var_i32 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: add_var_i32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, 
$noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index a4961f51f32b80..0554742369fdcc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -4,38 +4,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr { ; CHECK-LABEL: one_loop_add_add_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: ittt eq -; CHECK-NEXT: moveq r0, #0 -; CHECK-NEXT: uxtbeq r0, r0 -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #15 -; CHECK-NEXT: sub.w r12, r3, #16 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #4 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_2: @ %vector.body +; CHECK-NEXT: cbz r2, .LBB0_4 +; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.8 lr, r2 +; CHECK: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 -; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vadd.i8 q1, q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 -; CHECK-NEXT: vadd.i8 q1, q1, q2 -; CHECK-NEXT: le lr, .LBB0_2 -; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vaddv.u8 r0, q0 -; CHECK-NEXT: pop.w {r7, lr} -; CHECK-NEXT: uxtb r0, r0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vldrb.u8 q2, [r0], #16 +; CHECK-NEXT: vadd.i8 q0, q2, q1 +; CHECK-NEXT: vaddv.u8 
r12, q0 +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: uxtb.w r0, r12 +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: uxtb.w r0, r12 +; CHECK-NEXT: pop {r7, pc} entry: %cmp11 = icmp eq i32 %N, 0 br i1 %cmp11, label %for.cond.cleanup, label %vector.ph @@ -56,19 +43,18 @@ vector.body: ; preds = %vector.body, %vecto %i2 = getelementptr inbounds i8, i8* %b, i32 %index %i3 = bitcast i8* %i2 to <16 x i8>* %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) - %i4 = add <16 x i8> %wide.masked.load, %vec.phi - %i5 = add <16 x i8> %i4, %wide.masked.load16 + %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16 + %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi + %i6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i5) %index.next = add i32 %index, 16 - %i6 = icmp eq i32 %index.next, %n.vec - br i1 %i6, label %middle.block, label %vector.body + %i7 = icmp eq i32 %index.next, %n.vec + br i1 %i7, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry - %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ] + %res.0.lcssa = phi i8 [ 0, %entry ], [ %i6, %middle.block ] ret i8 %res.0.lcssa } @@ -89,7 +75,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB1_2: @ %vector.body +; CHECK: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -155,16 +141,26 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK-NEXT: .LBB2_2: @ %vector.body +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #15 +; CHECK-NEXT: sub.w r12, r3, #16 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r1], #16 -; CHECK-NEXT: vldrb.u8 q2, [r0], #16 +; CHECK-NEXT: vctp.8 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u8 q1, [r1], #16 +; CHECK-NEXT: vldrbt.u8 q2, [r0], #16 +; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: vsub.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -215,16 +211,26 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB3_2: @ %vector.body +; CHECK-NEXT: adds r3, r2, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: sub.w r12, r3, #8 +; CHECK-NEXT: movs r3, 
#1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r0], #8 -; CHECK-NEXT: vldrb.u16 q2, [r1], #8 +; CHECK-NEXT: vctp.16 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 +; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vsub.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -277,16 +283,26 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK-NEXT: .LBB4_2: @ %vector.body +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #15 +; CHECK-NEXT: sub.w r12, r3, #16 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r0], #16 -; CHECK-NEXT: vldrb.u8 q2, [r1], #16 +; CHECK-NEXT: vctp.8 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 +; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 +; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: vmul.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -337,16 +353,26 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB5_2: @ %vector.body +; CHECK-NEXT: adds r3, r2, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: sub.w r12, r3, #8 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r0], #8 -; CHECK-NEXT: vldrb.u16 q2, [r1], #8 +; CHECK-NEXT: vctp.16 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 +; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vmul.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -406,7 +432,7 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB6_2: @ %vector.body +; CHECK: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vmov q0, q1 @@ -422,19 +448,25 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; 
CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vmov.32 q1[0], r12 -; CHECK-NEXT: .LBB6_5: @ %vector.body46 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u32 q0, [r0], #4 -; CHECK-NEXT: vldrb.u32 q2, [r1], #4 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u32 q0, [r0], #4 +; CHECK-NEXT: vldrbt.u32 q2, [r1], #4 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB6_5 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB6_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block44 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup7 ; CHECK-NEXT: mov r0, r12 @@ -527,7 +559,7 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB7_2: @ %vector.body +; CHECK: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -643,22 +675,31 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_5: @ %vector.ph +; CHECK-NEXT: adds r1, r3, #3 +; CHECK-NEXT: movs r2, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: movw r1, :lower16:days ; CHECK-NEXT: movt r1, :upper16:days ; CHECK-NEXT: movs r2, #52 ; CHECK-NEXT: mla r1, r4, r2, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: subs r0, r3, #1 -; CHECK-NEXT: dlstp.32 lr, r0 -; CHECK-NEXT: .LBB8_6: @ %vector.body +; CHECK: .LBB8_6: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB8_6 +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB8_6 ; CHECK-NEXT: @ %bb.7: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r4, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 34701aba6324a9..15aed3bd4e17a3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -53,24 +53,33 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: subs r4, r2, r6 -; ENABLED-NEXT: vmov.i32 q0, #0x0 +; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r8, r7, r0, lsr #2 +; ENABLED-NEXT: sub.w r0, r12, r6 +; ENABLED-NEXT: bic r0, r0, #3 +; ENABLED-NEXT: subs r0, #4 +; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: 
dlstp.32 lr, r4 +; ENABLED-NEXT: dls lr, r0 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload -; ENABLED-NEXT: .LBB0_6: @ %vector.body +; ENABLED: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vldrh.s32 q1, [r0], #8 -; ENABLED-NEXT: vldrh.s32 q2, [r7], #8 +; ENABLED-NEXT: vctp.32 r4 +; ENABLED-NEXT: vmov q0, q1 +; ENABLED-NEXT: vpstt +; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 +; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 ; ENABLED-NEXT: mov lr, r8 ; ENABLED-NEXT: vmul.i32 q1, q2, q1 ; ENABLED-NEXT: sub.w r8, r8, #1 ; ENABLED-NEXT: vshl.s32 q1, r5 -; ENABLED-NEXT: vadd.i32 q0, q1, q0 -; ENABLED-NEXT: letp lr, .LBB0_6 +; ENABLED-NEXT: subs r4, #4 +; ENABLED-NEXT: vadd.i32 q1, q1, q0 +; ENABLED-NEXT: le lr, .LBB0_6 ; ENABLED-NEXT: @ %bb.7: @ %middle.block ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 @@ -103,7 +112,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: adds r6, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 ; NOREDUCTIONS-NEXT: cmp r6, r3 -; NOREDUCTIONS-NEXT: beq .LBB0_8 +; NOREDUCTIONS: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 @@ -115,24 +124,33 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: subs r4, r2, r6 -; NOREDUCTIONS-NEXT: vmov.i32 q0, #0x0 +; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: sub.w r0, r12, r6 +; NOREDUCTIONS-NEXT: bic r0, r0, #3 +; NOREDUCTIONS-NEXT: subs r0, #4 +; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dlstp.32 lr, r4 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS: ldr r0, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; NOREDUCTIONS-NEXT: vldrh.s32 q1, [r0], #8 -; NOREDUCTIONS-NEXT: vldrh.s32 q2, [r7], #8 +; NOREDUCTIONS-NEXT: vctp.32 r4 +; NOREDUCTIONS-NEXT: vmov q0, q1 +; NOREDUCTIONS-NEXT: vpstt +; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 +; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 ; NOREDUCTIONS-NEXT: mov lr, r8 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 ; NOREDUCTIONS-NEXT: sub.w r8, r8, #1 ; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 -; NOREDUCTIONS-NEXT: vadd.i32 q0, q1, q0 -; NOREDUCTIONS-NEXT: letp lr, .LBB0_6 +; NOREDUCTIONS-NEXT: subs r4, #4 +; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 +; NOREDUCTIONS-NEXT: le lr, .LBB0_6 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir index 4f80869de3ccb2..cdc9d7e7be9c69 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -122,28 +122,18 @@ body: | ; CHECK: frame-setup 
CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index c6f64a8464bdc7..6628df20f2024e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -10,19 +10,28 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r3, r2, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB0_2: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; 
CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -78,17 +87,26 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB1_2: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -140,17 +158,26 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB2_2: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -203,7 +230,7 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB3_2: @ %vector.body +; CHECK: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -260,7 +287,7 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB4_2: @ %vector.body +; CHECK: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -317,7 +344,7 @@ define dso_local arm_aapcs_vfpcc void 
@vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 -; CHECK-NEXT: .LBB5_2: @ %vector.body +; CHECK: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 @@ -377,7 +404,7 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt ; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB6_2: @ %vector.body +; CHECK: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 595d70b715acd5..7578b429790bec 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -425,13 +425,8 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2WLS killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1 ; CHECK: tB %bb.4, 14 /* CC::al */, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -441,18 +436,15 @@ body: | ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $vpr = MVE_VCTP32 $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) - ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, killed renamable $vpr, undef renamable $q1 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: successors: %bb.4(0x80000000) ; CHECK: 
liveins: $q0, $q1, $r3 @@ -501,7 +493,7 @@ body: | renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg MVE_VPST 8, implicit $vpr - renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, renamable $vpr, undef renamable $q1 + renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 1, renamable $vpr, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr tB %bb.3, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir index d91556e3e70b93..e377b06fea9f89 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -133,23 +133,21 @@ body: | ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r3 ; CHECK: $r12 = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: dead $lr = MVE_DLSTP_32 renamable $r3 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12 = nsw t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2, $r3 ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir index 337816146e5f0b..05bfdbb2fc0f8d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -119,28 +119,18 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup 
CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll index 535d7a1c38cb7e..b5efcd1342db74 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -9,29 +9,27 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 -; CHECK-NEXT: vldrw.u32 q3, [q0, #80]! +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q3, [q0, #80]! 
+; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q1, q3, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 4294967228 @ 0xffffffbc -; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 -; CHECK-NEXT: .long 4294967268 @ 0xffffffe4 -; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 entry: ; preds = %middle. %add.us.us = add i32 4, %n %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us @@ -77,20 +75,26 @@ define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly ; CHECK-NEXT: adr r0, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 -; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1, q1, uxtw #2] -; CHECK-NEXT: vldrw.u32 q4, [r4], #16 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vldrwt.u32 q4, [r4], #16 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q2, q2, q4 -; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q2, [r1, q1, uxtw #2] ; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vpsel q0, q2, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r12, lsl #2] ; CHECK-NEXT: vpop {d8, d9} @@ -144,20 +148,26 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [q0, #80]! -; CHECK-NEXT: vadd.i32 q3, q3, q1 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q1, [q0, #80]! 
+; CHECK-NEXT: vadd.i32 q1, q3, q1 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vpsel q0, q1, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll index 311a06a6757711..5669fdf38fee01 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -9,22 +9,32 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: vidup.u32 q2, r6, #1 +; CHECK-NEXT: cmp r1, #4 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge.w r12, #4 +; CHECK-NEXT: sub.w r6, r1, r12 +; CHECK-NEXT: adds r6, #3 +; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: add.w lr, lr, r6, lsr #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov.i32 q3, #0x4 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: dlstp.32 lr, r12 -; CHECK-NEXT: .LBB0_1: @ %do.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vcmp.f32 ge, q1, q4 -; CHECK-NEXT: vpstt +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpstttt +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vcmpt.f32 ge, q1, q4 ; CHECK-NEXT: vmovt q1, q4 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_1 ; CHECK-NEXT: vdup.32 q3, r1 @@ -38,15 +48,6 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: vstr s8, [r2] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r6, r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 entry: %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 1) %1 = extractvalue { <4 x i32>, i32 } %0, 0 diff --git a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml index f1d14847699cb4..23262e0ff5e40f 100644 --- a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml +++ b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml @@ -277,9 +277,12 @@ DWARF: ## ^- abbreviation code (ULEB128) 0x04 ## ^- abbreviation code (ULEB128) 0x04 ## -# CODE-NEXT: 0x00000010 2e000000 052e0000 00062e00 0000 -## ^- abbreviation code ULEB128 -## ^- abbreviation code ULEB128 +# CODE-NEXT: 0x00000010 2e000000 052e0000 00062e00 00000001 +## ^- abbreviation code ULEB128 +## ^- abbreviation code ULEB128 +## ^- abbreviation code ULEB128 +# CODE-NEXT: 0x00000020 11000000 022e0000 0000 +## ^- abbreviation code ULEB128 --- !ELF FileHeader: @@ -303,6 +306,12 @@ DWARF: Children: DW_CHILDREN_no - Tag: DW_TAG_subprogram Children: DW_CHILDREN_no + - Table: + ## Test that the abbrev codes in a new table start from 1 by default. 
+ - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no ## i) Test that yaml2obj emits an error message when there are non-empty compilation units ## and multiple abbrev tables are assigned the same ID. diff --git a/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp b/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp index f1c4a8bea1f565..f76b8db602ec70 100644 --- a/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp +++ b/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp @@ -71,8 +71,8 @@ TEST(ScalableVectorMVTsTest, HelperFuncs) { // Check fields inside llvm::ElementCount EltCnt = Vnx4i32.getVectorElementCount(); - EXPECT_EQ(EltCnt.Min, 4U); - ASSERT_TRUE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 4U); + ASSERT_TRUE(EltCnt.isScalable()); // Check that fixed-length vector types aren't scalable. EVT V8i32 = EVT::getVectorVT(Ctx, MVT::i32, 8); @@ -82,8 +82,8 @@ TEST(ScalableVectorMVTsTest, HelperFuncs) { // Check that llvm::ElementCount works for fixed-length types. EltCnt = V8i32.getVectorElementCount(); - EXPECT_EQ(EltCnt.Min, 8U); - ASSERT_FALSE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 8U); + ASSERT_FALSE(EltCnt.isScalable()); } TEST(ScalableVectorMVTsTest, IRToVTTranslation) { diff --git a/llvm/unittests/IR/VectorTypesTest.cpp b/llvm/unittests/IR/VectorTypesTest.cpp index b28e445c97a795..7525ee9872b7ac 100644 --- a/llvm/unittests/IR/VectorTypesTest.cpp +++ b/llvm/unittests/IR/VectorTypesTest.cpp @@ -119,8 +119,8 @@ TEST(VectorTypesTest, FixedLength) { EXPECT_EQ(ConvTy->getElementType()->getScalarSizeInBits(), 64U); EltCnt = V8Int64Ty->getElementCount(); - EXPECT_EQ(EltCnt.Min, 8U); - ASSERT_FALSE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 8U); + ASSERT_FALSE(EltCnt.isScalable()); } TEST(VectorTypesTest, Scalable) { @@ -215,8 +215,8 @@ TEST(VectorTypesTest, Scalable) { EXPECT_EQ(ConvTy->getElementType()->getScalarSizeInBits(), 64U); EltCnt = ScV8Int64Ty->getElementCount(); - EXPECT_EQ(EltCnt.Min, 8U); - ASSERT_TRUE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 8U); + ASSERT_TRUE(EltCnt.isScalable()); } TEST(VectorTypesTest, BaseVectorType) { @@ -250,7 +250,7 @@ TEST(VectorTypesTest, BaseVectorType) { // test I == J VectorType *VI = VTys[I]; ElementCount ECI = VI->getElementCount(); - EXPECT_EQ(isa(VI), ECI.Scalable); + EXPECT_EQ(isa(VI), ECI.isScalable()); for (size_t J = I + 1, JEnd = VTys.size(); J < JEnd; ++J) { // test I < J