diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 69899fd8e66848..1192fbdc1c9d85 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -8457,7 +8457,8 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, case SVE::BI__builtin_sve_svlen_u64: { SVETypeFlags TF(Builtin->TypeModifier); auto VTy = cast(getSVEType(TF)); - auto NumEls = llvm::ConstantInt::get(Ty, VTy->getElementCount().Min); + auto *NumEls = + llvm::ConstantInt::get(Ty, VTy->getElementCount().getKnownMinValue()); Function *F = CGM.getIntrinsic(Intrinsic::vscale, Ty); return Builder.CreateMul(NumEls, Builder.CreateCall(F)); diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index d90cffb4bb95c4..8a85a24910e4e2 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -726,7 +726,7 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { { ASTContext::BuiltinVectorTypeInfo Info = CGM.getContext().getBuiltinVectorTypeInfo(BT); - unsigned NumElemsPerVG = (Info.EC.Min * Info.NumVectors) / 2; + unsigned NumElemsPerVG = (Info.EC.getKnownMinValue() * Info.NumVectors) / 2; // Debuggers can't extract 1bit from a vector, so will display a // bitpattern for svbool_t instead. diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 9c072d41607566..aede8a53ba9094 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -586,7 +586,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(cast(Ty)); return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), - Info.EC.Min * Info.NumVectors); + Info.EC.getKnownMinValue() * + Info.NumVectors); } case BuiltinType::Dependent: #define BUILTIN_TYPE(Id, SingletonId) diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index b07e9c3faf9d53..a9f326439a2a57 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -197,18 +197,58 @@ static syntax::NodeKind getOperatorNodeKind(const CXXOperatorCallExpr &E) { llvm_unreachable("Unknown OverloadedOperatorKind enum"); } +/// Get the start of the qualified name. In the examples below it gives the +/// location of the `^`: +/// `int ^a;` +/// `int *^a;` +/// `int ^a::S::f(){}` +static SourceLocation getQualifiedNameStart(NamedDecl *D) { + assert((isa(D)) && + "only DeclaratorDecl and TypedefNameDecl are supported."); + + auto DN = D->getDeclName(); + bool IsAnonymous = DN.isIdentifier() && !DN.getAsIdentifierInfo(); + if (IsAnonymous) + return SourceLocation(); + + if (const auto *DD = dyn_cast(D)) { + if (DD->getQualifierLoc()) { + return DD->getQualifierLoc().getBeginLoc(); + } + } + + return D->getLocation(); +} + +/// Gets the range of the initializer inside an init-declarator C++ [dcl.decl]. +/// `int a;` -> range of ``, +/// `int *a = nullptr` -> range of `= nullptr`. +/// `int a{}` -> range of `{}`. +/// `int a()` -> range of `()`. +static SourceRange getInitializerRange(Decl *D) { + if (auto *V = dyn_cast(D)) { + auto *I = V->getInit(); + // Initializers in range-based-for are not part of the declarator + if (I && !V->isCXXForRangeDecl()) + return I->getSourceRange(); + } + + return SourceRange(); +} + /// Gets the range of declarator as defined by the C++ grammar. E.g. 
/// `int a;` -> range of `a`, /// `int *a;` -> range of `*a`, /// `int a[10];` -> range of `a[10]`, /// `int a[1][2][3];` -> range of `a[1][2][3]`, /// `int *a = nullptr` -> range of `*a = nullptr`. -/// FIMXE: \p Name must be a source range, e.g. for `operator+`. +/// `int S::f(){}` -> range of `S::f()`. +/// FIXME: \p Name must be a source range. static SourceRange getDeclaratorRange(const SourceManager &SM, TypeLoc T, SourceLocation Name, SourceRange Initializer) { SourceLocation Start = GetStartLoc().Visit(T); - SourceLocation End = T.getSourceRange().getEnd(); + SourceLocation End = T.getEndLoc(); assert(End.isValid()); if (Name.isValid()) { if (Start.isInvalid()) @@ -378,11 +418,9 @@ class syntax::TreeBuilder { /// Returns true if \p D is the last declarator in a chain and is thus /// reponsible for creating SimpleDeclaration for the whole chain. - template - bool isResponsibleForCreatingDeclaration(const T *D) const { - static_assert((std::is_base_of::value || - std::is_base_of::value), - "only DeclaratorDecl and TypedefNameDecl are supported."); + bool isResponsibleForCreatingDeclaration(const Decl *D) const { + assert((isa(D)) && + "only DeclaratorDecl and TypedefNameDecl are supported."); const Decl *Next = D->getNextDeclInContext(); @@ -390,15 +428,14 @@ class syntax::TreeBuilder { if (Next == nullptr) { return true; } - const auto *NextT = dyn_cast(Next); // Next sibling is not the same type, this one is responsible. - if (NextT == nullptr) { + if (D->getKind() != Next->getKind()) { return true; } // Next sibling doesn't begin at the same loc, it must be a different // declaration, so this declarator is responsible. - if (NextT->getBeginLoc() != D->getBeginLoc()) { + if (Next->getBeginLoc() != D->getBeginLoc()) { return true; } @@ -1405,43 +1442,12 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } private: - template SourceLocation getQualifiedNameStart(T *D) { - static_assert((std::is_base_of::value || - std::is_base_of::value), - "only DeclaratorDecl and TypedefNameDecl are supported."); - - auto DN = D->getDeclName(); - bool IsAnonymous = DN.isIdentifier() && !DN.getAsIdentifierInfo(); - if (IsAnonymous) - return SourceLocation(); - - if (const auto *DD = dyn_cast(D)) { - if (DD->getQualifierLoc()) { - return DD->getQualifierLoc().getBeginLoc(); - } - } - - return D->getLocation(); - } - - SourceRange getInitializerRange(Decl *D) { - if (auto *V = dyn_cast(D)) { - auto *I = V->getInit(); - // Initializers in range-based-for are not part of the declarator - if (I && !V->isCXXForRangeDecl()) - return I->getSourceRange(); - } - - return SourceRange(); - } - /// Folds SimpleDeclarator node (if present) and in case this is the last /// declarator in the chain it also folds SimpleDeclaration node. template bool processDeclaratorAndDeclaration(T *D) { - SourceRange Initializer = getInitializerRange(D); - auto Range = getDeclaratorRange(Builder.sourceManager(), - D->getTypeSourceInfo()->getTypeLoc(), - getQualifiedNameStart(D), Initializer); + auto Range = getDeclaratorRange( + Builder.sourceManager(), D->getTypeSourceInfo()->getTypeLoc(), + getQualifiedNameStart(D), getInitializerRange(D)); // There doesn't have to be a declarator (e.g. `void foo(int)` only has // declaration, but no declarator). @@ -1464,10 +1470,8 @@ class BuildTreeVisitor : public RecursiveASTVisitor { auto ReturnedType = L.getReturnLoc(); // Build node for the declarator, if any. 
- auto ReturnDeclaratorRange = - getDeclaratorRange(this->Builder.sourceManager(), ReturnedType, - /*Name=*/SourceLocation(), - /*Initializer=*/SourceLocation()); + auto ReturnDeclaratorRange = SourceRange(GetStartLoc().Visit(ReturnedType), + ReturnedType.getEndLoc()); syntax::SimpleDeclarator *ReturnDeclarator = nullptr; if (ReturnDeclaratorRange.isValid()) { ReturnDeclarator = new (allocator()) syntax::SimpleDeclarator; diff --git a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp index a07187e22e930c..aab20008a49748 100644 --- a/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/BuildTreeTest.cpp @@ -3123,6 +3123,35 @@ SimpleDeclaration )txt"})); } +TEST_P(SyntaxTreeTest, OutOfLineMemberFunctionDefinition) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct S { + void f(); +}; +[[void S::f(){}]] +)cpp", + {R"txt( +SimpleDeclaration +|-'void' +|-SimpleDeclarator Declarator +| |-NestedNameSpecifier +| | |-IdentifierNameSpecifier ListElement +| | | `-'S' +| | `-'::' ListDelimiter +| |-'f' +| `-ParametersAndQualifiers +| |-'(' OpenParen +| `-')' CloseParen +`-CompoundStatement + |-'{' OpenParen + `-'}' CloseParen +)txt"})); +} + TEST_P(SyntaxTreeTest, ConversionMemberFunction) { if (!GetParam().isCXX()) { return; @@ -3792,6 +3821,53 @@ TranslationUnit Detached )txt")); } +TEST_P(SyntaxTreeTest, InitDeclarator_Brace) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +int a {}; +)cpp", + R"txt( +TranslationUnit Detached +`-SimpleDeclaration + |-'int' + |-SimpleDeclarator Declarator + | |-'a' + | `-UnknownExpression + | `-UnknownExpression + | |-'{' + | `-'}' + `-';' +)txt")); +} + +TEST_P(SyntaxTreeTest, InitDeclarator_Paren) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqualOnAnnotations( + R"cpp( +struct S { + S(int); +}; +[[S s(1);]] +)cpp", + {R"txt( +SimpleDeclaration +|-'S' +|-SimpleDeclarator Declarator +| `-UnknownExpression +| |-'s' +| |-'(' +| |-IntegerLiteralExpression +| | `-'1' LiteralToken +| `-')' +`-';' +)txt"})); +} + TEST_P(SyntaxTreeTest, ArrayDeclarator_Simple) { EXPECT_TRUE(treeDumpEqual( R"cpp( diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index a3e624842700b5..ffbec74c61d029 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -130,8 +130,8 @@ class IntrinsicCostAttributes { unsigned Factor); IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, ElementCount Factor) - : IntrinsicCostAttributes(Id, CI, Factor.Min) { - assert(!Factor.Scalable); + : IntrinsicCostAttributes(Id, CI, Factor.getKnownMinValue()) { + assert(!Factor.isScalable()); } IntrinsicCostAttributes(Intrinsic::ID Id, const CallBase &CI, diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 527bba67b2579d..074960e7ced203 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -115,7 +115,7 @@ struct VFShape { Parameters.push_back( VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate})); - return {EC.Min, EC.Scalable, Parameters}; + return {EC.getKnownMinValue(), EC.isScalable(), Parameters}; } /// Sanity check on the Parameters in the VFShape. 
bool hasValidParameterList() const; diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index a8a436337e0756..1d2251dbededb6 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -202,6 +202,11 @@ class ReachingDefAnalysis : public MachineFunctionPass { void getGlobalUses(MachineInstr *MI, int PhysReg, InstSet &Uses) const; + /// Collect all possible definitions of the value stored in PhysReg, which is + /// used by MI. + void getGlobalReachingDefs(MachineInstr *MI, int PhysReg, + InstSet &Defs) const; + /// Return whether From can be moved forwards to just before To. bool isSafeToMoveForwards(MachineInstr *From, MachineInstr *To) const; diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h index db8161caf7d2e9..2e172bf422537b 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.h +++ b/llvm/include/llvm/CodeGen/ValueTypes.h @@ -304,7 +304,7 @@ namespace llvm { /// Given a vector type, return the minimum number of elements it contains. unsigned getVectorMinNumElements() const { - return getVectorElementCount().Min; + return getVectorElementCount().getKnownMinValue(); } /// Return the size of the specified value type in bits. @@ -383,7 +383,7 @@ namespace llvm { EVT getHalfNumVectorElementsVT(LLVMContext &Context) const { EVT EltVT = getVectorElementType(); auto EltCnt = getVectorElementCount(); - assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!"); + assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!"); return EVT::getVectorVT(Context, EltVT, EltCnt / 2); } @@ -398,7 +398,8 @@ namespace llvm { EVT getPow2VectorType(LLVMContext &Context) const { if (!isPow2VectorType()) { ElementCount NElts = getVectorElementCount(); - NElts.Min = 1 << Log2_32_Ceil(NElts.Min); + unsigned NewMinCount = 1 << Log2_32_Ceil(NElts.getKnownMinValue()); + NElts = ElementCount::get(NewMinCount, NElts.isScalable()); return EVT::getVectorVT(Context, getVectorElementType(), NElts); } else { diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 579275ab1f8222..a58f381300161f 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -696,9 +696,9 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const { case Type::ScalableVectorTyID: { VectorType *VTy = cast(Ty); auto EltCnt = VTy->getElementCount(); - uint64_t MinBits = EltCnt.Min * - getTypeSizeInBits(VTy->getElementType()).getFixedSize(); - return TypeSize(MinBits, EltCnt.Scalable); + uint64_t MinBits = EltCnt.getKnownMinValue() * + getTypeSizeInBits(VTy->getElementType()).getFixedSize(); + return TypeSize(MinBits, EltCnt.isScalable()); } default: llvm_unreachable("DataLayout::getTypeSizeInBits(): Unsupported type"); diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 1837f6808f24dc..25ece7a060fd45 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -426,16 +426,16 @@ class VectorType : public Type { unsigned getNumElements() const { ElementCount EC = getElementCount(); #ifdef STRICT_FIXED_SIZE_VECTORS - assert(!EC.Scalable && + assert(!EC.isScalable() && "Request for fixed number of elements from scalable vector"); - return EC.Min; + return EC.getKnownMinValue(); #else - if (EC.Scalable) + if (EC.isScalable()) WithColor::warning() << "The code that requested the fixed number of elements has made " "the assumption that this 
vector is not scalable. This assumption " "was not correct, and this may lead to broken code\n"; - return EC.Min; + return EC.getKnownMinValue(); #endif } @@ -512,8 +512,8 @@ class VectorType : public Type { /// input type and the same element type. static VectorType *getHalfElementsVectorType(VectorType *VTy) { auto EltCnt = VTy->getElementCount(); - assert ((EltCnt.Min & 1) == 0 && - "Cannot halve vector with odd number of elements."); + assert(EltCnt.isKnownEven() && + "Cannot halve vector with odd number of elements."); return VectorType::get(VTy->getElementType(), EltCnt/2); } @@ -521,7 +521,8 @@ class VectorType : public Type { /// input type and the same element type. static VectorType *getDoubleElementsVectorType(VectorType *VTy) { auto EltCnt = VTy->getElementCount(); - assert((EltCnt.Min * 2ull) <= UINT_MAX && "Too many elements in vector"); + assert((EltCnt.getKnownMinValue() * 2ull) <= UINT_MAX && + "Too many elements in vector"); return VectorType::get(VTy->getElementType(), EltCnt * 2); } diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index ac7ce75a9f310b..7b41dced564d4c 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -2046,8 +2046,9 @@ class ShuffleVectorInst : public Instruction { /// Examples: shufflevector <4 x n> A, <4 x n> B, <1,2,3> /// shufflevector <4 x n> A, <4 x n> B, <1,2,3,4,5> bool changesLength() const { - unsigned NumSourceElts = - cast(Op<0>()->getType())->getElementCount().Min; + unsigned NumSourceElts = cast(Op<0>()->getType()) + ->getElementCount() + .getKnownMinValue(); unsigned NumMaskElts = ShuffleMask.size(); return NumSourceElts != NumMaskElts; } diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h index 172d4fd8c27530..d88efcef731f7a 100644 --- a/llvm/include/llvm/Support/MachineValueType.h +++ b/llvm/include/llvm/Support/MachineValueType.h @@ -424,7 +424,7 @@ namespace llvm { MVT getHalfNumVectorElementsVT() const { MVT EltVT = getVectorElementType(); auto EltCnt = getVectorElementCount(); - assert(!(EltCnt.Min & 1) && "Splitting vector, but not in half!"); + assert(EltCnt.isKnownEven() && "Splitting vector, but not in half!"); return getVectorVT(EltVT, EltCnt / 2); } @@ -742,7 +742,7 @@ namespace llvm { /// Given a vector type, return the minimum number of elements it contains. unsigned getVectorMinNumElements() const { - return getVectorElementCount().Min; + return getVectorElementCount().getKnownMinValue(); } /// Returns the size of the specified MVT in bits. @@ -1207,9 +1207,9 @@ namespace llvm { } static MVT getVectorVT(MVT VT, ElementCount EC) { - if (EC.Scalable) - return getScalableVectorVT(VT, EC.Min); - return getVectorVT(VT, EC.Min); + if (EC.isScalable()) + return getScalableVectorVT(VT, EC.getKnownMinValue()); + return getVectorVT(VT, EC.getKnownMinValue()); } /// Return the value type corresponding to the specified type. This returns diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 3f67e0cfc63ee8..fe564b634ec725 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -26,6 +26,11 @@ namespace llvm { template struct DenseMapInfo; class ElementCount { +private: + unsigned Min; // Minimum number of vector elements. + bool Scalable; // If true, NumElements is a multiple of 'Min' determined + // at runtime rather than compile time. 
+ public: /// Prevent code from using initializer-list contructors like /// ElementCount EC = {, }. The static `get*` @@ -35,10 +40,6 @@ class ElementCount { ElementCount(unsigned Min, bool Scalable) : Min(Min), Scalable(Scalable) {} public: - unsigned Min; // Minimum number of vector elements. - bool Scalable; // If true, NumElements is a multiple of 'Min' determined - // at runtime rather than compile time. - ElementCount() = default; ElementCount operator*(unsigned RHS) { @@ -58,6 +59,16 @@ class ElementCount { bool operator==(unsigned RHS) const { return Min == RHS && !Scalable; } bool operator!=(unsigned RHS) const { return !(*this == RHS); } + ElementCount &operator*=(unsigned RHS) { + Min *= RHS; + return *this; + } + + ElementCount &operator/=(unsigned RHS) { + Min /= RHS; + return *this; + } + ElementCount NextPowerOf2() const { return {(unsigned)llvm::NextPowerOf2(Min), Scalable}; } @@ -81,11 +92,21 @@ class ElementCount { /// ///@{ No elements.. bool isZero() const { return Min == 0; } + /// At least one element. + bool isNonZero() const { return Min != 0; } + /// A return value of true indicates we know at compile time that the number + /// of elements (vscale * Min) is definitely even. However, returning false + /// does not guarantee that the total number of elements is odd. + bool isKnownEven() const { return (Min & 0x1) == 0; } /// Exactly one element. bool isScalar() const { return !Scalable && Min == 1; } /// One or more elements. bool isVector() const { return (Scalable && Min != 0) || Min > 1; } ///@} + + unsigned getKnownMinValue() const { return Min; } + + bool isScalable() const { return Scalable; } }; /// Stream operator function for `ElementCount`. @@ -322,10 +343,11 @@ template <> struct DenseMapInfo { return ElementCount::getFixed(~0U - 1); } static unsigned getHashValue(const ElementCount& EltCnt) { - if (EltCnt.Scalable) - return (EltCnt.Min * 37U) - 1U; + unsigned HashVal = EltCnt.getKnownMinValue() * 37U; + if (EltCnt.isScalable()) + return (HashVal - 1U); - return EltCnt.Min * 37U; + return HashVal; } static bool isEqual(const ElementCount& LHS, const ElementCount& RHS) { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 3746bc66e426a4..eb41257bf5ad52 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4550,7 +4550,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, unsigned MaskNumElts = Mask.size(); ElementCount InVecEltCount = InVecTy->getElementCount(); - bool Scalable = InVecEltCount.Scalable; + bool Scalable = InVecEltCount.isScalable(); SmallVector Indices; Indices.assign(Mask.begin(), Mask.end()); @@ -4559,7 +4559,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, // replace that input vector with undef. 
if (!Scalable) { bool MaskSelects0 = false, MaskSelects1 = false; - unsigned InVecNumElts = InVecEltCount.Min; + unsigned InVecNumElts = InVecEltCount.getKnownMinValue(); for (unsigned i = 0; i != MaskNumElts; ++i) { if (Indices[i] == -1) continue; @@ -4588,7 +4588,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, // is not known at compile time for scalable vectors if (!Scalable && Op0Const && !Op1Const) { std::swap(Op0, Op1); - ShuffleVectorInst::commuteShuffleMask(Indices, InVecEltCount.Min); + ShuffleVectorInst::commuteShuffleMask(Indices, + InVecEltCount.getKnownMinValue()); } // A splat of an inserted scalar constant becomes a vector constant: diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp index 56155b28132eb6..4cdffa63135fb7 100644 --- a/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/llvm/lib/Analysis/VFABIDemangling.cpp @@ -442,7 +442,7 @@ Optional VFABI::tryDemangleForVFABI(StringRef MangledName, if (!F) return None; const ElementCount EC = getECFromSignature(F->getFunctionType()); - VF = EC.Min; + VF = EC.getKnownMinValue(); } // Sanity checks. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f8a5cecc16a816..b592412ed0b6f3 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4808,7 +4808,8 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly) { auto *VTy = cast(Op->getOperand(0)->getType()); unsigned IdxOp = Op->getOpcode() == Instruction::InsertElement ? 2 : 1; auto *Idx = dyn_cast(Op->getOperand(IdxOp)); - if (!Idx || Idx->getZExtValue() >= VTy->getElementCount().Min) + if (!Idx || + Idx->getZExtValue() >= VTy->getElementCount().getKnownMinValue()) return true; return false; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 329cbe4020a3f4..f2630903dba1c0 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -970,7 +970,7 @@ void ModuleBitcodeWriter::writeTypeTable() { // VECTOR [numelts, eltty] or // [numelts, eltty, scalable] Code = bitc::TYPE_CODE_VECTOR; - TypeVals.push_back(VT->getElementCount().Min); + TypeVals.push_back(VT->getElementCount().getKnownMinValue()); TypeVals.push_back(VE.getTypeID(VT->getElementType())); if (isa(VT)) TypeVals.push_back(true); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 86b5d2055f5500..2034fd0730ebdb 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6957,10 +6957,10 @@ class VectorPromoteHelper { if (UseSplat) return ConstantVector::getSplat(EC, Val); - if (!EC.Scalable) { + if (!EC.isScalable()) { SmallVector ConstVec; UndefValue *UndefVal = UndefValue::get(Val->getType()); - for (unsigned Idx = 0; Idx != EC.Min; ++Idx) { + for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) { if (Idx == ExtractIdx) ConstVec.push_back(Val); else diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index cb53ea47e79fc9..5a4837079bed98 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -389,6 +389,19 @@ ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg, } } +void +ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI, int PhysReg, + InstSet &Defs) const { + if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) { + Defs.insert(Def); + return; + } + + SmallPtrSet Visited; + for 
(auto *MBB : MI->getParent()->predecessors()) + getLiveOuts(MBB, PhysReg, Defs); +} + void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg, InstSet &Defs) const { SmallPtrSet VisitedBBs; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 59edd03b7ec82c..31ac5d92ffe63e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18994,7 +18994,7 @@ static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) { // check the other type in the cast to make sure this is really legal. EVT VT = N->getValueType(0); EVT SrcEltVT = SrcVT.getVectorElementType(); - unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands(); + ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands(); EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); switch (CastOpcode) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 85dc150e146130..1a2c77974c2b93 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -428,10 +428,10 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // vector widening case (e.g. <2 x float> -> <4 x float>). Extract the // elements we want. if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) { - assert((PartEVT.getVectorElementCount().Min > - ValueVT.getVectorElementCount().Min) && - (PartEVT.getVectorElementCount().Scalable == - ValueVT.getVectorElementCount().Scalable) && + assert((PartEVT.getVectorElementCount().getKnownMinValue() > + ValueVT.getVectorElementCount().getKnownMinValue()) && + (PartEVT.getVectorElementCount().isScalable() == + ValueVT.getVectorElementCount().isScalable()) && "Cannot narrow, it would be a lossy transformation"); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, DAG.getVectorIdxConstant(0, DL)); @@ -3751,7 +3751,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (IsVectorGEP && !N.getValueType().isVector()) { LLVMContext &Context = *DAG.getContext(); EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorElementCount); - if (VectorElementCount.Scalable) + if (VectorElementCount.isScalable()) N = DAG.getSplatVector(VT, dl, N); else N = DAG.getSplatBuildVector(VT, dl, N); @@ -3824,7 +3824,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { if (!IdxN.getValueType().isVector() && IsVectorGEP) { EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorElementCount); - if (VectorElementCount.Scalable) + if (VectorElementCount.isScalable()) IdxN = DAG.getSplatVector(VT, dl, IdxN); else IdxN = DAG.getSplatBuildVector(VT, dl, IdxN); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index c71bf7c74808e2..fe9feb5f116b6a 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -499,14 +499,14 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI, // Return false if any operands requested for folding are not foldable (not // part of the stackmap's live values). 
for (unsigned Op : Ops) { - // Caller is expected to avoid passing in tied operands - assert(!MI.getOperand(Op).isTied()); if (Op < NumDefs) { assert(DefToFoldIdx == MI.getNumOperands() && "Folding multiple defs"); DefToFoldIdx = Op; } else if (Op < StartIdx) { return nullptr; } + if (MI.getOperand(Op).isTied()) + return nullptr; } MachineInstr *NewMI = diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 40bb45a584dbdb..958bb7939046b3 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -964,23 +964,24 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT, // Scalable vectors cannot be scalarized, so splitting or widening is // required. - if (VT.isScalableVector() && !isPowerOf2_32(EC.Min)) + if (VT.isScalableVector() && !isPowerOf2_32(EC.getKnownMinValue())) llvm_unreachable( "Splitting or widening of non-power-of-2 MVTs is not implemented."); // FIXME: We don't support non-power-of-2-sized vectors for now. // Ideally we could break down into LHS/RHS like LegalizeDAG does. - if (!isPowerOf2_32(EC.Min)) { + if (!isPowerOf2_32(EC.getKnownMinValue())) { // Split EC to unit size (scalable property is preserved). - NumVectorRegs = EC.Min; - EC = EC / NumVectorRegs; + NumVectorRegs = EC.getKnownMinValue(); + EC = ElementCount::getFixed(1); } // Divide the input until we get to a supported size. This will // always end up with an EC that represent a scalar or a scalable // scalar. - while (EC.Min > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, EC))) { - EC.Min >>= 1; + while (EC.getKnownMinValue() > 1 && + !TLI->isTypeLegal(MVT::getVectorVT(EltTy, EC))) { + EC /= 2; NumVectorRegs <<= 1; } @@ -1315,13 +1316,15 @@ void TargetLoweringBase::computeRegisterProperties( } case TypeWidenVector: - if (isPowerOf2_32(EC.Min)) { + if (isPowerOf2_32(EC.getKnownMinValue())) { // Try to widen the vector. for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) { MVT SVT = (MVT::SimpleValueType) nVT; if (SVT.getVectorElementType() == EltVT && SVT.isScalableVector() == IsScalable && - SVT.getVectorElementCount().Min > EC.Min && isTypeLegal(SVT)) { + SVT.getVectorElementCount().getKnownMinValue() > + EC.getKnownMinValue() && + isTypeLegal(SVT)) { TransformToType[i] = SVT; RegisterTypeForVT[i] = SVT; NumRegistersForVT[i] = 1; @@ -1365,10 +1368,10 @@ void TargetLoweringBase::computeRegisterProperties( ValueTypeActions.setTypeAction(VT, TypeScalarizeVector); else if (PreferredAction == TypeSplitVector) ValueTypeActions.setTypeAction(VT, TypeSplitVector); - else if (EC.Min > 1) + else if (EC.getKnownMinValue() > 1) ValueTypeActions.setTypeAction(VT, TypeSplitVector); else - ValueTypeActions.setTypeAction(VT, EC.Scalable + ValueTypeActions.setTypeAction(VT, EC.isScalable() ? TypeScalarizeScalableVector : TypeScalarizeVector); } else { @@ -1426,7 +1429,8 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT // This handles things like <2 x float> -> <4 x float> and // <4 x i1> -> <4 x i32>. 
LegalizeTypeAction TA = getTypeAction(Context, VT); - if (EltCnt.Min != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) { + if (EltCnt.getKnownMinValue() != 1 && + (TA == TypeWidenVector || TA == TypePromoteInteger)) { EVT RegisterEVT = getTypeToTransformTo(Context, VT); if (isTypeLegal(RegisterEVT)) { IntermediateVT = RegisterEVT; @@ -1443,7 +1447,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT // Scalable vectors cannot be scalarized, so handle the legalisation of the // types like done elsewhere in SelectionDAG. - if (VT.isScalableVector() && !isPowerOf2_32(EltCnt.Min)) { + if (VT.isScalableVector() && !isPowerOf2_32(EltCnt.getKnownMinValue())) { LegalizeKind LK; EVT PartVT = VT; do { @@ -1452,15 +1456,15 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT PartVT = LK.second; } while (LK.first != TypeLegal); - NumIntermediates = - VT.getVectorElementCount().Min / PartVT.getVectorElementCount().Min; + NumIntermediates = VT.getVectorElementCount().getKnownMinValue() / + PartVT.getVectorElementCount().getKnownMinValue(); // FIXME: This code needs to be extended to handle more complex vector // breakdowns, like nxv7i64 -> nxv8i64 -> 4 x nxv2i64. Currently the only // supported cases are vectors that are broken down into equal parts // such as nxv6i64 -> 3 x nxv2i64. - assert(NumIntermediates * PartVT.getVectorElementCount().Min == - VT.getVectorElementCount().Min && + assert((PartVT.getVectorElementCount() * NumIntermediates) == + VT.getVectorElementCount() && "Expected an integer multiple of PartVT"); IntermediateVT = PartVT; RegisterVT = getRegisterType(Context, IntermediateVT); @@ -1469,16 +1473,16 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally // we could break down into LHS/RHS like LegalizeDAG does. - if (!isPowerOf2_32(EltCnt.Min)) { - NumVectorRegs = EltCnt.Min; - EltCnt.Min = 1; + if (!isPowerOf2_32(EltCnt.getKnownMinValue())) { + NumVectorRegs = EltCnt.getKnownMinValue(); + EltCnt = ElementCount::getFixed(1); } // Divide the input until we get to a supported size. This will always // end with a scalar if the target doesn't support vectors. - while (EltCnt.Min > 1 && + while (EltCnt.getKnownMinValue() > 1 && !isTypeLegal(EVT::getVectorVT(Context, EltTy, EltCnt))) { - EltCnt.Min >>= 1; + EltCnt /= 2; NumVectorRegs <<= 1; } diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index c3974519bfc657..72eda95e72c861 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -122,13 +122,13 @@ EVT EVT::getExtendedVectorElementType() const { unsigned EVT::getExtendedVectorNumElements() const { assert(isExtended() && "Type is not extended!"); ElementCount EC = cast(LLVMTy)->getElementCount(); - if (EC.Scalable) { + if (EC.isScalable()) { WithColor::warning() << "The code that requested the fixed number of elements has made the " "assumption that this vector is not scalable. This assumption was " "not correct, and this may lead to broken code\n"; } - return EC.Min; + return EC.getKnownMinValue(); } ElementCount EVT::getExtendedVectorElementCount() const { @@ -150,9 +150,9 @@ std::string EVT::getEVTString() const { switch (V.SimpleTy) { default: if (isVector()) - return (isScalableVector() ? "nxv" : "v") - + utostr(getVectorElementCount().Min) - + getVectorElementType().getEVTString(); + return (isScalableVector() ? 
"nxv" : "v") + + utostr(getVectorElementCount().getKnownMinValue()) + + getVectorElementType().getEVTString(); if (isInteger()) return "i" + utostr(getSizeInBits()); if (isFloatingPoint()) diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 0b2ac8582a62b1..8cb1883da68e42 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -656,9 +656,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { VectorType *PTy = cast(Ty); ElementCount EC = PTy->getElementCount(); OS << "<"; - if (EC.Scalable) + if (EC.isScalable()) OS << "vscale x "; - OS << EC.Min << " x "; + OS << EC.getKnownMinValue() << " x "; print(PTy->getElementType(), OS); OS << '>'; return; diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index e5c4250665e41b..468dce95a29ad0 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -931,7 +931,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, // If the mask is all zeros this is a splat, no need to go through all // elements. if (all_of(Mask, [](int Elt) { return Elt == 0; }) && - !MaskEltCount.Scalable) { + !MaskEltCount.isScalable()) { Type *Ty = IntegerType::get(V1->getContext(), 32); Constant *Elt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, 0)); @@ -942,7 +942,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2, if (isa(V1VTy)) return nullptr; - unsigned SrcNumElts = V1VTy->getElementCount().Min; + unsigned SrcNumElts = V1VTy->getElementCount().getKnownMinValue(); // Loop over the shuffle mask, evaluating each element. SmallVector Result; @@ -2056,11 +2056,12 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, SmallVector ResElts; Type *Ty = IntegerType::get(C1->getContext(), 32); // Compare the elements, producing an i1 result or constant expr. - for (unsigned i = 0, e = C1VTy->getElementCount().Min; i != e; ++i) { + for (unsigned I = 0, E = C1VTy->getElementCount().getKnownMinValue(); + I != E; ++I) { Constant *C1E = - ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i)); + ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, I)); Constant *C2E = - ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i)); + ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, I)); ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E)); } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 8d960ea9a5faa6..d84c7bc2da9dbe 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1300,14 +1300,14 @@ Constant *ConstantVector::getImpl(ArrayRef V) { } Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { - if (!EC.Scalable) { + if (!EC.isScalable()) { // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. if ((isa(V) || isa(V)) && ConstantDataSequential::isElementTypeCompatible(V->getType())) - return ConstantDataVector::getSplat(EC.Min, V); + return ConstantDataVector::getSplat(EC.getKnownMinValue(), V); - SmallVector Elts(EC.Min, V); + SmallVector Elts(EC.getKnownMinValue(), V); return get(Elts); } @@ -1324,7 +1324,7 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { Constant *UndefV = UndefValue::get(VTy); V = ConstantExpr::getInsertElement(UndefV, V, ConstantInt::get(I32Ty, 0)); // Build shuffle mask to perform the splat. - SmallVector Zeros(EC.Min, 0); + SmallVector Zeros(EC.getKnownMinValue(), 0); // Splat. 
return ConstantExpr::getShuffleVector(V, UndefV, Zeros); } @@ -2264,7 +2264,7 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C, if (VectorType *VecTy = dyn_cast(Idx->getType())) EltCount = VecTy->getElementCount(); - if (EltCount.Min != 0) + if (EltCount.isNonZero()) ReqTy = VectorType::get(ReqTy, EltCount); if (OnlyIfReducedTy == ReqTy) @@ -2284,7 +2284,7 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C, if (GTI.isStruct() && Idx->getType()->isVectorTy()) { Idx = Idx->getSplatValue(); - } else if (GTI.isSequential() && EltCount.Min != 0 && + } else if (GTI.isSequential() && EltCount.isNonZero() && !Idx->getType()->isVectorTy()) { Idx = ConstantVector::getSplat(EltCount, Idx); } diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 71faa5002b9ffd..8598acc82804f6 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -781,7 +781,7 @@ unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) { } unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) { - return unwrap(VectorTy)->getElementCount().Min; + return unwrap(VectorTy)->getElementCount().getKnownMinValue(); } /*--.. Operations on other types ...........................................--*/ diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 31b227d4a68207..ffb3adcdbf8a07 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -630,7 +630,7 @@ Align DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, uint32_t BitWidth, // We're only calculating a natural alignment, so it doesn't have to be // based on the full size for scalable vectors. Using the minimum element // count should be enough here. - Alignment *= cast(Ty)->getElementCount().Min; + Alignment *= cast(Ty)->getElementCount().getKnownMinValue(); Alignment = PowerOf2Ceil(Alignment); return Align(Alignment); } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index b29a00c5fe460e..e701feae22562f 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -714,9 +714,10 @@ static std::string getMangledTypeStr(Type* Ty) { Result += "f"; } else if (VectorType* VTy = dyn_cast(Ty)) { ElementCount EC = VTy->getElementCount(); - if (EC.Scalable) + if (EC.isScalable()) Result += "nx"; - Result += "v" + utostr(EC.Min) + getMangledTypeStr(VTy->getElementType()); + Result += "v" + utostr(EC.getKnownMinValue()) + + getMangledTypeStr(VTy->getElementType()); } else if (Ty) { switch (Ty->getTypeID()) { default: llvm_unreachable("Unhandled type"); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 33a0f5b09d0b9d..d6eeffd44b3680 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -1003,7 +1003,7 @@ Value *IRBuilderBase::CreateVectorSplat(unsigned NumElts, Value *V, Value *IRBuilderBase::CreateVectorSplat(ElementCount EC, Value *V, const Twine &Name) { - assert(EC.Min > 0 && "Cannot splat to an empty vector!"); + assert(EC.isNonZero() && "Cannot splat to an empty vector!"); // First insert it into an undef vector so we can shuffle it. Type *I32Ty = getInt32Ty(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 48f416173dde1d..445fad8bcbf41a 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1967,7 +1967,8 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2, return false; // Make sure the mask elements make sense. 
- int V1Size = cast(V1->getType())->getElementCount().Min; + int V1Size = + cast(V1->getType())->getElementCount().getKnownMinValue(); for (int Elem : Mask) if (Elem != UndefMaskElem && Elem >= V1Size * 2) return false; @@ -2026,22 +2027,22 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask, ElementCount EC = cast(Mask->getType())->getElementCount(); if (isa(Mask)) { - Result.resize(EC.Min, 0); + Result.resize(EC.getKnownMinValue(), 0); return; } - Result.reserve(EC.Min); + Result.reserve(EC.getKnownMinValue()); - if (EC.Scalable) { + if (EC.isScalable()) { assert((isa(Mask) || isa(Mask)) && "Scalable vector shuffle mask must be undef or zeroinitializer"); int MaskVal = isa(Mask) ? -1 : 0; - for (unsigned I = 0; I < EC.Min; ++I) + for (unsigned I = 0; I < EC.getKnownMinValue(); ++I) Result.emplace_back(MaskVal); return; } - unsigned NumElts = EC.Min; + unsigned NumElts = EC.getKnownMinValue(); if (auto *CDS = dyn_cast(Mask)) { for (unsigned i = 0; i != NumElts; ++i) diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index c4e06cd979ed69..b6b036d0bbca2a 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -280,8 +280,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { // the operation. This function returns true when this is detected statically // in the IR. - // Check whether "W == vscale * EC.Min" - if (EC.Scalable) { + // Check whether "W == vscale * EC.getKnownMinValue()" + if (EC.isScalable()) { // Undig the DL auto ParMod = this->getModule(); if (!ParMod) @@ -291,8 +291,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { // Compare vscale patterns uint64_t VScaleFactor; if (match(VLParam, m_c_Mul(m_ConstantInt(VScaleFactor), m_VScale(DL)))) - return VScaleFactor >= EC.Min; - return (EC.Min == 1) && match(VLParam, m_VScale(DL)); + return VScaleFactor >= EC.getKnownMinValue(); + return (EC.getKnownMinValue() == 1) && match(VLParam, m_VScale(DL)); } // standard SIMD operation @@ -301,7 +301,7 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { return false; uint64_t VLNum = VLConst->getZExtValue(); - if (VLNum >= EC.Min) + if (VLNum >= EC.getKnownMinValue()) return true; return false; diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 1cf0d1345ab31a..ce374faa5ced2f 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -128,7 +128,7 @@ TypeSize Type::getPrimitiveSizeInBits() const { ElementCount EC = VTy->getElementCount(); TypeSize ETS = VTy->getElementType()->getPrimitiveSizeInBits(); assert(!ETS.isScalable() && "Vector type should have fixed-width elements"); - return {ETS.getFixedSize() * EC.Min, EC.Scalable}; + return {ETS.getFixedSize() * EC.getKnownMinValue(), EC.isScalable()}; } default: return TypeSize::Fixed(0); } @@ -598,10 +598,10 @@ VectorType::VectorType(Type *ElType, unsigned EQ, Type::TypeID TID) } VectorType *VectorType::get(Type *ElementType, ElementCount EC) { - if (EC.Scalable) - return ScalableVectorType::get(ElementType, EC.Min); + if (EC.isScalable()) + return ScalableVectorType::get(ElementType, EC.getKnownMinValue()); else - return FixedVectorType::get(ElementType, EC.Min); + return FixedVectorType::get(ElementType, EC.getKnownMinValue()); } bool VectorType::isValidElementType(Type *ElemTy) { diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp index bbcf56cc0cec0e..912213517334c1 100644 --- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp +++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp @@ -95,8 +95,8 @@ Error 
DWARFYAML::emitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) { } Error DWARFYAML::emitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) { - uint64_t AbbrevCode = 0; for (const DWARFYAML::AbbrevTable &AbbrevTable : DI.DebugAbbrev) { + uint64_t AbbrevCode = 0; for (const DWARFYAML::Abbrev &AbbrevDecl : AbbrevTable.Table) { AbbrevCode = AbbrevDecl.Code ? (uint64_t)*AbbrevDecl.Code : AbbrevCode + 1; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 184458607c3cc4..fda514b2006c27 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4827,7 +4827,8 @@ static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT, return EVT(); ElementCount EC = PredVT.getVectorElementCount(); - EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min); + EVT ScalarVT = + EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue()); EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC * NumVec); return MemVT; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c904fc7f8c93cc..e68183dc46a2ca 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3532,8 +3532,9 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, // 256 bit non-temporal stores can be lowered to STNP. Do this as part of // the custom lowering, as there are no un-paired non-temporal stores and // legalization will break up 256 bit inputs. + ElementCount EC = MemVT.getVectorElementCount(); if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && - MemVT.getVectorElementCount().Min % 2u == 0 && + EC.isKnownEven() && ((MemVT.getScalarSizeInBits() == 8u || MemVT.getScalarSizeInBits() == 16u || MemVT.getScalarSizeInBits() == 32u || @@ -3542,11 +3543,11 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64)); - SDValue Hi = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, Dl, - MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), - StoreNode->getValue(), - DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64)); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl, + MemVT.getHalfNumVectorElementsVT(*DAG.getContext()), + StoreNode->getValue(), + DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64)); SDValue Result = DAG.getMemIntrinsicNode( AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other), {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, @@ -10370,7 +10371,7 @@ SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic, {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}}; std::tie(N, Opcode) = IntrinsicMap[Intrinsic]; - assert(VT.getVectorElementCount().Min % N == 0 && + assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 && "invalid tuple vector type!"); EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), @@ -14443,7 +14444,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, uint64_t IdxConst = cast(Idx)->getZExtValue(); EVT ResVT = N->getValueType(0); - uint64_t NumLanes = ResVT.getVectorElementCount().Min; + uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue(); SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL); SDValue Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, 
ResVT, Src1, ExtIdx); @@ -14457,10 +14458,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, SDValue Vec = N->getOperand(4); EVT TupleVT = Tuple.getValueType(); - uint64_t TupleLanes = TupleVT.getVectorElementCount().Min; + uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue(); uint64_t IdxConst = cast(Idx)->getZExtValue(); - uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min; + uint64_t NumLanes = + Vec.getValueType().getVectorElementCount().getKnownMinValue(); if ((TupleLanes % NumLanes) != 0) report_fatal_error("invalid tuple vector!"); @@ -14696,7 +14698,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults( ElementCount ResEC = VT.getVectorElementCount(); - if (InVT.getVectorElementCount().Min != (ResEC.Min * 2)) + if (InVT.getVectorElementCount() != (ResEC * 2)) return; auto *CIndex = dyn_cast(N->getOperand(1)); @@ -14704,7 +14706,7 @@ void AArch64TargetLowering::ReplaceExtractSubVectorResults( return; unsigned Index = CIndex->getZExtValue(); - if ((Index != 0) && (Index != ResEC.Min)) + if ((Index != 0) && (Index != ResEC.getKnownMinValue())) return; unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 4d1ab88fe3b2c8..a98590fd79c685 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -201,17 +201,6 @@ namespace { PredicatedMI *getDivergent() const { return Divergent; } }; - struct Reduction { - MachineInstr *Init; - MachineInstr &Copy; - MachineInstr &Reduce; - MachineInstr &VPSEL; - - Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add, - MachineInstr *Sel) - : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { } - }; - struct LowOverheadLoop { MachineLoop &ML; @@ -232,7 +221,6 @@ namespace { SetVector CurrentPredicate; SmallVector VPTBlocks; SmallPtrSet ToRemove; - SmallVector, 1> Reductions; SmallPtrSet BlockMasksToRecompute; bool Revert = false; bool CannotTailPredicate = false; @@ -270,10 +258,6 @@ namespace { // of elements to the loop start instruction. bool ValidateTailPredicate(MachineInstr *StartInsertPt); - // See whether the live-out instructions are a reduction that we can fixup - // later. - bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers); - // Check that any values available outside of the loop will be the same // after tail predication conversion. bool ValidateLiveOuts(); @@ -365,8 +349,6 @@ namespace { void ConvertVPTBlocks(LowOverheadLoop &LoLoop); - void FixupReductions(LowOverheadLoop &LoLoop) const; - MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); void Expand(LowOverheadLoop &LoLoop); @@ -447,8 +429,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { } } - if (!ValidateLiveOuts()) + if (!ValidateLiveOuts()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n"); return false; + } // For tail predication, we need to provide the number of elements, instead // of the iteration count, to the loop start instruction. The number of @@ -636,7 +620,6 @@ static bool canGenerateNonZeros(const MachineInstr &MI) { return false; } - // Look at its register uses to see if it only can only receive zeros // into its false lanes which would then produce zeros. 
Also check that // the output register is also defined by an FalseLanesZero instruction @@ -649,120 +632,40 @@ static bool producesFalseLanesZero(MachineInstr &MI, if (canGenerateNonZeros(MI)) return false; + bool isPredicated = isVectorPredicated(&MI); + // Predicated loads will write zeros to the falsely predicated bytes of the + // destination register. + if (MI.mayLoad()) + return isPredicated; + + auto IsZeroInit = [](MachineInstr *Def) { + return !isVectorPredicated(Def) && + Def->getOpcode() == ARM::MVE_VMOVimmi32 && + Def->getOperand(1).getImm() == 0; + }; + bool AllowScalars = isHorizontalReduction(MI); for (auto &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg()) continue; if (!isRegInClass(MO, QPRs) && AllowScalars) continue; - if (auto *OpDef = RDA.getMIOperand(&MI, MO)) - if (FalseLanesZero.count(OpDef)) - continue; - return false; - } - LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); - return true; -} - -bool -LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) { - // Also check for reductions where the operation needs to be merging values - // from the last and previous loop iterations. This means an instruction - // producing a value and a vmov storing the value calculated in the previous - // iteration. So we can have two live-out regs, one produced by a vmov and - // both being consumed by a vpsel. - LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n"; - for (auto *MI : LiveMIs) - dbgs() << " - " << *MI); - - if (!Preheader) - return false; - // Expect a vmov, a vadd and a single vpsel user. - // TODO: This means we can't currently support multiple reductions in the - // loop. - if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1) - return false; - - MachineInstr *VPSEL = *LiveOutUsers.begin(); - if (VPSEL->getOpcode() != ARM::MVE_VPSEL) - return false; - - unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1; - MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx); - if (!Pred || Pred != VCTP) { - LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n"); - return false; - } - - MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1); - if (!Reduce) - return false; - - assert(LiveMIs.count(Reduce) && "Expected MI to be live-out"); - - // TODO: Support more operations than VADD. - switch (VCTP->getOpcode()) { - default: - return false; - case ARM::MVE_VCTP8: - if (Reduce->getOpcode() != ARM::MVE_VADDi8) - return false; - break; - case ARM::MVE_VCTP16: - if (Reduce->getOpcode() != ARM::MVE_VADDi16) - return false; - break; - case ARM::MVE_VCTP32: - if (Reduce->getOpcode() != ARM::MVE_VADDi32) + // Check that this instruction will produce zeros in its false lanes: + // - If it only consumes false lanes zero or constant 0 (vmov #0) + // - If it's predicated, it only matters that it's def register already has + // false lane zeros, so we can ignore the uses. + SmallPtrSet Defs; + RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs); + for (auto *Def : Defs) { + if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def)) + continue; + if (MO.isUse() && isPredicated) + continue; return false; - break; - } - - // Test that the reduce op is overwriting ones of its operands. - if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() && - Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) { - LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n"); - return false; - } - - // Check that the VORR is actually a VMOV. 
- MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2); - if (!Copy || Copy->getOpcode() != ARM::MVE_VORR || - !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() || - Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg()) - return false; - - assert(LiveMIs.count(Copy) && "Expected MI to be live-out"); - - // Check that the vadd and vmov are only used by each other and the vpsel. - SmallPtrSet CopyUsers; - RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers); - if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n"); - return false; - } - - SmallPtrSet ReduceUsers; - RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers); - if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n"); - return false; + } } - - // Then find whether there's an instruction initialising the register that - // is storing the reduction. - SmallPtrSet Incoming; - RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming); - if (Incoming.size() > 1) - return false; - - MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin(); - LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n" - << " - " << *Copy - << " - " << *Reduce - << " - " << *VPSEL); - Reductions.push_back(std::make_unique(Init, Copy, Reduce, VPSEL)); + LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); return true; } @@ -803,28 +706,20 @@ bool LowOverheadLoop::ValidateLiveOuts() { if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode())) continue; - // Predicated loads will write zeros to the falsely predicated bytes of the - // destination register. - if (isVectorPredicated(&MI)) { - if (MI.mayLoad()) - FalseLanesZero.insert(&MI); - Predicated.insert(&MI); - continue; - } + bool isPredicated = isVectorPredicated(&MI); + bool retainsOrReduces = + retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); - if (MI.getNumDefs() == 0) + if (isPredicated) + Predicated.insert(&MI); + if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) + FalseLanesZero.insert(&MI); + else if (MI.getNumDefs() == 0) continue; - - if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) { - // We require retaining and horizontal operations to operate upon zero'd - // false lanes to ensure the conversion doesn't change the output. - if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI)) - return false; - // Otherwise we need to evaluate this instruction later to see whether - // unknown false lanes will get masked away by their user(s). + else if (!isPredicated && retainsOrReduces) + return false; + else FalseLanesUnknown.insert(&MI); - } else if (!isHorizontalReduction(MI)) - FalseLanesZero.insert(&MI); } auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, @@ -853,48 +748,44 @@ bool LowOverheadLoop::ValidateLiveOuts() { LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); NonPredicated.insert(MI); - continue; + break; } } // Any unknown false lanes have been masked away by the user(s). 
- Predicated.insert(MI); + if (!NonPredicated.contains(MI)) + Predicated.insert(MI); } SmallPtrSet LiveOutMIs; - SmallPtrSet LiveOutUsers; SmallVector ExitBlocks; ML.getExitBlocks(ExitBlocks); assert(ML.getNumBlocks() == 1 && "Expected single block loop!"); assert(ExitBlocks.size() == 1 && "Expected a single exit block"); MachineBasicBlock *ExitBB = ExitBlocks.front(); for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { + // TODO: Instead of blocking predication, we could move the vctp to the exit + // block and calculate it's operand there in or the preheader. + if (RegMask.PhysReg == ARM::VPR) + return false; // Check Q-regs that are live in the exit blocks. We don't collect scalars // because they won't be affected by lane predication. - if (QPRs->contains(RegMask.PhysReg)) { + if (QPRs->contains(RegMask.PhysReg)) if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg)) LiveOutMIs.insert(MI); - RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers); - } } - // If we have any non-predicated live-outs, they need to be part of a - // reduction that we can fixup later. The reduction that the form of an - // operation that uses its previous values through a vmov and then a vpsel - // resides in the exit blocks to select the final bytes from n and n-1 - // iterations. - if (!NonPredicated.empty() && - !FindValidReduction(NonPredicated, LiveOutUsers)) - return false; - // We've already validated that any VPT predication within the loop will be // equivalent when we perform the predication transformation; so we know that // any VPT predicated instruction is predicated upon VCTP. Any live-out // instruction needs to be predicated, so check this here. The instructions // in NonPredicated have been found to be a reduction that we can ensure its // legality. 
- for (auto *MI : LiveOutMIs) - if (!isVectorPredicated(MI) && !NonPredicated.count(MI)) + for (auto *MI : LiveOutMIs) { + if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to handle live out: " << *MI); return false; + } + } return true; } @@ -1360,61 +1251,6 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { return &*MIB; } -void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const { - LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n"); - auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) { - MachineBasicBlock *MBB = InsertPt.getParent(); - MachineInstrBuilder MIB = - BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR)); - MIB.addDef(To); - MIB.addReg(From); - MIB.addReg(From); - MIB.addImm(0); - MIB.addReg(0); - MIB.addReg(To); - LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB); - }; - - for (auto &Reduction : LoLoop.Reductions) { - MachineInstr &Copy = Reduction->Copy; - MachineInstr &Reduce = Reduction->Reduce; - Register DestReg = Copy.getOperand(0).getReg(); - - // Change the initialiser if present - if (Reduction->Init) { - MachineInstr *Init = Reduction->Init; - - for (unsigned i = 0; i < Init->getNumOperands(); ++i) { - MachineOperand &MO = Init->getOperand(i); - if (MO.isReg() && MO.isUse() && MO.isTied() && - Init->findTiedOperandIdx(i) == 0) - Init->getOperand(i).setReg(DestReg); - } - Init->getOperand(0).setReg(DestReg); - LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init); - } else - BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg()); - - // Change the reducing op to write to the register that is used to copy - // its value on the next iteration. Also update the tied-def operand. - Reduce.getOperand(0).setReg(DestReg); - Reduce.getOperand(5).setReg(DestReg); - LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce); - - // Instead of a vpsel, just copy the register into the necessary one. - MachineInstr &VPSEL = Reduction->VPSEL; - if (VPSEL.getOperand(0).getReg() != DestReg) - BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg); - - // Remove the unnecessary instructions. 
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n" - << " - " << Copy - << " - " << VPSEL << "\n"); - Copy.eraseFromParent(); - VPSEL.eraseFromParent(); - } -} - void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { auto RemovePredicate = [](MachineInstr *MI) { LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); @@ -1568,10 +1404,8 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { RemoveDeadBranch(LoLoop.Start); LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); - if (LoLoop.IsTailPredicationLegal()) { + if (LoLoop.IsTailPredicationLegal()) ConvertVPTBlocks(LoLoop); - FixupReductions(LoLoop); - } for (auto *I : LoLoop.ToRemove) { LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); I->eraseFromParent(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index a4b38e8082224b..0a289fd4e70ffb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -340,17 +340,17 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { auto *IndexC = dyn_cast(Index); if (IndexC) { ElementCount EC = EI.getVectorOperandType()->getElementCount(); - unsigned NumElts = EC.Min; + unsigned NumElts = EC.getKnownMinValue(); // InstSimplify should handle cases where the index is invalid. // For fixed-length vector, it's invalid to extract out-of-range element. - if (!EC.Scalable && IndexC->getValue().uge(NumElts)) + if (!EC.isScalable() && IndexC->getValue().uge(NumElts)) return nullptr; // This instruction only demands the single element from the input vector. // Skip for scalable type, the number of elements is unknown at // compile-time. - if (!EC.Scalable && NumElts != 1) { + if (!EC.isScalable() && NumElts != 1) { // If the input vector has a single use, simplify it based on this use // property. if (SrcVec->hasOneUse()) { diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 423c4baf262ad6..57befc9c3cfb36 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -1976,18 +1976,18 @@ bool llvm::runIPSCCP( // inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove // them from both the function and callsites. 
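Most of the mechanical churn in this patch is the migration visible in the InstCombine hunk above: ElementCount's public Min/Scalable members give way to accessors. A minimal sketch of the new spelling (the header location is assumed for the LLVM version this patch targets and may differ in other releases):

  #include "llvm/Support/TypeSize.h"

  unsigned knownMinElts(llvm::ElementCount EC) {
    // Previously: EC.Min
    return EC.getKnownMinValue(); // exact for fixed vectors, a per-vscale
                                  // multiple for scalable ones
  }

  bool isFixedWidth(llvm::ElementCount EC) {
    // Previously: !EC.Scalable
    return !EC.isScalable();
  }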
if (ReplacedPointerArg) { - SmallVector AttributesToRemove = { - Attribute::ArgMemOnly, Attribute::InaccessibleMemOrArgMemOnly}; - for (auto Attr : AttributesToRemove) - F.removeFnAttr(Attr); + AttrBuilder AttributesToRemove; + AttributesToRemove.addAttribute(Attribute::ArgMemOnly); + AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); + F.removeAttributes(AttributeList::FunctionIndex, AttributesToRemove); for (User *U : F.users()) { auto *CB = dyn_cast(U); if (!CB || CB->getCalledFunction() != &F) continue; - for (auto Attr : AttributesToRemove) - CB->removeAttribute(AttributeList::FunctionIndex, Attr); + CB->removeAttributes(AttributeList::FunctionIndex, + AttributesToRemove); } } } diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index 101cb232d8aed1..dfab9369c7b72e 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -488,12 +488,13 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const { case Type::ScalableVectorTyID: { auto *STyL = cast(TyL); auto *STyR = cast(TyR); - if (STyL->getElementCount().Scalable != STyR->getElementCount().Scalable) - return cmpNumbers(STyL->getElementCount().Scalable, - STyR->getElementCount().Scalable); - if (STyL->getElementCount().Min != STyR->getElementCount().Min) - return cmpNumbers(STyL->getElementCount().Min, - STyR->getElementCount().Min); + if (STyL->getElementCount().isScalable() != + STyR->getElementCount().isScalable()) + return cmpNumbers(STyL->getElementCount().isScalable(), + STyR->getElementCount().isScalable()); + if (STyL->getElementCount() != STyR->getElementCount()) + return cmpNumbers(STyL->getElementCount().getKnownMinValue(), + STyR->getElementCount().getKnownMinValue()); return cmpTypes(STyL->getElementType(), STyR->getElementType()); } } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f999c5af7f4755..1f82995588b094 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -342,7 +342,7 @@ static Type *getMemInstValueType(Value *I) { /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a vector. if (VF.isVector()) { @@ -899,8 +899,9 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) const DILocation *DIL = Inst->getDebugLoc(); if (DIL && Inst->getFunction()->isDebugInfoForProfiling() && !isa(Inst)) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto NewDIL = + DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue()); if (NewDIL) B.SetCurrentDebugLocation(NewDIL.getValue()); else @@ -1216,7 +1217,7 @@ class LoopVectorizationCostModel { /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. 
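The SCCP hunk above switches from removing attributes one at a time to collecting them in an AttrBuilder and stripping the whole set in a single call, on the function and on each direct call site. A condensed sketch of that pattern, using only the calls that appear in the hunk:

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  static void dropMemOnlyAttrs(Function &F) {
    AttrBuilder ToRemove;
    ToRemove.addAttribute(Attribute::ArgMemOnly);
    ToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
    // One call removes the whole set from the function; the same AttrBuilder
    // is then reused for each call site in the loop over F.users().
    F.removeAttributes(AttributeList::FunctionIndex, ToRemove);
  }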
InstWidening getWideningDecision(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); assert(VF.isVector() && "Expected VF >=2"); // Cost model is not run in the VPlan-native path - return conservative @@ -1837,7 +1838,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // Multiply the vectorization factor by the step using integer or // floating-point arithmetic as appropriate. - Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min); + Value *ConstVF = + getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue()); Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF)); // Create a vector splat to use in the induction update. @@ -1845,7 +1847,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // FIXME: If the step is non-constant, we create the vector splat with // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *SplatVF = isa(Mul) ? ConstantVector::getSplat(VF, cast(Mul)) : Builder.CreateVectorSplat(VF, Mul); @@ -1982,9 +1984,10 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step, - ID.getInductionOpcode()); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Value *EntryPart = + getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step, + ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); if (Trunc) addMetadata(EntryPart, Trunc); @@ -2093,7 +2096,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF.isVector() && "VF should be greater than one"); - assert(!VF.Scalable && + assert(!VF.isScalable() && "the code below assumes a fixed number of elements at compile time"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); @@ -2118,12 +2121,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, unsigned Lanes = Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 1 - : VF.Min; + : VF.getKnownMinValue(); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = - getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane); + auto *StartIdx = getSignedIntOrFpConstant( + ScalarIVTy, VF.getKnownMinValue() * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2166,9 +2169,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. 
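buildScalarSteps above generates, for a fixed-width VF, one scalar induction value per unrolled part and lane: lane L of part P gets IV + (VF*P + L) * Step, and only lane 0 is produced when the value is uniform after vectorization. A plain sketch of just that index arithmetic, with no IRBuilder involved (the helper name is illustrative):

  #include <cstdint>
  #include <vector>

  std::vector<int64_t> scalarSteps(int64_t ScalarIV, int64_t Step, unsigned VF,
                                   unsigned UF, unsigned Lanes) {
    std::vector<int64_t> Out;
    for (unsigned Part = 0; Part < UF; ++Part)
      for (unsigned Lane = 0; Lane < Lanes; ++Lane)   // Lanes == 1 if uniform
        Out.push_back(ScalarIV + int64_t(VF * Part + Lane) * Step);
    return Out;
  }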
Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned LastLane = - Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1; + assert(!VF.isScalable() && "scalable vectors not yet supported."); + unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) + ? 0 + : VF.getKnownMinValue() - 1; auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2190,10 +2194,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { // Initialize packing with insertelements to start from undef. - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); VectorLoopValueMap.setVectorValue(V, Part, Undef); - for (unsigned Lane = 0; Lane < VF.Min; ++Lane) + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) packScalarIntoVectorValue(V, {Part, Lane}); VectorValue = VectorLoopValueMap.getVectorValue(V, Part); } @@ -2257,10 +2261,10 @@ void InnerLoopVectorizer::packScalarIntoVectorValue( Value *InnerLoopVectorizer::reverseVector(Value *Vec) { assert(Vec->getType()->isVectorTy() && "Invalid type"); - assert(!VF.Scalable && "Cannot reverse scalable vectors"); + assert(!VF.isScalable() && "Cannot reverse scalable vectors"); SmallVector ShuffleMask; - for (unsigned i = 0; i < VF.Min; ++i) - ShuffleMask.push_back(VF.Min - i - 1); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + ShuffleMask.push_back(VF.getKnownMinValue() - i - 1); return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()), ShuffleMask, "reverse"); @@ -2314,7 +2318,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor); // Prepare for the new pointers. @@ -2331,10 +2335,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( // pointer operand of the interleaved access is supposed to be uniform. For // uniform instructions, we're only required to generate a value for the // first vector lane in each unroll iteration. 
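reverseVector above builds a shuffle mask that flips a fixed-width vector: result element i reads input element VF-1-i. The mask computed standalone, as a small illustration:

  #include <vector>

  std::vector<int> reverseMask(unsigned VF) {
    std::vector<int> Mask;
    for (unsigned i = 0; i < VF; ++i)
      Mask.push_back(int(VF - i - 1));
    return Mask;            // e.g. VF=4 -> {3, 2, 1, 0}
  }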
- assert(!VF.Scalable && + assert(!VF.isScalable() && "scalable vector reverse operation is not implemented"); if (Group->isReverse()) - Index += (VF.Min - 1) * Group->getFactor(); + Index += (VF.getKnownMinValue() - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { Value *AddrPart = State.get(Addr, {Part, 0}); @@ -2369,8 +2373,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *MaskForGaps = nullptr; if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) { - assert(!VF.Scalable && "scalable vectors not yet supported."); - MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group); assert(MaskForGaps && "Mask for Gaps is required but it is null"); } @@ -2387,10 +2391,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (BlockInMask) { Value *BlockInMaskPart = State.get(BlockInMask, Part); auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF.Min), + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, @@ -2417,15 +2421,16 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( if (!Member) continue; - assert(!VF.Scalable && "scalable vectors not yet supported."); - auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + auto StrideMask = + createStrideMask(I, InterleaveFactor, VF.getKnownMinValue()); for (unsigned Part = 0; Part < UF; Part++) { Value *StridedVec = Builder.CreateShuffleVector( NewLoads[Part], UndefVec, StrideMask, "strided.vec"); // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorType *OtherVTy = VectorType::get(Member->getType(), VF); StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL); } @@ -2440,7 +2445,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( } // The sub vector type for current instruction. - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); auto *SubVT = VectorType::get(ScalarTy, VF); // Vectorize the interleaved store group. @@ -2469,9 +2474,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( Value *WideVec = concatenateVectors(Builder, StoredVecs); // Interleave the elements in the wide vector. 
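The interleave-group lowering above leans on three fixed-width masks: a stride mask to pull member M out of a wide VF*Factor load, an interleave mask to weave member vectors back together for the store, and a replicated mask to widen the block mask to the group. Plain computations sketching the first two; the helper names in the hunk are the real LLVM utilities, these are only illustrative re-implementations:

  #include <vector>

  // Lanes of member `Start` inside a wide VF*Factor vector: Start, Start+Factor, ...
  std::vector<int> strideMask(unsigned Start, unsigned Factor, unsigned VF) {
    std::vector<int> Mask;
    for (unsigned i = 0; i < VF; ++i)
      Mask.push_back(int(Start + i * Factor));
    return Mask;
  }

  // Interleaving Factor member vectors of VF elements into one wide vector.
  std::vector<int> interleaveMask(unsigned VF, unsigned Factor) {
    std::vector<int> Mask;
    for (unsigned i = 0; i < VF; ++i)
      for (unsigned j = 0; j < Factor; ++j)
        Mask.push_back(int(j * VF + i));
    return Mask;
  }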
- assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Value *IVec = Builder.CreateShuffleVector( - WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor), + WideVec, UndefVec, + createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor), "interleaved.vec"); Instruction *NewStoreInstr; @@ -2480,7 +2486,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup( auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); Value *ShuffledMask = Builder.CreateShuffleVector( BlockInMaskPart, Undefs, - createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask"); + createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()), + "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( IVec, AddrParts[Part], Group->getAlign(), ShuffledMask); } @@ -2514,7 +2521,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *ScalarDataTy = getMemInstValueType(Instr); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); @@ -2550,16 +2557,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min))); + ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min))); + ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min))); + ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); PartPtr->setIsInBounds(InBounds); } @@ -2756,8 +2763,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. - assert(!VF.Scalable && "scalable vectorization is not supported yet"); - Constant *Step = ConstantInt::get(Ty, VF.Min * UF); + assert(!VF.isScalable() && "scalable vectorization is not supported yet"); + Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2766,10 +2773,10 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { // that it starts at zero and its Step is a power of two; the loop will then // exit, with the last early-exit vector comparison also producing all-true. 
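For a consecutive-but-reversed access, vectorizeMemoryInstruction above offsets the pointer by -Part*VF and then by 1-VF so the wide access for each part covers the right elements in reverse. The same arithmetic on an ordinary pointer; float is chosen only for illustration:

  #include <cstddef>

  const float *reversePartPtr(const float *Ptr, unsigned Part, unsigned VF) {
    const float *PartPtr = Ptr - std::ptrdiff_t(Part) * VF; // GEP ... -Part*VF
    return PartPtr + (1 - std::ptrdiff_t(VF));              // GEP ... 1-VF
  }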
if (Cost->foldTailByMasking()) { - assert(isPowerOf2_32(VF.Min * UF) && + assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); - TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1), - "n.rnd.up"); + TC = Builder.CreateAdd( + TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } // Now we need to generate the expression for the part of the loop that the @@ -2846,9 +2853,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); if (!Cost->foldTailByMasking()) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF.Min * UF), + P, Count, + ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), "min.iters.check"); } // Create new preheader for vector loop. @@ -3303,8 +3311,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). - assert(!VF.Scalable && "scalable vectors not yet supported."); - Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3438,7 +3446,7 @@ static void cse(BasicBlock *BB) { unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, bool &NeedToScalarize) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3463,7 +3471,7 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // packing the return values to a vector. unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); - unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost; + unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; // If we can't emit a vector call for this function, then the currently found // cost is the cost we need to return. @@ -3684,11 +3692,11 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. 
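When the tail is folded by masking, the trip-count logic above rounds the count up to a multiple of Step = VF * UF (asserted to be a power of two) by adding Step-1 and then stripping the remainder. The same computation in plain integers, as a sketch:

  #include <cstdint>

  uint64_t roundedTripCount(uint64_t TC, unsigned VF, unsigned UF) {
    uint64_t Step = uint64_t(VF) * UF;   // must be a power of two here
    uint64_t Rounded = TC + (Step - 1);  // "n.rnd.up"
    return Rounded - Rounded % Step;     // multiple of Step the vector loop covers
  }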
- assert(!VF.Scalable && + assert(!VF.isScalable() && "cannot use scalable ElementCount to determine unroll factor"); - setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), - LI->getLoopFor(LoopVectorBody), - LI->getLoopFor(LoopScalarBody), VF.Min * UF); + setProfileInfoAfterUnrolling( + LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), + LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); } void InnerLoopVectorizer::fixCrossIterationPHIs() { @@ -3769,10 +3777,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { auto *VectorInit = ScalarInit; if (VF.isVector()) { Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); VectorInit = Builder.CreateInsertElement( UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit, - Builder.getInt32(VF.Min - 1), "vector.recur.init"); + Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init"); } // We constructed a temporary phi node in the first phase of vectorization. @@ -3813,11 +3821,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. - assert(!VF.Scalable); - SmallVector ShuffleMask(VF.Min); - ShuffleMask[0] = VF.Min - 1; - for (unsigned I = 1; I < VF.Min; ++I) - ShuffleMask[I] = I + VF.Min - 1; + assert(!VF.isScalable()); + SmallVector ShuffleMask(VF.getKnownMinValue()); + ShuffleMask[0] = VF.getKnownMinValue() - 1; + for (unsigned I = 1; I < VF.getKnownMinValue(); ++I) + ShuffleMask[I] = I + VF.getKnownMinValue() - 1; // The vector from which to take the initial value for the current iteration // (actual or unrolled). Initially, this is the vector phi node. @@ -3846,7 +3854,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { if (VF.isVector()) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); ExtractForScalar = Builder.CreateExtractElement( - ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract"); + ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1), + "vector.recur.extract"); } // Extract the second last element in the middle block if the // Phi is used outside the loop. We need to extract the phi itself @@ -3856,7 +3865,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { Value *ExtractForPhiUsedOutsideLoop = nullptr; if (VF.isVector()) ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( - Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi"); + Incoming, Builder.getInt32(VF.getKnownMinValue() - 2), + "vector.recur.extract.for.phi"); // When loop is unrolled without vectorizing, initialize // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of // `Incoming`. This is analogous to the vectorized case above: extracting the @@ -4013,7 +4023,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { // entire expression in the smaller type. 
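fixFirstOrderRecurrence above splices the previous iteration's last element in front of the current vector using the mask {VF-1, VF, ..., 2*VF-2} over the concatenated pair. Built standalone for a fixed-width VF:

  #include <vector>

  std::vector<int> recurrenceMask(unsigned VF) {
    std::vector<int> Mask(VF);
    Mask[0] = int(VF - 1);
    for (unsigned I = 1; I < VF; ++I)
      Mask[I] = int(I + VF - 1);
    return Mask;            // e.g. VF=4 -> {3, 4, 5, 6}
  }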
if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) { assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!"); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF); Builder.SetInsertPoint( LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator()); @@ -4145,7 +4155,7 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } void InnerLoopVectorizer::fixLCSSAPHIs() { - assert(!VF.Scalable && "the code below assumes fixed width vectors"); + assert(!VF.isScalable() && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); @@ -4155,7 +4165,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { LastLane = Cost->isUniformAfterVectorization( cast(IncomingValue), VF) ? 0 - : VF.Min - 1; + : VF.getKnownMinValue() - 1; // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4338,7 +4348,7 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands, void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); PHINode *P = cast(PN); if (EnableVPlanNativePath) { // Currently we enter here in the VPlan-native path for non-induction @@ -4403,11 +4413,12 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the // first lane. Otherwise, we generate all VF values. - unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min; + unsigned Lanes = + Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue(); for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - Constant *Idx = - ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min); + Constant *Idx = ConstantInt::get(PtrInd->getType(), + Lane + Part * VF.getKnownMinValue()); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); @@ -4437,8 +4448,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc); Value *InductionGEP = GetElementPtrInst::Create( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(ScalarStepValue, - ConstantInt::get(PhiType, VF.Min * UF)), + Builder.CreateMul( + ScalarStepValue, + ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)), "ptr.ind", InductionLoc); NewPointerPhi->addIncoming(InductionGEP, LoopLatch); @@ -4448,15 +4460,17 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, for (unsigned Part = 0; Part < UF; ++Part) { SmallVector Indices; // Create a vector of consecutive numbers from zero to VF. 
- for (unsigned i = 0; i < VF.Min; ++i) - Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min)); + for (unsigned i = 0; i < VF.getKnownMinValue(); ++i) + Indices.push_back( + ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue())); Constant *StartOffset = ConstantVector::get(Indices); Value *GEP = Builder.CreateGEP( ScStValueType->getPointerElementType(), NewPointerPhi, - Builder.CreateMul(StartOffset, - Builder.CreateVectorSplat(VF.Min, ScalarStepValue), - "vector.gep")); + Builder.CreateMul( + StartOffset, + Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue), + "vector.gep")); VectorLoopValueMap.setVectorValue(P, Part, GEP); } } @@ -4483,7 +4497,7 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, VPTransformState &State) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4571,7 +4585,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User, setDebugLocFromInst(Builder, CI); /// Vectorize casts. - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); Type *DestTy = (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4601,7 +4615,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, SmallVector Tys; for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min)); + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue())); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); @@ -4633,7 +4647,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands, // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; if (VF.isVector()) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); } VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); @@ -4872,7 +4886,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -5357,7 +5371,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { Selected = false; } if (Selected) { - MaxVF = VFs[i].Min; + MaxVF = VFs[i].getKnownMinValue(); break; } } @@ -5558,8 +5572,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // Clamp the interleave ranges to reasonable counts. - assert(!VF.Scalable && "scalable vectors not yet supported."); - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + unsigned MaxInterleaveCount = + TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); // Check if the user has overridden the max. 
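For a widened pointer induction, the code above feeds the vector GEP the lane offsets {Part*VF + 0, ..., Part*VF + VF-1}, each multiplied by the scalar step. The offsets as plain integers (helper name illustrative, fixed-width VF assumed as in the surrounding asserts):

  #include <cstdint>
  #include <vector>

  std::vector<int64_t> pointerIndOffsets(unsigned Part, unsigned VF,
                                         int64_t ScalarStep) {
    std::vector<int64_t> Offsets;
    for (unsigned i = 0; i < VF; ++i)
      Offsets.push_back(int64_t(i + Part * VF) * ScalarStep);
    return Offsets;
  }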
if (VF == 1) { @@ -5573,7 +5588,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF. if (BestKnownTC) { - MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount); + MaxInterleaveCount = + std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5745,8 +5761,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { if (Ty->isTokenTy()) return 0U; unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); - assert(!VF.Scalable && "scalable vectors not yet supported."); - return std::max(1, VF.Min * TypeSize / WidestRegister); + assert(!VF.isScalable() && "scalable vectors not yet supported."); + return std::max(1, VF.getKnownMinValue() * TypeSize / + WidestRegister); }; for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { @@ -5973,19 +5990,20 @@ int LoopVectorizationCostModel::computePredInstDiscount( // the instruction as if it wasn't if-converted and instead remained in the // predicated block. We will scale this cost by block probability after // computing the scalarization overhead. - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); unsigned ScalarCost = - VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first; + VF.getKnownMinValue() * + getInstructionCost(I, ElementCount::getFixed(1)).first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), - APInt::getAllOnesValue(VF.Min), true, false); - assert(!VF.Scalable && "scalable vectors not yet supported."); + APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += - VF.Min * + VF.getKnownMinValue() * TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput); } @@ -6000,10 +6018,10 @@ int LoopVectorizationCostModel::computePredInstDiscount( if (canBeScalarized(J)) Worklist.push_back(J); else if (needsExtract(J, VF)) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), - APInt::getAllOnesValue(VF.Min), false, true); + APInt::getAllOnesValue(VF.getKnownMinValue()), false, true); } } @@ -6021,7 +6039,7 @@ int LoopVectorizationCostModel::computePredInstDiscount( LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); VectorizationCostTy Cost; // For each block. 
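calculateRegisterUsage above estimates, per live value, how many vector registers a fixed-width VF needs: VF * TypeSize / WidestRegister with integer division, never reported as fewer than one. As a plain helper (token-typed values are filtered out before this point and cost nothing):

  #include <algorithm>

  unsigned regsForValue(unsigned VF, unsigned TypeSizeBits,
                        unsigned WidestRegisterBits) {
    return std::max(1u, VF * TypeSizeBits / WidestRegisterBits);
  }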
@@ -6104,7 +6122,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, ElementCount VF) { assert(VF.isVector() && "Scalarization cost of instruction implies vectorization."); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); @@ -6117,12 +6135,13 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. - unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); + unsigned Cost = + VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. const Align Alignment = getLoadStoreAlignment(I); - Cost += VF.Min * + Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, AS, TTI::TCK_RecipThroughput); @@ -6190,9 +6209,10 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS, CostKind) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF.Min - 1)); + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF.getKnownMinValue() - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, @@ -6218,7 +6238,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, assert(Group && "Fail to get an interleaved access group."); unsigned InterleaveFactor = Group->getFactor(); - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); // Holds the indices of existing members in an interleaved load group. @@ -6266,7 +6286,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && + assert(!VF.isScalable() && "the cost model is not yet implemented for scalable vectorization"); // If we know that this instruction will remain uniform, check the cost of // the scalar version. 
@@ -6282,22 +6302,24 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, auto InstSet = ForcedScalar->second; if (InstSet.count(I)) return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min), + (getInstructionCost(I, ElementCount::getFixed(1)).first * + VF.getKnownMinValue()), false); } Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); - bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() && - TTI.getNumberOfParts(VectorTy) < VF.Min; + bool TypeNotScalarized = + VF.isVector() && VectorTy->isVectorTy() && + TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue(); return VectorizationCostTy(C, TypeNotScalarized); } unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, ElementCount VF) { - assert(!VF.Scalable && + assert(!VF.isScalable() && "cannot compute scalarization overhead for scalable vectorization"); if (VF.isScalar()) return 0; @@ -6307,7 +6329,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) Cost += TTI.getScalarizationOverhead( - cast(RetTy), APInt::getAllOnesValue(VF.Min), true, false); + cast(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()), + true, false); // Some targets keep addresses scalar. if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6323,13 +6346,12 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, // Skip operands that do not require extraction/scalarization and do not incur // any overhead. - return Cost + - TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF), - VF.Min); + return Cost + TTI.getOperandsScalarizationOverhead( + filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { - assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); if (VF.isScalar()) return; NumPredStores = 0; @@ -6466,14 +6488,15 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { // Scalarize a widened load of address. setWideningDecision( I, VF, CM_Scalarize, - (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + (VF.getKnownMinValue() * + getMemoryInstructionCost(I, ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) setWideningDecision( Member, VF, CM_Scalarize, - (VF.Min * + (VF.getKnownMinValue() * getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); } } @@ -6515,12 +6538,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. 
- assert(!VF.Scalable && "scalable vectors not yet supported."); + assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return (TTI.getScalarizationOverhead( - Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) + - (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min)); + Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()), + false, true) + + (TTI.getCFInstrCost(Instruction::Br, CostKind) * + VF.getKnownMinValue())); } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) // The back-edge branch will remain, as will all scalar branches. return TTI.getCFInstrCost(Instruction::Br, CostKind); @@ -6537,9 +6562,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - cast(VectorTy), VF.Min - 1, - FixedVectorType::get(RetTy, 1)); + return TTI.getShuffleCost( + TargetTransformInfo::SK_ExtractSubvector, cast(VectorTy), + VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi @@ -6568,11 +6593,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, // that we will create. This cost is likely to be zero. The phi node // cost, if any, should be scaled by the block probability because it // models a copy at the end of each predicated block. - Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getCFInstrCost(Instruction::PHI, CostKind); // The cost of the non-predicated instruction. - Cost += - VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); + Cost += VF.getKnownMinValue() * + TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind); // The cost of insertelement and extractelement instructions needed for // scalarization. @@ -6611,15 +6637,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Op2VK = TargetTransformInfo::OK_UniformValue; SmallVector Operands(I->operand_values()); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { - assert(!VF.Scalable && "VF is assumed to be non scalable."); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable."); + unsigned N = isScalarAfterVectorization(I, VF) ? 
VF.getKnownMinValue() : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue, @@ -6633,7 +6659,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) { - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); CondTy = VectorType::get(CondTy, VF); } return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, @@ -6745,8 +6771,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } } - assert(!VF.Scalable && "VF is assumed to be non scalable"); - unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1; + assert(!VF.isScalable() && "VF is assumed to be non scalable"); + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -6761,9 +6787,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. - return VF.Min * - TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, - CostKind) + + return VF.getKnownMinValue() * TTI.getArithmeticInstrCost( + Instruction::Mul, VectorTy, CostKind) + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6870,7 +6895,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, VectorizationFactor LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { - assert(!UserVF.Scalable && "scalable vectors not yet supported"); + assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6892,10 +6917,11 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { } } assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); - assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two"); + assert(isPowerOf2_32(VF.getKnownMinValue()) && + "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF.Min, VF.Min); + buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue()); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) @@ -6912,9 +6938,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(!UserVF.Scalable && "scalable vectorization not yet handled"); + assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); assert(OrigLoop->empty() && "Inner loop expected."); - Optional MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC); + Optional MaybeMaxVF = + CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. 
return None; @@ -6934,12 +6961,14 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { if (!UserVF.isZero()) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two"); + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && + "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min); + buildVPlansWithVPRecipes(UserVF.getKnownMinValue(), + UserVF.getKnownMinValue()); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } @@ -7228,7 +7257,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - assert(!VF.Scalable && "unexpected scalable ElementCount"); + assert(!VF.isScalable() && "unexpected scalable ElementCount"); if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = @@ -7762,7 +7791,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( ElementCount VF = ElementCount::getFixed(Range.Start); Plan->addVF(VF); RSO << "Initial VPlan for VF={" << VF; - for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) { + for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) { Plan->addVF(VF); RSO << "," << VF; } @@ -7986,7 +8015,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { if (AlsoPack && State.VF.isVector()) { // If we're constructing lane 0, initialize to start from undef. if (State.Instance->Lane == 0) { - assert(!State.VF.Scalable && "VF is assumed to be non scalable."); + assert(!State.VF.isScalable() && "VF is assumed to be non scalable."); Value *Undef = UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); @@ -7999,7 +8028,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // Generate scalar instances for all VF lanes of all UF parts, unless the // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF.Min; + unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane}, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a616de6eb4f076..679445455a4542 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -300,8 +300,9 @@ void VPRegionBlock::execute(VPTransformState *State) { for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) { State->Instance->Part = Part; - assert(!State->VF.Scalable && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.Min; Lane < VF; ++Lane) { + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { State->Instance->Lane = Lane; // Visit the VPBlocks connected to \p this, starting from it. 
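buildVPlanWithVPRecipes above registers every power-of-two VF in the half-open range [Range.Start, Range.End) with the plan, now by multiplying the ElementCount itself rather than its Min field. The candidate set as a plain enumeration, fixed-width only to match the assertions elsewhere in the patch:

  #include <vector>

  std::vector<unsigned> candidateVFs(unsigned Start, unsigned End) {
    std::vector<unsigned> VFs;
    for (unsigned VF = Start; VF < End; VF *= 2) // Start is a nonzero power of two
      VFs.push_back(VF);
    return VFs;   // e.g. Start=2, End=16 -> {2, 4, 8}
  }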
for (VPBlockBase *Block : RPOT) { @@ -388,7 +389,7 @@ void VPInstruction::generateInstruction(VPTransformState &State, Value *ScalarTC = State.TripCount; auto *Int1Ty = Type::getInt1Ty(Builder.getContext()); - auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.Min); + auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue()); Instruction *Call = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, "active.lane.mask"); @@ -840,14 +841,16 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; - assert(!VF.Scalable && "the code following assumes non scalables ECs"); - Value *VStart = VF.isScalar() ? CanonicalIV - : Builder.CreateVectorSplat(VF.Min, CanonicalIV, - "broadcast"); + assert(!VF.isScalable() && "the code following assumes non scalables ECs"); + Value *VStart = VF.isScalar() + ? CanonicalIV + : Builder.CreateVectorSplat(VF.getKnownMinValue(), + CanonicalIV, "broadcast"); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { SmallVector Indices; - for (unsigned Lane = 0; Lane < VF.Min; ++Lane) - Indices.push_back(ConstantInt::get(STy, Part * VF.Min + Lane)); + for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane) + Indices.push_back( + ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane)); // If VF == 1, there is only one iteration in the loop above, thus the // element pushed back into Indices is ConstantInt::get(STy, Part) Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 6eed236fc1493e..078b2ba1c70ac3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -151,14 +151,15 @@ struct VectorizerValueMap { /// \return True if the map has a scalar entry for \p Key and \p Instance. bool hasScalarValue(Value *Key, const VPIteration &Instance) const { assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.Min && "Queried Scalar Lane is too large."); - assert(!VF.Scalable && "VF is assumed to be non scalable."); + assert(Instance.Lane < VF.getKnownMinValue() && + "Queried Scalar Lane is too large."); + assert(!VF.isScalable() && "VF is assumed to be non scalable."); if (!hasAnyScalarValue(Key)) return false; const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.Min && + assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && "ScalarParts has wrong dimensions."); return Entry[Instance.Part][Instance.Lane] != nullptr; } @@ -197,7 +198,7 @@ struct VectorizerValueMap { // TODO: Consider storing uniform values only per-part, as they occupy // lane 0 only, keeping the other VF-1 redundant entries null. 
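The VectorizerValueMap assertions above check that a scalar entry is addressed by (Part, Lane) with Part < UF and Lane < the fixed-width VF, and that the stored parts have exactly those dimensions. A toy model of that shape, with a vector-of-vectors standing in for ScalarParts and void* for the mapped Value:

  #include <cassert>
  #include <vector>

  struct ScalarEntryModel {
    std::vector<std::vector<void *>> Parts; // UF x VF, null until set

    ScalarEntryModel(unsigned UF, unsigned VF)
        : Parts(UF, std::vector<void *>(VF, nullptr)) {}

    bool hasScalarValue(unsigned Part, unsigned Lane) const {
      assert(Part < Parts.size() && Lane < Parts[Part].size());
      return Parts[Part][Lane] != nullptr;
    }
  };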
for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.Min, nullptr); + Entry[Part].resize(VF.getKnownMinValue(), nullptr); ScalarMapStorage[Key] = Entry; } ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir index 550972e4a4f458..37a7b7bd010ddc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -135,34 +135,27 @@ body: | ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14 /* CC::al */, $noreg - ; CHECK: renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14 /* CC::al */, $noreg - ; CHECK: renamable $r4 = t2BICri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14 /* CC::al */, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - ; CHECK: renamable $r12 = t2SUBri killed renamable $r4, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: $vpr = VMSR_P0 $r5, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r3, $r12 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr + ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) ; CHECK: MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir index 96652c5d76e935..712faa59fb7d5c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extract-element.mir @@ -118,27 +118,16 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0 ; CHECK: $r0 = VMOVRS killed $s3, 14 /* CC::al */, $noreg, implicit killed $q0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir index 15166cece6acb3..9eb95d7e8072c0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/no-vpsel-liveout.mir @@ -115,27 +115,16 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed 
renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0 ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir new file mode 100644 index 00000000000000..f013cb2f861569 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions-vpt-liveout.mir @@ -0,0 +1,930 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define dso_local arm_aapcs_vfpcc i32 @mul_var_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) { + entry: + %cmp9.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv14 = phi i8* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i8* %lsr.iv to <4 x i8>* + %lsr.iv1416 = bitcast i8* %lsr.iv14 to <4 x i8>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* 
%lsr.iv13, i32 1, <4 x i1> %8, <4 x i8> undef) + %10 = zext <4 x i8> %wide.masked.load to <4 x i32> + %wide.masked.load12 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv1416, i32 1, <4 x i1> %8, <4 x i8> undef) + %11 = zext <4 x i8> %wide.masked.load12 to <4 x i32> + %12 = mul nuw nsw <4 x i32> %11, %10 + %13 = select <4 x i1> %8, <4 x i32> %12, <4 x i32> zeroinitializer + %14 = add <4 x i32> %vec.phi, %13 + %scevgep = getelementptr i8, i8* %lsr.iv, i32 4 + %scevgep15 = getelementptr i8, i8* %lsr.iv14, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + + define dso_local arm_aapcs_vfpcc i32 @add_var_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) { + entry: + %cmp10.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv15 = phi i8* [ %scevgep16, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i8* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv14 = bitcast i8* %lsr.iv to <4 x i8>* + %lsr.iv1517 = bitcast i8* %lsr.iv15 to <4 x i8>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv14, i32 1, <4 x i1> %8, <4 x i8> undef) + %10 = zext <4 x i8> %wide.masked.load to <4 x i32> + %wide.masked.load13 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %lsr.iv1517, i32 1, <4 x i1> %8, <4 x i8> undef) + %11 = zext <4 x i8> %wide.masked.load13 to <4 x i32> + %12 = add <4 x i32> %vec.phi, %10 + %13 = add <4 x i32> %12, %11 + %14 = select <4 x i1> %8, <4 x i32> %13, <4 x i32> %vec.phi + %scevgep = getelementptr i8, i8* %lsr.iv, i32 4 + %scevgep16 = getelementptr i8, i8* %lsr.iv15, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + + define dso_local arm_aapcs_vfpcc i32 @mul_var_i16(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { + entry: + %cmp9.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv14 = phi i16* [ 
%scevgep15, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1416 = bitcast i16* %lsr.iv14 to <4 x i16>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv13, i32 2, <4 x i1> %8, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load12 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1416, i32 2, <4 x i1> %8, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load12 to <4 x i32> + %12 = mul nsw <4 x i32> %11, %10 + %13 = select <4 x i1> %8, <4 x i32> %12, <4 x i32> zeroinitializer + %14 = add <4 x i32> %vec.phi, %13 + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep15 = getelementptr i16, i16* %lsr.iv14, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %17, %middle.block ] + ret i32 %res.0.lcssa + } + + define dso_local arm_aapcs_vfpcc i32 @add_var_i16(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) { + entry: + %cmp10.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv15 = phi i16* [ %scevgep16, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %14, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %15, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv14 = bitcast i16* %lsr.iv to <4 x i16>* + %lsr.iv1517 = bitcast i16* %lsr.iv15 to <4 x i16>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv14, i32 2, <4 x i1> %8, <4 x i16> undef) + %10 = sext <4 x i16> %wide.masked.load to <4 x i32> + %wide.masked.load13 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1517, i32 2, <4 x i1> %8, <4 x i16> undef) + %11 = sext <4 x i16> %wide.masked.load13 to <4 x i32> + %12 = add <4 x i32> %vec.phi, %10 + %13 = add <4 x i32> %12, %11 + %14 = select <4 x i1> %8, <4 x i32> %13, <4 x i32> %vec.phi + %scevgep = getelementptr i16, i16* %lsr.iv, i32 4 + %scevgep16 = getelementptr i16, i16* %lsr.iv15, i32 4 + %15 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %16 = icmp ne i32 %15, 0 + br i1 %16, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %17 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %14) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ 
%17, %middle.block ] + ret i32 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc i32 @mul_var_i32(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp8.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv13 = phi i32* [ %scevgep14, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv12 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1315 = bitcast i32* %lsr.iv13 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv12, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load11 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1315, i32 4, <4 x i1> %8, <4 x i32> undef) + %10 = mul nsw <4 x i32> %wide.masked.load11, %wide.masked.load + %11 = select <4 x i1> %8, <4 x i32> %10, <4 x i32> zeroinitializer + %12 = add <4 x i32> %vec.phi, %11 + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep14 = getelementptr i32, i32* %lsr.iv13, i32 4 + %13 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %14 = icmp ne i32 %13, 0 + br i1 %14, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %15, %middle.block ] + ret i32 %res.0.lcssa + } + + ; Function Attrs: norecurse nounwind readonly + define dso_local arm_aapcs_vfpcc i32 @add_var_i32(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 { + entry: + %cmp9.not = icmp eq i32 %N, 0 + %0 = add i32 %N, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph + + vector.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %b, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %a, %vector.ph ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ] + %6 = phi i32 [ %5, %vector.ph ], [ %13, %vector.body ] + %7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ] + %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>* + %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef) + %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef) + %10 = add <4 x i32> %wide.masked.load, %vec.phi + %11 = add <4 x i32> %10, 
%wide.masked.load12 + %12 = select <4 x i1> %8, <4 x i32> %11, <4 x i32> %vec.phi + %scevgep = getelementptr i32, i32* %lsr.iv, i32 4 + %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4 + %13 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1) + %14 = icmp ne i32 %13, 0 + br i1 %14, label %vector.body, label %middle.block + + middle.block: ; preds = %vector.body + %15 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12) + br label %for.cond.cleanup + + for.cond.cleanup: ; preds = %middle.block, %entry + %res.0.lcssa = phi i32 [ 0, %entry ], [ %15, %middle.block ] + ret i32 %res.0.lcssa + } + + declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) + declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + +... +--- +name: mul_var_i8 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: mul_var_i8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 0, $noreg :: (load 4 from %ir.lsr.iv13, align 1) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 0, $noreg :: (load 4 from %ir.lsr.iv1416, align 1) + ; CHECK: renamable $q1 = nuw nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef 
renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv13, align 1) + renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv1416, align 1) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = nuw nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: add_var_i8 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: add_var_i8 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 0, $noreg :: (load 4 from %ir.lsr.iv14, align 1) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 0, $noreg :: (load 4 from %ir.lsr.iv1517, align 1) + ; CHECK: renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + 
frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRBU32_post killed renamable $r0, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv14, align 1) + renamable $r1, renamable $q2 = MVE_VLDRBU32_post killed renamable $r1, 4, 1, renamable $vpr :: (load 4 from %ir.lsr.iv1517, align 1) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... +--- +name: mul_var_i16 +alignment: 2 +exposesReturnsTwice: false +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: mul_var_i16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: 
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv13, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, $noreg :: (load 8 from %ir.lsr.iv1416, align 2) + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv13, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1416, align 2) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: add_var_i16 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: add_var_i16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv14, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, $noreg :: (load 8 from %ir.lsr.iv1517, align 2) + ; CHECK: renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg 
+ frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv14, align 2) + renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv1517, align 2) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VADDi32 renamable $q0, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... +--- +name: mul_var_i32 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: mul_var_i32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 
0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv12, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1315, align 4) + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv12, align 4) + renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1315, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... 
+--- +name: add_var_i32 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: add_var_i32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + ; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + ; CHECK: bb.1.vector.ph: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: bb.2.vector.body (align 4): + ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1 + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: bb.3.middle.block: + ; CHECK: liveins: $q0 + ; CHECK: renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x50000000) + liveins: $r0, $r1, $r2, $lr + + tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 0, 4, implicit-def $itstate + renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate + tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate + + bb.1.vector.ph: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, 
$noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + t2DoLoopStart renamable $lr + + bb.2.vector.body (align 4): + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2 + + renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) + renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + MVE_VPST 8, implicit $vpr + renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q2, 1, killed renamable $vpr, killed renamable $q0 + t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.middle.block: + liveins: $q0 + + renamable $r0 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0 + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index a4961f51f32b80..0554742369fdcc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -4,38 +4,25 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr { ; CHECK-LABEL: one_loop_add_add_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: ittt eq -; CHECK-NEXT: moveq r0, #0 -; CHECK-NEXT: uxtbeq r0, r0 -; CHECK-NEXT: bxeq lr -; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: add.w r3, r2, #15 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #15 -; CHECK-NEXT: sub.w r12, r3, #16 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #4 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB0_2: @ %vector.body +; CHECK-NEXT: cbz r2, .LBB0_4 +; CHECK-NEXT: @ %bb.1: @ %vector.ph +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: dlstp.8 lr, r2 +; CHECK: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.8 r2 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 -; CHECK-NEXT: subs r2, #16 -; CHECK-NEXT: vadd.i8 q1, q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 -; CHECK-NEXT: vadd.i8 q1, q1, q2 -; CHECK-NEXT: le lr, .LBB0_2 -; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vpsel q0, q1, q0 -; CHECK-NEXT: vaddv.u8 r0, q0 -; CHECK-NEXT: pop.w {r7, lr} -; CHECK-NEXT: uxtb r0, r0 -; CHECK-NEXT: bx lr +; CHECK-NEXT: vldrb.u8 q1, [r1], #16 +; CHECK-NEXT: vldrb.u8 q2, [r0], #16 +; CHECK-NEXT: vadd.i8 q0, q2, q1 +; CHECK-NEXT: vaddv.u8 
r12, q0 +; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup +; CHECK-NEXT: uxtb.w r0, r12 +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: uxtb.w r0, r12 +; CHECK-NEXT: pop {r7, pc} entry: %cmp11 = icmp eq i32 %N, 0 br i1 %cmp11, label %for.cond.cleanup, label %vector.ph @@ -56,19 +43,18 @@ vector.body: ; preds = %vector.body, %vecto %i2 = getelementptr inbounds i8, i8* %b, i32 %index %i3 = bitcast i8* %i2 to <16 x i8>* %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) - %i4 = add <16 x i8> %wide.masked.load, %vec.phi - %i5 = add <16 x i8> %i4, %wide.masked.load16 + %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16 + %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi + %i6 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i5) %index.next = add i32 %index, 16 - %i6 = icmp eq i32 %index.next, %n.vec - br i1 %i6, label %middle.block, label %vector.body + %i7 = icmp eq i32 %index.next, %n.vec + br i1 %i7, label %middle.block, label %vector.body middle.block: ; preds = %vector.body - %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi - %i8 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %i7) br label %for.cond.cleanup for.cond.cleanup: ; preds = %middle.block, %entry - %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ] + %res.0.lcssa = phi i8 [ 0, %entry ], [ %i6, %middle.block ] ret i8 %res.0.lcssa } @@ -89,7 +75,7 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(i8* nocaptu ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #3 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB1_2: @ %vector.body +; CHECK: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -155,16 +141,26 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(i8* nocaptur ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK-NEXT: .LBB2_2: @ %vector.body +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #15 +; CHECK-NEXT: sub.w r12, r3, #16 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r1], #16 -; CHECK-NEXT: vldrb.u8 q2, [r0], #16 +; CHECK-NEXT: vctp.8 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u8 q1, [r1], #16 +; CHECK-NEXT: vldrbt.u8 q2, [r0], #16 +; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: vsub.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -215,16 +211,26 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(i8* nocaptu ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB3_2: @ %vector.body +; CHECK-NEXT: adds r3, r2, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: sub.w r12, r3, #8 +; CHECK-NEXT: movs r3, 
#1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r0], #8 -; CHECK-NEXT: vldrb.u16 q2, [r1], #8 +; CHECK-NEXT: vctp.16 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 +; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vsub.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB3_2 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -277,16 +283,26 @@ define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(i8* nocaptur ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.8 lr, r2 -; CHECK-NEXT: .LBB4_2: @ %vector.body +; CHECK-NEXT: add.w r3, r2, #15 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #15 +; CHECK-NEXT: sub.w r12, r3, #16 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #4 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u8 q1, [r0], #16 -; CHECK-NEXT: vldrb.u8 q2, [r1], #16 +; CHECK-NEXT: vctp.8 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u8 q1, [r0], #16 +; CHECK-NEXT: vldrbt.u8 q2, [r1], #16 +; CHECK-NEXT: subs r2, #16 ; CHECK-NEXT: vmul.i8 q1, q2, q1 -; CHECK-NEXT: vadd.i8 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB4_2 +; CHECK-NEXT: vadd.i8 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u8 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: uxtb r0, r0 @@ -337,16 +353,26 @@ define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(i8* nocaptu ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK-NEXT: .LBB5_2: @ %vector.body +; CHECK-NEXT: adds r3, r2, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: sub.w r12, r3, #8 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #3 +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u16 q1, [r0], #8 -; CHECK-NEXT: vldrb.u16 q2, [r1], #8 +; CHECK-NEXT: vctp.16 r2 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u16 q1, [r0], #8 +; CHECK-NEXT: vldrbt.u16 q2, [r1], #8 +; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vmul.i16 q1, q2, q1 -; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: letp lr, .LBB5_2 +; CHECK-NEXT: vadd.i16 q1, q1, q0 +; CHECK-NEXT: le lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u16 r0, q0 ; CHECK-NEXT: pop.w {r7, lr} ; CHECK-NEXT: sxth r0, r0 @@ -406,7 +432,7 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB6_2: @ %vector.body +; CHECK: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vmov q0, q1 @@ -422,19 +448,25 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; 
CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: cbz r2, .LBB6_7 ; CHECK-NEXT: @ %bb.4: @ %vector.ph47 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, r2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: vmov.32 q1[0], r12 -; CHECK-NEXT: .LBB6_5: @ %vector.body46 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrb.u32 q0, [r0], #4 -; CHECK-NEXT: vldrb.u32 q2, [r1], #4 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrbt.u32 q0, [r0], #4 +; CHECK-NEXT: vldrbt.u32 q2, [r1], #4 +; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB6_5 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB6_5 ; CHECK-NEXT: @ %bb.6: @ %middle.block44 -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r12, q0 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup7 ; CHECK-NEXT: mov r0, r12 @@ -527,7 +559,7 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(i8* nocaptur ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB7_2: @ %vector.body +; CHECK: .LBB7_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.16 r2 ; CHECK-NEXT: vmov q0, q1 @@ -643,22 +675,31 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_5: @ %vector.ph +; CHECK-NEXT: adds r1, r3, #3 +; CHECK-NEXT: movs r2, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: movw r1, :lower16:days ; CHECK-NEXT: movt r1, :upper16:days ; CHECK-NEXT: movs r2, #52 ; CHECK-NEXT: mla r1, r4, r2, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: vdup.32 q0, r2 -; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vmov.32 q0[0], r0 ; CHECK-NEXT: subs r0, r3, #1 -; CHECK-NEXT: dlstp.32 lr, r0 -; CHECK-NEXT: .LBB8_6: @ %vector.body +; CHECK: .LBB8_6: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB8_6 +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r1], #16 +; CHECK-NEXT: subs r0, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB8_6 ; CHECK-NEXT: @ %bb.7: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r4, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 34701aba6324a9..15aed3bd4e17a3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -53,24 +53,33 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: subs r4, r2, r6 -; ENABLED-NEXT: vmov.i32 q0, #0x0 +; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r8, r7, r0, lsr #2 +; ENABLED-NEXT: sub.w r0, r12, r6 +; ENABLED-NEXT: bic r0, r0, #3 +; ENABLED-NEXT: subs r0, #4 +; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 -; ENABLED-NEXT: 
dlstp.32 lr, r4 +; ENABLED-NEXT: dls lr, r0 ; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload -; ENABLED-NEXT: .LBB0_6: @ %vector.body +; ENABLED: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 -; ENABLED-NEXT: vldrh.s32 q1, [r0], #8 -; ENABLED-NEXT: vldrh.s32 q2, [r7], #8 +; ENABLED-NEXT: vctp.32 r4 +; ENABLED-NEXT: vmov q0, q1 +; ENABLED-NEXT: vpstt +; ENABLED-NEXT: vldrht.s32 q1, [r0], #8 +; ENABLED-NEXT: vldrht.s32 q2, [r7], #8 ; ENABLED-NEXT: mov lr, r8 ; ENABLED-NEXT: vmul.i32 q1, q2, q1 ; ENABLED-NEXT: sub.w r8, r8, #1 ; ENABLED-NEXT: vshl.s32 q1, r5 -; ENABLED-NEXT: vadd.i32 q0, q1, q0 -; ENABLED-NEXT: letp lr, .LBB0_6 +; ENABLED-NEXT: subs r4, #4 +; ENABLED-NEXT: vadd.i32 q1, q1, q0 +; ENABLED-NEXT: le lr, .LBB0_6 ; ENABLED-NEXT: @ %bb.7: @ %middle.block ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 ; ENABLED-NEXT: .LBB0_8: @ %for.end17 @@ -103,7 +112,7 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: adds r6, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 ; NOREDUCTIONS-NEXT: cmp r6, r3 -; NOREDUCTIONS-NEXT: beq .LBB0_8 +; NOREDUCTIONS: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 @@ -115,24 +124,33 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: subs r4, r2, r6 -; NOREDUCTIONS-NEXT: vmov.i32 q0, #0x0 +; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r8, r7, r0, lsr #2 +; NOREDUCTIONS-NEXT: sub.w r0, r12, r6 +; NOREDUCTIONS-NEXT: bic r0, r0, #3 +; NOREDUCTIONS-NEXT: subs r0, #4 +; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 -; NOREDUCTIONS-NEXT: dlstp.32 lr, r4 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: dls lr, r0 +; NOREDUCTIONS: ldr r0, [sp] @ 4-byte Reload ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 -; NOREDUCTIONS-NEXT: vldrh.s32 q1, [r0], #8 -; NOREDUCTIONS-NEXT: vldrh.s32 q2, [r7], #8 +; NOREDUCTIONS-NEXT: vctp.32 r4 +; NOREDUCTIONS-NEXT: vmov q0, q1 +; NOREDUCTIONS-NEXT: vpstt +; NOREDUCTIONS-NEXT: vldrht.s32 q1, [r0], #8 +; NOREDUCTIONS-NEXT: vldrht.s32 q2, [r7], #8 ; NOREDUCTIONS-NEXT: mov lr, r8 ; NOREDUCTIONS-NEXT: vmul.i32 q1, q2, q1 ; NOREDUCTIONS-NEXT: sub.w r8, r8, #1 ; NOREDUCTIONS-NEXT: vshl.s32 q1, r5 -; NOREDUCTIONS-NEXT: vadd.i32 q0, q1, q0 -; NOREDUCTIONS-NEXT: letp lr, .LBB0_6 +; NOREDUCTIONS-NEXT: subs r4, #4 +; NOREDUCTIONS-NEXT: vadd.i32 q1, q1, q0 +; NOREDUCTIONS-NEXT: le lr, .LBB0_6 ; NOREDUCTIONS-NEXT: @ %bb.7: @ %middle.block ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 +; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 ; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir index 4f80869de3ccb2..cdc9d7e7be9c69 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-add-operand-liveout.mir @@ -122,28 +122,18 @@ body: | ; CHECK: frame-setup 
CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $r0, dead $cpsr = tADDi3 killed renamable $r2, 4, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index c6f64a8464bdc7..6628df20f2024e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -10,19 +10,28 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r3, r2, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: bic r3, r3, #3 +; CHECK-NEXT: sub.w r12, r3, #4 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB0_2: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; 
CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q2, [r1], #16 ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB0_2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -78,17 +87,26 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB1_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB1_2: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB1_2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -140,17 +158,26 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: .LBB2_1: @ %vector.ph ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: adds r1, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: bic r1, r1, #3 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: subs r1, #4 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 ; CHECK-NEXT: movs r1, #0 -; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK-NEXT: .LBB2_2: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vctp.32 r2 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0], #16 ; CHECK-NEXT: adds r1, #4 -; CHECK-NEXT: vadd.i32 q1, q0, q1 -; CHECK-NEXT: letp lr, .LBB2_2 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block -; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vpsel q0, q0, q1 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -203,7 +230,7 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: .LBB3_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB3_2: @ %vector.body +; CHECK: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -260,7 +287,7 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB4_2: @ %vector.body +; CHECK: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #4 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 @@ -317,7 +344,7 @@ define dso_local arm_aapcs_vfpcc void 
@vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: .LBB5_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.8 lr, r3 -; CHECK-NEXT: .LBB5_2: @ %vector.body +; CHECK: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #16 ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 @@ -377,7 +404,7 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt ; CHECK-NEXT: .LBB6_1: @ %vector.ph ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: dlstp.16 lr, r3 -; CHECK-NEXT: .LBB6_2: @ %vector.body +; CHECK: .LBB6_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 595d70b715acd5..7578b429790bec 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -425,13 +425,8 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r12 = t2MOVi 0, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2WLS killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1 ; CHECK: tB %bb.4, 14 /* CC::al */, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -441,18 +436,15 @@ body: | ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: renamable $vpr = MVE_VCTP32 $r2, 0, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) - ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) + ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, killed renamable $vpr, undef renamable $q1 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: successors: %bb.4(0x80000000) ; CHECK: 
liveins: $q0, $q1, $r3 @@ -501,7 +493,7 @@ body: | renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg MVE_VPST 8, implicit $vpr - renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, renamable $vpr, undef renamable $q1 + renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 1, renamable $vpr, undef renamable $q1 renamable $lr = t2LoopDec killed renamable $lr, 1 t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr tB %bb.3, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir index d91556e3e70b93..e377b06fea9f89 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-opcode-liveout.mir @@ -133,23 +133,21 @@ body: | ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r3 ; CHECK: $r12 = tMOVr killed $r3, 14 /* CC::al */, $noreg ; CHECK: $r3 = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: dead $lr = MVE_DLSTP_32 renamable $r3 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r12 = nsw t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2, $r3 ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r2, 1, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir index 337816146e5f0b..05bfdbb2fc0f8d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wrong-vctp-operand-liveout.mir @@ -119,28 +119,18 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup 
CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: dead $lr = t2DLS renamable $r12 - ; CHECK: $r3 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $q1, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr :: (load 8 from %ir.lsr.iv17, align 2) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr :: (load 8 from %ir.lsr.iv1820, align 2) - ; CHECK: $lr = tMOVr $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg :: (load 8 from %ir.lsr.iv17, align 2) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg :: (load 8 from %ir.lsr.iv1820, align 2) ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14 /* CC::al */, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg ; CHECK: renamable $q1 = MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q0, $q1, $r2 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r2, 0, $noreg diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll index 535d7a1c38cb7e..b5efcd1342db74 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -9,29 +9,27 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI0_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 -; CHECK-NEXT: vldrw.u32 q3, [q0, #80]! +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q2, q1 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q3, [q0, #80]! 
+; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q1, q3, q1 -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpsel q0, q1, q2 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] ; CHECK-NEXT: pop {r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 4294967228 @ 0xffffffbc -; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 -; CHECK-NEXT: .long 4294967268 @ 0xffffffe4 -; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 entry: ; preds = %middle. %add.us.us = add i32 4, %n %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us @@ -77,20 +75,26 @@ define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly ; CHECK-NEXT: adr r0, .LCPI1_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r12, r3, #4 -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: vmov.i32 q0, #0x14 -; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1, q1, uxtw #2] -; CHECK-NEXT: vldrw.u32 q4, [r4], #16 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vldrwt.u32 q4, [r4], #16 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q2, q2, q4 -; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q2, [r1, q1, uxtw #2] ; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: letp lr, .LBB1_1 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vpsel q0, q2, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r12, lsl #2] ; CHECK-NEXT: vpop {d8, d9} @@ -144,20 +148,26 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: adr r0, .LCPI2_0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: movw lr, #1250 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: dlstp.32 lr, r3 -; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [q0, #80]! -; CHECK-NEXT: vadd.i32 q3, q3, q1 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q1, [q0, #80]! 
+; CHECK-NEXT: vadd.i32 q1, q3, q1 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block -; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vpsel q0, q1, q3 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: str.w r0, [r2, r1, lsl #2] ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll index 311a06a6757711..5669fdf38fee01 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -9,22 +9,32 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: vidup.u32 q2, r6, #1 +; CHECK-NEXT: cmp r1, #4 +; CHECK-NEXT: it ge +; CHECK-NEXT: movge.w r12, #4 +; CHECK-NEXT: sub.w r6, r1, r12 +; CHECK-NEXT: adds r6, #3 +; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: add.w lr, lr, r6, lsr #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: vmov.i32 q3, #0x4 ; CHECK-NEXT: mov r12, r1 -; CHECK-NEXT: dlstp.32 lr, r12 -; CHECK-NEXT: .LBB0_1: @ %do.body +; CHECK-NEXT: dls lr, lr +; CHECK: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r0], #16 -; CHECK-NEXT: vcmp.f32 ge, q1, q4 -; CHECK-NEXT: vpstt +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpstttt +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vcmpt.f32 ge, q1, q4 ; CHECK-NEXT: vmovt q1, q4 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_1 ; CHECK-NEXT: vdup.32 q3, r1 @@ -38,15 +48,6 @@ define void @arm_min_helium_f32(float* %pSrc, i32 %blockSize, float* nocapture % ; CHECK-NEXT: vstr s8, [r2] ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r6, r7, pc} -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 -; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 0x5368d4a5 @ float 9.99999995E+11 entry: %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 1) %1 = extractvalue { <4 x i32>, i32 } %0, 0 diff --git a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml index f1d14847699cb4..23262e0ff5e40f 100644 --- a/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml +++ b/llvm/test/tools/yaml2obj/ELF/DWARF/debug-abbrev.yaml @@ -277,9 +277,12 @@ DWARF: ## ^- abbreviation code (ULEB128) 0x04 ## ^- abbreviation code (ULEB128) 0x04 ## -# CODE-NEXT: 0x00000010 2e000000 052e0000 00062e00 0000 -## ^- abbreviation code ULEB128 -## ^- abbreviation code ULEB128 +# CODE-NEXT: 0x00000010 2e000000 052e0000 00062e00 00000001 +## ^- abbreviation code ULEB128 +## ^- abbreviation code ULEB128 +## ^- abbreviation code ULEB128 +# CODE-NEXT: 0x00000020 11000000 022e0000 0000 +## ^- abbreviation code ULEB128 --- !ELF FileHeader: @@ -303,6 +306,12 @@ DWARF: Children: DW_CHILDREN_no - Tag: DW_TAG_subprogram Children: DW_CHILDREN_no + - Table: + ## Test that the abbrev codes in a new table start from 1 by default. 
+ - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_no + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no ## i) Test that yaml2obj emits an error message when there are non-empty compilation units ## and multiple abbrev tables are assigned the same ID. diff --git a/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp b/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp index f1c4a8bea1f565..f76b8db602ec70 100644 --- a/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp +++ b/llvm/unittests/CodeGen/ScalableVectorMVTsTest.cpp @@ -71,8 +71,8 @@ TEST(ScalableVectorMVTsTest, HelperFuncs) { // Check fields inside llvm::ElementCount EltCnt = Vnx4i32.getVectorElementCount(); - EXPECT_EQ(EltCnt.Min, 4U); - ASSERT_TRUE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 4U); + ASSERT_TRUE(EltCnt.isScalable()); // Check that fixed-length vector types aren't scalable. EVT V8i32 = EVT::getVectorVT(Ctx, MVT::i32, 8); @@ -82,8 +82,8 @@ TEST(ScalableVectorMVTsTest, HelperFuncs) { // Check that llvm::ElementCount works for fixed-length types. EltCnt = V8i32.getVectorElementCount(); - EXPECT_EQ(EltCnt.Min, 8U); - ASSERT_FALSE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 8U); + ASSERT_FALSE(EltCnt.isScalable()); } TEST(ScalableVectorMVTsTest, IRToVTTranslation) { diff --git a/llvm/unittests/IR/VectorTypesTest.cpp b/llvm/unittests/IR/VectorTypesTest.cpp index b28e445c97a795..7525ee9872b7ac 100644 --- a/llvm/unittests/IR/VectorTypesTest.cpp +++ b/llvm/unittests/IR/VectorTypesTest.cpp @@ -119,8 +119,8 @@ TEST(VectorTypesTest, FixedLength) { EXPECT_EQ(ConvTy->getElementType()->getScalarSizeInBits(), 64U); EltCnt = V8Int64Ty->getElementCount(); - EXPECT_EQ(EltCnt.Min, 8U); - ASSERT_FALSE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 8U); + ASSERT_FALSE(EltCnt.isScalable()); } TEST(VectorTypesTest, Scalable) { @@ -215,8 +215,8 @@ TEST(VectorTypesTest, Scalable) { EXPECT_EQ(ConvTy->getElementType()->getScalarSizeInBits(), 64U); EltCnt = ScV8Int64Ty->getElementCount(); - EXPECT_EQ(EltCnt.Min, 8U); - ASSERT_TRUE(EltCnt.Scalable); + EXPECT_EQ(EltCnt.getKnownMinValue(), 8U); + ASSERT_TRUE(EltCnt.isScalable()); } TEST(VectorTypesTest, BaseVectorType) { @@ -250,7 +250,7 @@ TEST(VectorTypesTest, BaseVectorType) { // test I == J VectorType *VI = VTys[I]; ElementCount ECI = VI->getElementCount(); - EXPECT_EQ(isa(VI), ECI.Scalable); + EXPECT_EQ(isa(VI), ECI.isScalable()); for (size_t J = I + 1, JEnd = VTys.size(); J < JEnd; ++J) { // test I < J