diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 74471cf6f222fb..9f477aa81b0c0f 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -270,7 +270,7 @@ class fir_AllocatableOp traits = []> : bool hasLenParams() { return bool{getAttr(lenpName())}; } unsigned numLenParams() { - if (auto val = getAttrOfType(lenpName())) + if (auto val = (*this)->getAttrOfType(lenpName())) return val.getInt(); return 0; } @@ -291,7 +291,7 @@ class fir_AllocatableOp traits = []> : /// Get the input type of the allocation mlir::Type getInType() { - return getAttrOfType(inType()).getValue(); + return (*this)->getAttrOfType(inType()).getValue(); } }]; @@ -567,7 +567,7 @@ class fir_SwitchTerminatorOp traits = []> : // The number of destination conditions that may be tested unsigned getNumConditions() { - return getAttrOfType(getCasesAttr()).size(); + return (*this)->getAttrOfType(getCasesAttr()).size(); } // The selector is the value being tested to determine the destination @@ -577,7 +577,7 @@ class fir_SwitchTerminatorOp traits = []> : } // The number of blocks that may be branched to - unsigned getNumDest() { return getOperation()->getNumSuccessors(); } + unsigned getNumDest() { return (*this)->getNumSuccessors(); } llvm::Optional getCompareOperands(unsigned cond); llvm::Optional> getCompareOperands( @@ -1561,11 +1561,11 @@ def fir_CoordinateOp : fir_Op<"coordinate_of", [NoSideEffect]> { let parser = "return parseCoordinateOp(parser, result);"; let printer = [{ - p << getOperationName() << ' ' << getOperation()->getOperands(); + p << getOperationName() << ' ' << (*this)->getOperands(); p.printOptionalAttrDict(getAttrs(), /*elidedAttrs=*/{baseType()}); p << " : "; - p.printFunctionalType(getOperation()->getOperandTypes(), - getOperation()->getResultTypes()); + p.printFunctionalType((*this)->getOperandTypes(), + (*this)->getResultTypes()); }]; let verifier = [{ @@ -1940,9 +1940,9 @@ def fir_LoopOp : region_Op<"do_loop", return getOperands().drop_front(getNumControlOperands()); } - void setLowerBound(Value bound) { getOperation()->setOperand(0, bound); } - void setUpperBound(Value bound) { getOperation()->setOperand(1, bound); } - void setStep(Value step) { getOperation()->setOperand(2, step); } + void setLowerBound(Value bound) { (*this)->setOperand(0, bound); } + void setUpperBound(Value bound) { (*this)->setOperand(1, bound); } + void setStep(Value step) { (*this)->setOperand(2, step); } /// Number of region arguments for loop-carried values unsigned getNumRegionIterArgs() { @@ -1952,18 +1952,18 @@ def fir_LoopOp : region_Op<"do_loop", unsigned getNumControlOperands() { return 3; } /// Does the operation hold operands for loop-carried values bool hasIterOperands() { - return getOperation()->getNumOperands() > getNumControlOperands(); + return (*this)->getNumOperands() > getNumControlOperands(); } /// Get Number of loop-carried values unsigned getNumIterOperands() { - return getOperation()->getNumOperands() - getNumControlOperands(); + return (*this)->getNumOperands() - getNumControlOperands(); } /// Get the body of the loop mlir::Block *getBody() { return ®ion().front(); } void setUnordered() { - getOperation()->setAttr(unorderedAttrName(), + (*this)->setAttr(unorderedAttrName(), mlir::UnitAttr::get(getContext())); } }]; @@ -2062,9 +2062,9 @@ def fir_IterWhileOp : region_Op<"iterate_while", return getOperands().drop_front(getNumControlOperands()); } - void setLowerBound(Value bound) 
{ getOperation()->setOperand(0, bound); } - void setUpperBound(Value bound) { getOperation()->setOperand(1, bound); } - void setStep(mlir::Value step) { getOperation()->setOperand(2, step); } + void setLowerBound(Value bound) { (*this)->setOperand(0, bound); } + void setUpperBound(Value bound) { (*this)->setOperand(1, bound); } + void setStep(mlir::Value step) { (*this)->setOperand(2, step); } /// Number of region arguments for loop-carried values unsigned getNumRegionIterArgs() { @@ -2074,11 +2074,11 @@ def fir_IterWhileOp : region_Op<"iterate_while", unsigned getNumControlOperands() { return 3; } /// Does the operation hold operands for loop-carried values bool hasIterOperands() { - return getOperation()->getNumOperands() > getNumControlOperands(); + return (*this)->getNumOperands() > getNumControlOperands(); } /// Get Number of loop-carried values unsigned getNumIterOperands() { - return getOperation()->getNumOperands() - getNumControlOperands(); + return (*this)->getNumOperands() - getNumControlOperands(); } }]; } @@ -2705,7 +2705,7 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { p << " : "; p.printType(getType()); if (hasInitializationBody()) - p.printRegion(getOperation()->getRegion(0), /*printEntryBlockArgs=*/false, + p.printRegion((*this)->getRegion(0), /*printEntryBlockArgs=*/false, /*printBlockTerminators=*/true); }]; @@ -2754,7 +2754,7 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { void appendInitialValue(mlir::Operation *op); /// A GlobalOp has one region. - mlir::Region &getRegion() { return getOperation()->getRegion(0); } + mlir::Region &getRegion() { return (*this)->getRegion(0); } /// A GlobalOp has one block. mlir::Block &getBlock() { return getRegion().front(); } @@ -2763,7 +2763,7 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> { static mlir::ParseResult verifyValidLinkage(StringRef linkage); bool hasInitializationBody() { - return (getOperation()->getNumRegions() == 1) && !getRegion().empty() && + return ((*this)->getNumRegions() == 1) && !getRegion().empty() && !isa(getBlock().front()); } @@ -2869,7 +2869,7 @@ def fir_DispatchTableOp : fir_Op<"dispatch_table", mlir::SymbolTable::getSymbolAttrName()).getValue(); p << getOperationName() << " @" << tableName; - Region &body = getOperation()->getRegion(0); + Region &body = (*this)->getRegion(0); if (!body.empty()) p.printRegion(body, /*printEntryBlockArgs=*/false, /*printBlockTerminators=*/false); @@ -2900,7 +2900,7 @@ def fir_DispatchTableOp : fir_Op<"dispatch_table", void appendTableEntry(mlir::Operation *op); mlir::Region &getRegion() { - return this->getOperation()->getRegion(0); + return (*this)->getRegion(0); } mlir::Block &getBlock() { diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 6724a4019030d8..b29eb589e2d727 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16095,6 +16095,81 @@ Arguments: """""""""" The argument to this intrinsic must be a vector of floating-point values. +'``llvm.experimental.vector.insert``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. You can use ``llvm.experimental.vector.insert`` +to insert a fixed-width vector into a scalable vector, but not the other way +around. 
+ +:: + + declare @llvm.experimental.vector.insert.v4f32( %vec, <4 x float> %subvec, i64 %idx) + declare @llvm.experimental.vector.insert.v2f64( %vec, <2 x double> %subvec, i64 %idx) + +Overview: +""""""""" + +The '``llvm.experimental.vector.insert.*``' intrinsics insert a vector into another vector +starting from a given index. The return type matches the type of the vector we +insert into. Conceptually, this can be used to build a scalable vector out of +non-scalable vectors. + +Arguments: +"""""""""" + +The ``vec`` is the vector which ``subvec`` will be inserted into. +The ``subvec`` is the vector that will be inserted. + +``idx`` represents the starting element number at which ``subvec`` will be +inserted. ``idx`` must be a constant multiple of ``subvec``'s known minimum +vector length. If ``subvec`` is a scalable vector, ``idx`` is first scaled by +the runtime scaling factor of ``subvec``. The elements of ``vec`` starting at +``idx`` are overwritten with ``subvec``. Elements ``idx`` through (``idx`` + +num_elements(``subvec``) - 1) must be valid ``vec`` indices. If this condition +cannot be determined statically but is false at runtime, then the result vector +is undefined. + + +'``llvm.experimental.vector.extract``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. You can use +``llvm.experimental.vector.extract`` to extract a fixed-width vector from a +scalable vector, but not the other way around. + +:: + + declare <4 x float> @llvm.experimental.vector.extract.v4f32( %vec, i64 %idx) + declare <2 x double> @llvm.experimental.vector.extract.v2f64( %vec, i64 %idx) + +Overview: +""""""""" + +The '``llvm.experimental.vector.extract.*``' intrinsics extract a vector from +within another vector starting from a given index. The return type must be +explicitly specified. Conceptually, this can be used to decompose a scalable +vector into non-scalable parts. + +Arguments: +"""""""""" + +The ``vec`` is the vector from which we will extract a subvector. + +The ``idx`` specifies the starting element number within ``vec`` from which a +subvector is extracted. ``idx`` must be a constant multiple of the known-minimum +vector length of the result type. If the result type is a scalable vector, +``idx`` is first scaled by the result type's runtime scaling factor. Elements +``idx`` through (``idx`` + num_elements(result_type) - 1) must be valid vector +indices. If this condition cannot be determined statically but is false at +runtime, then the result vector is undefined. The ``idx`` parameter must be a +vector index constant type (for most targets this will be an integer pointer +type). 
+ Matrix Intrinsics ----------------- diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index d454c4ea8d9b70..d73155aa2f2fad 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1362,7 +1362,7 @@ class SelectionDAG { ISD::MemIndexedMode AM); SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, - ISD::MemIndexType IndexType); + ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy); SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 1e71d110730e0e..aa81a31bf23ad4 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -512,6 +512,7 @@ BEGIN_TWO_BYTE_PACK() class LoadSDNodeBitfields { friend class LoadSDNode; friend class MaskedLoadSDNode; + friend class MaskedGatherSDNode; uint16_t : NumLSBaseSDNodeBits; @@ -2451,12 +2452,18 @@ class MaskedGatherSDNode : public MaskedGatherScatterSDNode { MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, EVT MemVT, MachineMemOperand *MMO, - ISD::MemIndexType IndexType) + ISD::MemIndexType IndexType, ISD::LoadExtType ETy) : MaskedGatherScatterSDNode(ISD::MGATHER, Order, dl, VTs, MemVT, MMO, - IndexType) {} + IndexType) { + LoadSDNodeBits.ExtTy = ETy; + } const SDValue &getPassThru() const { return getOperand(1); } + ISD::LoadExtType getExtensionType() const { + return ISD::LoadExtType(LoadSDNodeBits.ExtTy); + } + static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MGATHER; } diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index db215094a7e498..c2b3446d159f2f 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -879,6 +879,10 @@ class IRBuilderBase { Type *ResultType, const Twine &Name = ""); + /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale + /// will be the same type as that of \p Scaling. + Value *CreateVScale(Constant *Scaling, const Twine &Name = ""); + /// Create a call to intrinsic \p ID with 1 operand which is mangled on its /// type. 
CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 710479103459b5..eb6c408b4f85b0 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1614,6 +1614,15 @@ def int_preserve_struct_access_index : DefaultAttrsIntrinsic<[llvm_anyptr_ty], //===---------- Intrinsics to query properties of scalable vectors --------===// def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; +//===---------- Intrinsics to perform subvector insertion/extraction ------===// +def int_experimental_vector_insert : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, llvm_i64_ty], + [IntrNoMem, ImmArg>]>; + +def int_experimental_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_i64_ty], + [IntrNoMem, ImmArg>]>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 5054ed81fcfc23..4ab0c60399f2ae 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -787,6 +787,16 @@ static void collectCallSiteParameters(const MachineInstr *CallMI, (void)InsertedReg; } + // Do not emit CSInfo for undef forwarding registers. + for (auto &MO : CallMI->uses()) { + if (!MO.isReg() || !MO.isUndef()) + continue; + auto It = ForwardedRegWorklist.find(MO.getReg()); + if (It == ForwardedRegWorklist.end()) + continue; + ForwardedRegWorklist.erase(It); + } + // We erase, from the ForwardedRegWorklist, those forwarding registers for // which we successfully describe a loaded value (by using // the describeLoadedValue()). For those remaining arguments in the working diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8f0c9542b3e7a2..212e0a2ea9884b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -932,6 +932,33 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const { return false; } +static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) { + if (!ScalarTy.isSimple()) + return false; + + uint64_t MaskForTy = 0ULL; + switch (ScalarTy.getSimpleVT().SimpleTy) { + case MVT::i8: + MaskForTy = 0xFFULL; + break; + case MVT::i16: + MaskForTy = 0xFFFFULL; + break; + case MVT::i32: + MaskForTy = 0xFFFFFFFFULL; + break; + default: + return false; + break; + } + + APInt Val; + if (ISD::isConstantSplatVector(N, Val)) + return Val.getLimitedValue() == MaskForTy; + + return false; +} + // Returns the SDNode if it is a constant float BuildVector // or constant float. 
static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) { @@ -5622,6 +5649,28 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } + // fold (and (masked_gather x)) -> (zext_masked_gather x) + if (auto *GN0 = dyn_cast(N0)) { + EVT MemVT = GN0->getMemoryVT(); + EVT ScalarVT = MemVT.getScalarType(); + + if (SDValue(GN0, 0).hasOneUse() && + isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) && + TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { + SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), + GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; + + SDValue ZExtLoad = DAG.getMaskedGather( + DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops, + GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD); + + CombineTo(N, ZExtLoad); + AddToWorklist(ZExtLoad.getNode()); + // Avoid recheck of N. + return SDValue(N, 0); + } + } + // fold (and (load x), 255) -> (zextload x, i8) // fold (and (extload x, i16), 255) -> (zextload x, i8) // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) @@ -9499,14 +9548,16 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) { SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), PassThru.getValueType(), DL, Ops, - MGT->getMemOperand(), MGT->getIndexType()); + MGT->getMemOperand(), MGT->getIndexType(), + MGT->getExtensionType()); } if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) { SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other), PassThru.getValueType(), DL, Ops, - MGT->getMemOperand(), MGT->getIndexType()); + MGT->getMemOperand(), MGT->getIndexType(), + MGT->getExtensionType()); } return SDValue(); @@ -11595,6 +11646,25 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { } } + // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x) + if (auto *GN0 = dyn_cast(N0)) { + if (SDValue(GN0, 0).hasOneUse() && + ExtVT == GN0->getMemoryVT() && + TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { + SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), + GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; + + SDValue ExtLoad = DAG.getMaskedGather( + DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops, + GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD); + + CombineTo(N, ExtLoad); + CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); + AddToWorklist(ExtLoad.getNode()); + return SDValue(N, 0); // Return N so it doesn't get rechecked! 
+ } + } + // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 8468f51a922cf7..5c8a562ed9d7d3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -679,12 +679,17 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { assert(NVT == ExtPassThru.getValueType() && "Gather result type and the passThru argument type should be the same"); + ISD::LoadExtType ExtType = N->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + ExtType = ISD::EXTLOAD; + SDLoc dl(N); SDValue Ops[] = {N->getChain(), ExtPassThru, N->getMask(), N->getBasePtr(), N->getIndex(), N->getScale() }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other), N->getMemoryVT(), dl, Ops, - N->getMemOperand(), N->getIndexType()); + N->getMemOperand(), N->getIndexType(), + ExtType); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1525543a60b67e..86a1f6bff9f7f9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1748,6 +1748,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue Scale = MGT->getScale(); EVT MemoryVT = MGT->getMemoryVT(); Align Alignment = MGT->getOriginalAlign(); + ISD::LoadExtType ExtType = MGT->getExtensionType(); // Split Mask operand SDValue MaskLo, MaskHi; @@ -1783,11 +1784,11 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo, - MMO, MGT->getIndexType()); + MMO, MGT->getIndexType(), ExtType); SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi, - MMO, MGT->getIndexType()); + MMO, MGT->getIndexType(), ExtType); // Build a factor node to remember that this load is independent of the // other one. @@ -2392,6 +2393,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue Mask = MGT->getMask(); SDValue PassThru = MGT->getPassThru(); Align Alignment = MGT->getOriginalAlign(); + ISD::LoadExtType ExtType = MGT->getExtensionType(); SDValue MaskLo, MaskHi; if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector) @@ -2423,11 +2425,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, - OpsLo, MMO, MGT->getIndexType()); + OpsLo, MMO, MGT->getIndexType(), ExtType); SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale}; SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, - OpsHi, MMO, MGT->getIndexType()); + OpsHi, MMO, MGT->getIndexType(), ExtType); // Build a factor node to remember that this load is independent of the // other one. 
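The extension-type plumbing in these legalizer hunks, together with the new DAGCombiner folds and the AArch64 GLD1S opcodes further down, targets input of roughly the following shape: a masked gather of narrow elements followed by an extend, which can now become a single extending gather (for example SVE ld1b/ld1sb) rather than a gather plus a separate widening operation. This is only an illustrative sketch with invented names, not one of the patch's tests.

    ; Gather i8 elements and zero-extend them to i32. After type legalization
    ; the zext typically shows up as an 'and' with a 0xFF splat on the gather
    ; result, which the new visitAND combine can fold into a zero-extending
    ; masked gather.
    define <vscale x 4 x i32> @gather_zext(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) {
      %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
      %ext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
      ret <vscale x 4 x i32> %ext
    }

    declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8.nxv4p0i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)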
@@ -3928,7 +3930,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) { Scale }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other), N->getMemoryVT(), dl, Ops, - N->getMemOperand(), N->getIndexType()); + N->getMemOperand(), N->getIndexType(), + N->getExtensionType()); // Legalize the chain result - switch anything that used the old chain to // use the new one. @@ -4722,7 +4725,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) { SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index, Scale}; SDValue Res = DAG.getMaskedGather(MG->getVTList(), MG->getMemoryVT(), dl, Ops, - MG->getMemOperand(), MG->getIndexType()); + MG->getMemOperand(), MG->getIndexType(), + MG->getExtensionType()); ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); ReplaceValueWith(SDValue(N, 0), Res.getValue(0)); return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index dd837d4d495fcb..4661b0d9189b16 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7295,14 +7295,15 @@ SDValue SelectionDAG::getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl, SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, - ISD::MemIndexType IndexType) { + ISD::MemIndexType IndexType, + ISD::LoadExtType ExtTy) { assert(Ops.size() == 6 && "Incompatible number of operands"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, VT, MMO, IndexType)); + dl.getIROrder(), VTs, VT, MMO, IndexType, ExtTy)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { @@ -7312,7 +7313,7 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), - VTs, VT, MMO, IndexType); + VTs, VT, MMO, IndexType, ExtTy); createOperands(N, Ops); assert(N->getPassThru().getValueType() == N->getValueType(0) && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index fbbe54a1d34ada..170f3f2c4e1395 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4421,7 +4421,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { } SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale }; SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl, - Ops, MMO, IndexType); + Ops, MMO, IndexType, ISD::NON_EXTLOAD); PendingLoads.push_back(Gather.getValue(1)); setValue(&I, Gather); @@ -6932,6 +6932,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, SetCC)); return; } + case Intrinsic::experimental_vector_insert: { + auto DL = getCurSDLoc(); + + SDValue Vec = getValue(I.getOperand(0)); + SDValue SubVec = getValue(I.getOperand(1)); + SDValue Index = getValue(I.getOperand(2)); + EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ResultVT, Vec, SubVec, + Index)); + return; + } + case Intrinsic::experimental_vector_extract: { + auto DL = getCurSDLoc(); + + SDValue Vec = getValue(I.getOperand(0)); + SDValue Index = getValue(I.getOperand(1)); + EVT 
ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index)); + return; + } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 82f102f27a089f..82b4de3d5449b0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -743,6 +743,25 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (MSt->isCompressingStore()) OS << ", compressing"; + OS << ">"; + } else if (const auto *MGather = dyn_cast(this)) { + OS << "<"; + printMemOperand(OS, *MGather->getMemOperand(), G); + + bool doExt = true; + switch (MGather->getExtensionType()) { + default: doExt = false; break; + case ISD::EXTLOAD: OS << ", anyext"; break; + case ISD::SEXTLOAD: OS << ", sext"; break; + case ISD::ZEXTLOAD: OS << ", zext"; break; + } + if (doExt) + OS << " from " << MGather->getMemoryVT().getEVTString(); + + auto Signed = MGather->isIndexSigned() ? "signed" : "unsigned"; + auto Scaled = MGather->isIndexScaled() ? "scaled" : "unscaled"; + OS << ", " << Signed << " " << Scaled << " offset"; + OS << ">"; } else if (const auto *MScatter = dyn_cast(this)) { OS << "<"; diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index c0e4451f52003c..f936f5756b6f0b 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -80,6 +80,17 @@ static CallInst *createCallHelper(Function *Callee, ArrayRef Ops, return CI; } +Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) { + Module *M = GetInsertBlock()->getParent()->getParent(); + assert(isa(Scaling) && "Expected constant integer"); + Function *TheFn = + Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()}); + CallInst *CI = createCallHelper(TheFn, {}, this, Name); + return cast(Scaling)->getSExtValue() == 1 + ? 
CI + : CreateMul(CI, Scaling); +} + CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, MaybeAlign Align, bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag, diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index bc24d488d2f785..e3a2828be41aa8 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5138,6 +5138,26 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } + case Intrinsic::experimental_vector_insert: { + VectorType *VecTy = cast(Call.getArgOperand(0)->getType()); + VectorType *SubVecTy = cast(Call.getArgOperand(1)->getType()); + + Assert(VecTy->getElementType() == SubVecTy->getElementType(), + "experimental_vector_insert parameters must have the same element " + "type.", + &Call); + break; + } + case Intrinsic::experimental_vector_extract: { + VectorType *ResultTy = cast(Call.getType()); + VectorType *VecTy = cast(Call.getArgOperand(0)->getType()); + + Assert(ResultTy->getElementType() == VecTy->getElementType(), + "experimental_vector_extract result must have the same element " + "type as the input vector.", + &Call); + break; + } }; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 20f5ded9935085..5d9c66e170eab7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3836,6 +3836,26 @@ unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { return AddrModes.find(Key)->second; } +unsigned getSignExtendedGatherOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("unimplemented opcode"); + return Opcode; + case AArch64ISD::GLD1_MERGE_ZERO: + return AArch64ISD::GLD1S_MERGE_ZERO; + case AArch64ISD::GLD1_UXTW_MERGE_ZERO: + return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; + case AArch64ISD::GLD1_SXTW_MERGE_ZERO: + return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; + case AArch64ISD::GLD1_SCALED_MERGE_ZERO: + return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; + case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: + return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; + case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: + return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; + } +} + bool getGatherScatterIndexIsExtended(SDValue Index) { unsigned Opcode = Index.getOpcode(); if (Opcode == ISD::SIGN_EXTEND_INREG) @@ -3865,6 +3885,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, SDValue PassThru = MGT->getPassThru(); SDValue Mask = MGT->getMask(); SDValue BasePtr = MGT->getBasePtr(); + ISD::LoadExtType ExtTy = MGT->getExtensionType(); ISD::MemIndexType IndexType = MGT->getIndexType(); bool IsScaled = @@ -3874,6 +3895,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, bool IdxNeedsExtend = getGatherScatterIndexIsExtended(Index) || Index.getSimpleValueType().getVectorElementType() == MVT::i32; + bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD; EVT VT = PassThru.getSimpleValueType(); EVT MemVT = MGT->getMemoryVT(); @@ -3900,9 +3922,12 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, if (getGatherScatterIndexIsExtended(Index)) Index = Index.getOperand(0); + unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend); + if (ResNeedsSignExtend) + Opcode = getSignExtendedGatherOpcode(Opcode); + SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru}; - return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL, - VTs, Ops); + return DAG.getNode(Opcode, DL, VTs, Ops); } SDValue 
AArch64TargetLowering::LowerMSCATTER(SDValue Op, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e437b9291148d0..135b13b34f3329 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -47438,7 +47438,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL, Ops, Gather->getMemOperand(), - Gather->getIndexType()); + Gather->getIndexType(), + Gather->getExtensionType()); } auto *Scatter = cast(GorS); SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 3b0b12d4d7424f..22e4fd7d995589 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1652,6 +1652,102 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::experimental_vector_insert: { + Value *Vec = II->getArgOperand(0); + Value *SubVec = II->getArgOperand(1); + Value *Idx = II->getArgOperand(2); + auto *DstTy = dyn_cast(II->getType()); + auto *VecTy = dyn_cast(Vec->getType()); + auto *SubVecTy = dyn_cast(SubVec->getType()); + + // Only canonicalize if the destination vector, Vec, and SubVec are all + // fixed vectors. + if (DstTy && VecTy && SubVecTy) { + unsigned DstNumElts = DstTy->getNumElements(); + unsigned VecNumElts = VecTy->getNumElements(); + unsigned SubVecNumElts = SubVecTy->getNumElements(); + unsigned IdxN = cast(Idx)->getZExtValue(); + + // The result of this call is undefined if IdxN is not a constant multiple + // of the SubVec's minimum vector length OR the insertion overruns Vec. + if (IdxN % SubVecNumElts != 0 || IdxN + SubVecNumElts > VecNumElts) { + replaceInstUsesWith(CI, UndefValue::get(CI.getType())); + return eraseInstFromFunction(CI); + } + + // An insert that entirely overwrites Vec with SubVec is a nop. + if (VecNumElts == SubVecNumElts) { + replaceInstUsesWith(CI, SubVec); + return eraseInstFromFunction(CI); + } + + // Widen SubVec into a vector of the same width as Vec, since + // shufflevector requires the two input vectors to be the same width. + // Elements beyond the bounds of SubVec within the widened vector are + // undefined. + SmallVector WidenMask; + unsigned i; + for (i = 0; i != SubVecNumElts; ++i) + WidenMask.push_back(i); + for (; i != VecNumElts; ++i) + WidenMask.push_back(UndefMaskElem); + + Value *WidenShuffle = Builder.CreateShuffleVector( + SubVec, llvm::UndefValue::get(SubVecTy), WidenMask); + + SmallVector Mask; + for (unsigned i = 0; i != IdxN; ++i) + Mask.push_back(i); + for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i) + Mask.push_back(i); + for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i) + Mask.push_back(i); + + Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); + replaceInstUsesWith(CI, Shuffle); + return eraseInstFromFunction(CI); + } + break; + } + case Intrinsic::experimental_vector_extract: { + Value *Vec = II->getArgOperand(0); + Value *Idx = II->getArgOperand(1); + + auto *DstTy = dyn_cast(II->getType()); + auto *VecTy = dyn_cast(Vec->getType()); + + // Only canonicalize if the the destination vector and Vec are fixed + // vectors. 
+ if (DstTy && VecTy) { + unsigned DstNumElts = DstTy->getNumElements(); + unsigned VecNumElts = VecTy->getNumElements(); + unsigned IdxN = cast(Idx)->getZExtValue(); + + // The result of this call is undefined if IdxN is not a constant multiple + // of the result type's minimum vector length OR the extraction overruns + // Vec. + if (IdxN % DstNumElts != 0 || IdxN + DstNumElts > VecNumElts) { + replaceInstUsesWith(CI, UndefValue::get(CI.getType())); + return eraseInstFromFunction(CI); + } + + // Extracting the entirety of Vec is a nop. + if (VecNumElts == DstNumElts) { + replaceInstUsesWith(CI, Vec); + return eraseInstFromFunction(CI); + } + + SmallVector Mask; + for (unsigned i = 0; i != DstNumElts; ++i) + Mask.push_back(IdxN + i); + + Value *Shuffle = + Builder.CreateShuffleVector(Vec, UndefValue::get(VecTy), Mask); + replaceInstUsesWith(CI, Shuffle); + return eraseInstFromFunction(CI); + } + break; + } default: { // Handle target specific intrinsics Optional V = targetInstCombineIntrinsic(*II); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6ba14e942ff804..a91fb988badf61 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -367,7 +367,6 @@ static Type *getMemInstValueType(Value *I) { /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type at the given vectorization factor. static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); // Determine if an array of VF elements of type Ty is "bitcast compatible" // with a vector. if (VF.isVector()) { @@ -1122,6 +1121,15 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, return R; } +/// Return a value for Step multiplied by VF. +static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) { + assert(isa(Step) && "Expected an integer step"); + Constant *StepVal = ConstantInt::get( + Step->getType(), + cast(Step)->getSExtValue() * VF.getKnownMinValue()); + return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal; +} + namespace llvm { void reportVectorizationFailure(const StringRef DebugMsg, @@ -1387,9 +1395,7 @@ class LoopVectorizationCostModel { /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); - assert(VF.isVector() && "Expected VF >=2"); - + assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) @@ -2280,8 +2286,6 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, const InductionDescriptor &ID) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF.isVector() && "VF should be greater than one"); - assert(!VF.isScalable() && - "the code below assumes a fixed number of elements at compile time"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2306,11 +2310,24 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Cost->isUniformAfterVectorization(cast(EntryVal), VF) ? 
1 : VF.getKnownMinValue(); + assert((!VF.isScalable() || Lanes == 1) && + "Should never scalarize a scalable vector"); // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - auto *StartIdx = getSignedIntOrFpConstant( - ScalarIVTy, VF.getKnownMinValue() * Part + Lane); + auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), + ScalarIVTy->getScalarSizeInBits()); + Value *StartIdx = + createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF); + if (ScalarIVTy->isFloatingPointTy()) + StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy); + StartIdx = addFastMathFlag(Builder.CreateBinOp( + AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane))); + // The step returned by `createStepForVF` is a runtime-evaluated value + // when VF is scalable. Otherwise, it should be folded into a Constant. + assert((VF.isScalable() || isa(StartIdx)) && + "Expected StartIdx to be folded to a constant when VF is not " + "scalable"); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); @@ -2353,10 +2370,11 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // is known to be uniform after vectorization, this corresponds to lane zero // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. - assert(!VF.isScalable() && "scalable vectors not yet supported."); unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "Scalable vectorization can't lead to any scalarized values."); auto *LastInst = cast( VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); @@ -2698,7 +2716,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( Type *ScalarDataTy = getMemInstValueType(Instr); - assert(!VF.isScalable() && "scalable vectors not yet supported."); auto *DataTy = VectorType::get(ScalarDataTy, VF); const Align Alignment = getLoadStoreAlignment(Instr); @@ -2731,6 +2748,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( InBounds = gep->isInBounds(); if (Reverse) { + assert(!VF.isScalable() && + "Reversing vectors is not yet supported for scalable vectors."); + // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. PartPtr = cast(Builder.CreateGEP( @@ -2742,8 +2762,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( if (isMaskRequired) // Reverse of a null all-one mask is a null mask. BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { - PartPtr = cast(Builder.CreateGEP( - ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue()))); + Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF); + PartPtr = cast( + Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); } @@ -2948,8 +2969,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { Type *Ty = TC->getType(); // This is where we can make the step a runtime constant. 
- assert(!VF.isScalable() && "scalable vectorization is not supported yet"); - Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF); + Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF); // If the tail is to be folded by masking, round the number of iterations N // up to a multiple of Step instead of rounding down. This is done by first @@ -2960,6 +2980,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { if (Cost->foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); + assert(!VF.isScalable() && + "Tail folding not yet supported for scalable vectors"); TC = Builder.CreateAdd( TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up"); } @@ -3038,11 +3060,9 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); if (!Cost->foldTailByMasking()) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); - CheckMinIters = Builder.CreateICmp( - P, Count, - ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF), - "min.iters.check"); + Value *Step = + createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF); + CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check"); } // Create new preheader for vector loop. LoopVectorPreHeader = @@ -3521,8 +3541,8 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Value *StartIdx = ConstantInt::get(IdxTy, 0); // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). - assert(!VF.isScalable() && "scalable vectors not yet supported."); - Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); + Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); + Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF); Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -3902,8 +3922,10 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // profile is not inherently precise anyway. Note also possible bypass of // vector code caused by legality checks is ignored, assigning all the weight // to the vector loop, optimistically. - assert(!VF.isScalable() && - "cannot use scalable ElementCount to determine unroll factor"); + // + // For scalable vectorization we can't know at compile time how many iterations + // of the loop are handled in one vector iteration, so instead assume a pessimistic + // vscale of '1'. setProfileInfoAfterUnrolling( LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody), LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF); @@ -4366,7 +4388,6 @@ void InnerLoopVectorizer::clearReductionWrapFlags( } void InnerLoopVectorizer::fixLCSSAPHIs() { - assert(!VF.isScalable() && "the code below assumes fixed width vectors"); for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { auto *IncomingValue = LCSSAPhi.getIncomingValue(0); @@ -4377,6 +4398,8 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { cast(IncomingValue), VF) ? 0 : VF.getKnownMinValue() - 1; + assert((!VF.isScalable() || LastLane == 0) && + "scalable vectors dont support non-uniform scalars yet"); // Can be a loop invariant incoming value or the last scalar value to be // extracted from the vectorized loop. 
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); @@ -4709,7 +4732,6 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VPUser &User, VPTransformState &State) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); switch (I.getOpcode()) { case Instruction::Call: case Instruction::Br: @@ -4797,7 +4819,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, setDebugLocFromInst(Builder, CI); /// Vectorize casts. - assert(!VF.isScalable() && "VF is assumed to be non scalable."); Type *DestTy = (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -5099,7 +5120,6 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); if (!blockNeedsPredication(I->getParent())) return false; switch(I->getOpcode()) { @@ -5532,7 +5552,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, ElementCount UserVF) { - assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5545,6 +5564,11 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits(); if (UserVF.isNonZero()) { + // For now, don't verify legality of scalable vectors. + // This will be addressed properly in https://reviews.llvm.org/D91718. + if (UserVF.isScalable()) + return UserVF; + // If legally unsafe, clamp the user vectorization factor to a safe value. unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType); if (UserVF.getFixedValue() <= MaxSafeVF) @@ -5633,6 +5657,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { + // FIXME: This can be fixed for scalable vectors later, because at this stage + // the LoopVectorizer will only consider vectorizing a loop with scalable + // vectors when the loop has a hint to enable vectorization for a given VF. assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); float Cost = expectedCost(ElementCount::getFixed(1)).first; @@ -5942,7 +5969,6 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, } // Clamp the interleave ranges to reasonable counts. - assert(!VF.isScalable() && "scalable vectors not yet supported."); unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); @@ -5958,6 +5984,13 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // If trip count is known or estimated compile time constant, limit the // interleave count to be less than the trip count divided by VF, provided it // is at least 1. + // + // For scalable vectors we can't know if interleaving is beneficial. It may + // not be beneficial for small loops if none of the lanes in the second vector + // iterations is enabled. However, for larger loops, there is likely to be a + // similar benefit as for fixed-width vectors. 
For now, we choose to leave + // the InterleaveCount as if vscale is '1', although if some information about + // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { MaxInterleaveCount = std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); @@ -6001,7 +6034,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, // potentially expose ILP opportunities. LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF.getKnownMinValue() << '\n'); + << "LV: VF is " << VF << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) { @@ -6420,7 +6453,6 @@ int LoopVectorizationCostModel::computePredInstDiscount( LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); VectorizationCostTy Cost; // For each block. @@ -6669,8 +6701,6 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF) { - assert(!VF.isScalable() && - "the cost model is not yet implemented for scalable vectorization"); // If we know that this instruction will remain uniform, check the cost of // the scalar version. if (isUniformAfterVectorization(I, VF)) @@ -6734,7 +6764,6 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); if (VF.isScalar()) return; NumPredStores = 0; @@ -7321,7 +7350,6 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { Optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { - assert(!UserVF.isScalable() && "scalable vectorization not yet handled"); assert(OrigLoop->isInnermost() && "Inner loop expected."); Optional MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. @@ -7344,9 +7372,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { ElementCount MaxVF = MaybeMaxVF.getValue(); assert(MaxVF.isNonZero() && "MaxVF is zero."); - if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) { + if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); - assert(isPowerOf2_32(UserVF.getFixedValue()) && + assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. @@ -7357,6 +7385,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { return {{UserVF, 0}}; } + assert(!MaxVF.isScalable() && + "Scalable vectors not yet supported beyond this point"); + for (ElementCount VF = ElementCount::getFixed(1); ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { // Collect Uniform and Scalar instructions after vectorization with VF. 
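To make the vectorizer changes more concrete: where the loop skeleton previously used the compile-time constant VF * UF as its step, createStepForVF now routes that value through IRBuilder::CreateVScale whenever VF is scalable, so the step becomes a runtime quantity. A minimal sketch of the IR this produces, assuming VF = vscale x 4 and UF = 2 (names invented for illustration):

    ; createStepForVF(Builder, ConstantInt::get(Ty, 2 /*UF*/), VF) with
    ; VF = vscale x 4 yields "vscale * 8" instead of the old constant 8.
    define i64 @step_for_vf_sketch() {
      %vscale = call i64 @llvm.vscale.i64()
      %step = mul i64 %vscale, 8
      ret i64 %step
    }

    declare i64 @llvm.vscale.i64()

The same runtime step feeds the min.iters.check comparison, the primary induction increment in the vector loop, and the per-part pointer GEPs in vectorizeMemoryInstruction.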
@@ -7935,7 +7966,6 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, "Must be called with either a load or store"); auto willWiden = [&](ElementCount VF) -> bool { - assert(!VF.isScalable() && "unexpected scalable ElementCount"); if (VF.isScalar()) return false; LoopVectorizationCostModel::InstWidening Decision = @@ -8701,6 +8731,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { void VPReplicateRecipe::execute(VPTransformState &State) { if (State.Instance) { // Generate a single instance. + assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, *State.Instance, IsPredicated, State); // Insert scalar instance packing it into a vector. @@ -8723,6 +8754,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) { // instruction is uniform inwhich case generate only the first lane for each // of the UF parts. unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue(); + assert((!State.VF.isScalable() || IsUniform) && + "Can't scalarize a scalable vector"); for (unsigned Part = 0; Part < State.UF; ++Part) for (unsigned Lane = 0; Lane < EndLane; ++Lane) State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane}, @@ -8876,12 +8909,6 @@ static bool processLoopInVPlanNativePath( // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); - if (UserVF.isScalable()) { - // TODO: Use scalable UserVF once we've added initial support for scalable - // vectorization. For now we convert it to fixed width, but this will be - // removed in a later patch. - UserVF = ElementCount::getFixed(UserVF.getKnownMinValue()); - } // Plan how to best vectorize, return the best VF and its cost. const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); @@ -9047,13 +9074,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); - if (UserVF.isScalable()) { - // TODO: Use scalable UserVF once we've added initial support for scalable - // vectorization. For now we convert it to fixed width, but this will be - // removed in a later patch. - UserVF = ElementCount::getFixed(UserVF.getKnownMinValue()); - } - unsigned UserIC = Hints.getInterleave(); // Plan how to best vectorize, return the best VF and its cost. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 1dd81fa30af34e..d24f19e6bffa94 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -163,7 +163,6 @@ struct VectorizerValueMap { assert(Instance.Part < UF && "Queried Scalar Part is too large."); assert(Instance.Lane < VF.getKnownMinValue() && "Queried Scalar Lane is too large."); - assert(!VF.isScalable() && "VF is assumed to be non scalable."); if (!hasAnyScalarValue(Key)) return false; diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll new file mode 100644 index 00000000000000..c9a2223946e65f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-extract-vector.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; Should codegen to a nop, since idx is zero. 
+define <2 x i64> @extract_v2i64_nxv2i64( %vec) nounwind { +; CHECK-LABEL: extract_v2i64_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64( %vec, i64 0) + ret <2 x i64> %retval +} + +; Goes through memory currently; idx != 0. +define <2 x i64> @extract_v2i64_nxv2i64_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v2i64_nxv2i64_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +%retval = call <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64( %vec, i64 1) +ret <2 x i64> %retval +} + +; Should codegen to a nop, since idx is zero. +define <4 x i32> @extract_v4i32_nxv4i32( %vec) nounwind { +; CHECK-LABEL: extract_v4i32_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +%retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 0) +ret <4 x i32> %retval +} + +; Goes through memory currently; idx != 0. +define <4 x i32> @extract_v4i32_nxv4i32_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v4i32_nxv4i32_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 1) + ret <4 x i32> %retval +} + +; Should codegen to a nop, since idx is zero. +define <8 x i16> @extract_v8i16_nxv8i16( %vec) nounwind { +; CHECK-LABEL: extract_v8i16_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %retval = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16( %vec, i64 0) + ret <8 x i16> %retval +} + +; Goes through memory currently; idx != 0. +define <8 x i16> @extract_v8i16_nxv8i16_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v8i16_nxv8i16_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16( %vec, i64 1) + ret <8 x i16> %retval +} + +; Should codegen to a nop, since idx is zero. 
+define <16 x i8> @extract_v16i8_nxv16i8( %vec) nounwind { +; CHECK-LABEL: extract_v16i8_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret + %retval = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8( %vec, i64 0) + ret <16 x i8> %retval +} + +; Goes through memory currently; idx != 0. +define <16 x i8> @extract_v16i8_nxv16i8_idx1( %vec) nounwind { +; CHECK-LABEL: extract_v16i8_nxv16i8_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: ldr q0, [x9, x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8( %vec, i64 1) + ret <16 x i8> %retval +} + +declare <2 x i64> @llvm.experimental.vector.extract.v2i64.nxv2i64(, i64) +declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32(, i64) +declare <8 x i16> @llvm.experimental.vector.extract.v8i16.nxv8i16(, i64) +declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv16i8(, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll new file mode 100644 index 00000000000000..c1de878a32ac9b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -0,0 +1,184 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: FileCheck --check-prefix=WARN --allow-empty %s < %t +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +define @insert_v2i64_nxv2i64( %vec, <2 x i64> %subvec) nounwind { +; CHECK-LABEL: insert_v2i64_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv2i64.v2i64( %vec, <2 x i64> %subvec, i64 0) + ret %retval +} + +define @insert_v2i64_nxv2i64_idx1( %vec, <2 x i64> %subvec) nounwind { +; CHECK-LABEL: insert_v2i64_nxv2i64_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv2i64.v2i64( %vec, <2 x i64> %subvec, i64 1) + ret %retval +} + +define @insert_v4i32_nxv4i32( %vec, <4 x i32> %subvec) nounwind { +; CHECK-LABEL: insert_v4i32_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 0) + ret %retval +} + +define @insert_v4i32_nxv4i32_idx1( %vec, <4 x i32> %subvec) nounwind { +; CHECK-LABEL: insert_v4i32_nxv4i32_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 1) + ret %retval +} + +define @insert_v8i16_nxv8i16( %vec, <8 x i16> %subvec) nounwind { +; CHECK-LABEL: insert_v8i16_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv8i16.v8i16( %vec, <8 x i16> %subvec, i64 0) + ret %retval +} + +define @insert_v8i16_nxv8i16_idx1( %vec, <8 x i16> %subvec) nounwind { +; CHECK-LABEL: insert_v8i16_nxv8i16_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: lsl x8, x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv8i16.v8i16( %vec, <8 x i16> %subvec, i64 1) + ret %retval +} + +define @insert_v16i8_nxv16i8( %vec, <16 x i8> %subvec) nounwind { +; CHECK-LABEL: insert_v16i8_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #0 // =0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: csel x8, x8, xzr, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv16i8.v16i8( %vec, <16 x i8> %subvec, i64 0) + ret %retval +} + +define @insert_v16i8_nxv16i8_idx1( %vec, <16 x i8> %subvec) nounwind { +; CHECK-LABEL: insert_v16i8_nxv16i8_idx1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: sub x8, x8, #1 // =1 +; CHECK-NEXT: cmp x8, #1 // =1 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: csinc x8, x8, xzr, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] +; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %retval = call @llvm.experimental.vector.insert.nxv16i8.v16i8( %vec, <16 x i8> %subvec, i64 1) + ret %retval +} + +declare @llvm.experimental.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) +declare @llvm.experimental.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.experimental.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.experimental.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll index 32dca0d26cdc07..e6b89b0070d6de 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; unscaled unpacked 32-bit offsets @@ -9,7 +10,6 @@ define @masked_gather_nxv2i16(i16* %base, ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, %offsets %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) @@ -21,7 +21,6 @@ define @masked_gather_nxv2i32(i32* %base, ; 
CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %ptrs = getelementptr i32, i32* %base, %offsets %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) @@ -72,9 +71,7 @@ define @masked_gather_nxv2f64(double* %base, @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1] ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, %offsets %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) @@ -85,9 +82,7 @@ define @masked_sgather_nxv2i16(i16* %base, define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] ; CHECK-NEXT: ret %ptrs = getelementptr i32, i32* %base, %offsets %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) @@ -103,7 +98,6 @@ define @masked_gather_nxv4i16(i16* %base, ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] -; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, %offsets %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) @@ -144,9 +138,7 @@ define @masked_gather_nxv4f32(float* %base, @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1] ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, %offsets %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll index 1fc048a3adf7e1..2d4ce50e8464fc 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; unscaled unpacked 32-bit offsets @@ -9,7 +10,6 @@ define @masked_gather_nxv2i8(i8* %base, %o ; CHECK-LABEL: masked_gather_nxv2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, %offsets %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) @@ -21,7 +21,6 @@ define @masked_gather_nxv2i16(i8* %base, % ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -34,7 +33,6 @@ define 
@masked_gather_nxv2i32(i8* %base, % ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -90,9 +88,7 @@ define @masked_gather_nxv2f64(i8* %base, @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtb z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, %offsets %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) @@ -103,9 +99,7 @@ define @masked_sgather_nxv2i8(i8* %base, % define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -117,9 +111,7 @@ define @masked_sgather_nxv2i16(i8* %base, define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -136,7 +128,6 @@ define @masked_gather_nxv4i8(i8* %base, %o ; CHECK-LABEL: masked_gather_nxv4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] -; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, %offsets %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) @@ -148,7 +139,6 @@ define @masked_gather_nxv4i16(i8* %base, % ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] -; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -193,9 +183,7 @@ define @masked_gather_nxv4f32(i8* %base, define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, %offsets %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef) @@ -206,9 +194,7 @@ define @masked_sgather_nxv4i8(i8* %base, % define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll index ada49b7fecbc66..41f1eb4e94d4c4 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; unscaled unpacked 32-bit offsets @@ -9,7 +10,6 @@ define @masked_gather_nxv2i16(i16* %base, ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] -; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i16, i16* %base, %offsets.zext @@ -22,7 +22,6 @@ define @masked_gather_nxv2i32(i32* %base, ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i32, i32* %base, %offsets.zext @@ -78,9 +77,7 @@ define @masked_gather_nxv2f64(double* %base, @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i16, i16* %base, %offsets.zext @@ -92,9 +89,7 @@ define @masked_sgather_nxv2i16(i16* %base, define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i32, i32* %base, %offsets.zext @@ -111,7 +106,6 @@ define @masked_gather_nxv4i16(i16* %base, ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] -; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i16, i16* %base, %offsets.zext @@ -156,9 +150,7 @@ define @masked_gather_nxv4f32(float* %base, @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i16, i16* %base, %offsets.zext diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll index 61b8e3e53e23c5..51ab73c14ac9e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; 
unscaled unpacked 32-bit offsets @@ -9,7 +10,6 @@ define @masked_gather_nxv2i8(i8* %base, %o ; CHECK-LABEL: masked_gather_nxv2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] -; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -22,7 +22,6 @@ define @masked_gather_nxv2i16(i8* %base, % ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] -; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -36,7 +35,6 @@ define @masked_gather_nxv2i32(i8* %base, % ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -97,9 +95,7 @@ define @masked_gather_nxv2f64(i8* %base, @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtb z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -111,9 +107,7 @@ define @masked_sgather_nxv2i8(i8* %base, % define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -126,9 +120,7 @@ define @masked_sgather_nxv2i16(i8* %base, define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -146,7 +138,6 @@ define @masked_gather_nxv4i8(i8* %base, %o ; CHECK-LABEL: masked_gather_nxv4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] -; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -159,7 +150,6 @@ define @masked_gather_nxv4i16(i8* %base, % ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] -; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -208,9 +198,7 @@ define @masked_gather_nxv4f32(i8* %base, define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -222,9 +210,7 @@ define @masked_sgather_nxv4i8(i8* %base, % define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: 
masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll index 197ed69ee52f55..15dfcc61316e10 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1] -; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, %offsets %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) @@ -17,7 +17,6 @@ define @masked_gather_nxv2i32(i32* %base, ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %ptrs = getelementptr i32, i32* %base, %offsets %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) @@ -68,9 +67,7 @@ define @masked_gather_nxv2f64(double* %base, @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1] ; CHECK-NEXT: ret %ptrs = getelementptr i16, i16* %base, %offsets %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef) @@ -81,9 +78,7 @@ define @masked_sgather_nxv2i16(i16* %base, define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2] ; CHECK-NEXT: ret %ptrs = getelementptr i32, i32* %base, %offsets %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll index 3f4f54c5d8393f..3320b88691eefd 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d] -; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, %offsets %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) 
@@ -17,7 +17,6 @@ define @masked_gather_nxv2i16(i8* %base, % ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] -; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -30,7 +29,6 @@ define @masked_gather_nxv2i32(i8* %base, % ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -86,9 +84,7 @@ define @masked_gather_nxv2f64(i8* %base, @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtb z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %ptrs = getelementptr i8, i8* %base, %offsets %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) @@ -99,9 +95,7 @@ define @masked_sgather_nxv2i8(i8* %base, % define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -113,9 +107,7 @@ define @masked_sgather_nxv2i16(i8* %base, define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll index 962ba079ca9ef1..076edc1fd86da4 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll @@ -1,5 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s + +; Test for multiple uses of the mgather where the s/zext should not be combined + +define @masked_sgather_sext(i8* %base, %offsets, %mask, %vals) { +; CHECK-LABEL: masked_sgather_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtb z2.d, p0/m, z0.d +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: sxtb z0.d, p0/m, z0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %data = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %data.sext = sext %data to + %add = add %data, %vals + %add.sext = sext %add to + %mul = mul %data.sext, %add.sext + ret %mul +} + +define @masked_sgather_zext(i8* %base, %offsets, %mask, %vals) { +; CHECK-LABEL: masked_sgather_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add z1.d, z0.d, z1.d 
+; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: and z1.d, z1.d, #0xff +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %ptrs = getelementptr i8, i8* %base, %offsets + %data = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef) + %data.zext = zext %data to + %add = add %data, %vals + %add.zext = zext %add to + %mul = mul %data.zext, %add.zext + ret %mul +} ; Tests that exercise various type legalisation scenarios for ISD::MGATHER. @@ -7,7 +48,7 @@ define @masked_gather_nxv2i32( %ptrs, %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK-DAG: mov x8, xzr -; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d] +; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d] ; CHECK: ret %data = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) ret %data @@ -41,8 +82,8 @@ define @masked_sgather_nxv4i8( %ptrs, @llvm.experimental.vector.extract.v10i32.v8i32(<8 x i32> %vec, i64 %idx) +declare <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 %idx) +declare <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 %idx) +declare <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 %idx) +declare <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.extract.v8i32.v8i32(<8 x i32> %vec, i64 %idx) + +; ============================================================================ ; +; Trivial cases +; ============================================================================ ; + +; Extracting the entirety of a vector is a nop. +define <8 x i32> @trivial_nop(<8 x i32> %vec) { +; CHECK-LABEL: @trivial_nop( +; CHECK-NEXT: ret <8 x i32> [[VEC:%.*]] +; + %1 = call <8 x i32> @llvm.experimental.vector.extract.v8i32.v8i32(<8 x i32> %vec, i64 0) + ret <8 x i32> %1 +} + +; ============================================================================ ; +; Valid canonicalizations +; ============================================================================ ; + +define <2 x i32> @valid_extraction_a(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_a( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 0) + ret <2 x i32> %1 +} + +define <2 x i32> @valid_extraction_b(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_b( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 2) + ret <2 x i32> %1 +} + +define <2 x i32> @valid_extraction_c(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_c( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 4) + ret <2 x i32> %1 +} + +define <2 x i32> @valid_extraction_d(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_d( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[TMP1]] +; + %1 = call <2 x i32> @llvm.experimental.vector.extract.v2i32.v4i32(<8 x i32> %vec, i64 6) + ret <2 x i32> %1 +} + +define <4 x i32> @valid_extraction_e(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_e( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x 
i32> undef, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 0) + ret <4 x i32> %1 +} + +define <4 x i32> @valid_extraction_f(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_f( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 4) + ret <4 x i32> %1 +} + +define <3 x i32> @valid_extraction_g(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_g( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <3 x i32> +; CHECK-NEXT: ret <3 x i32> [[TMP1]] +; + %1 = call <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 0) + ret <3 x i32> %1 +} + +define <3 x i32> @valid_extraction_h(<8 x i32> %vec) { +; CHECK-LABEL: @valid_extraction_h( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> undef, <3 x i32> +; CHECK-NEXT: ret <3 x i32> [[TMP1]] +; + %1 = call <3 x i32> @llvm.experimental.vector.extract.v3i32.v8i32(<8 x i32> %vec, i64 3) + ret <3 x i32> %1 +} + +; ============================================================================ ; +; Invalid canonicalizations +; ============================================================================ ; + +; Idx must be the be a constant multiple of the destination vector's length, +; otherwise the result is undefined. +define <4 x i32> @idx_not_constant_multiple(<8 x i32> %vec) { +; CHECK-LABEL: @idx_not_constant_multiple( +; CHECK-NEXT: ret <4 x i32> undef +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.v8i32(<8 x i32> %vec, i64 1) + ret <4 x i32> %1 +} + +; If the extraction overruns the vector, the result is undefined. +define <10 x i32> @extract_overrun(<8 x i32> %vec) { +; CHECK-LABEL: @extract_overrun( +; CHECK-NEXT: ret <10 x i32> undef +; + %1 = call <10 x i32> @llvm.experimental.vector.extract.v10i32.v8i32(<8 x i32> %vec, i64 0) + ret <10 x i32> %1 +} + +; ============================================================================ ; +; Scalable cases +; ============================================================================ ; + +; Scalable extractions should not be canonicalized. This will be lowered to the +; EXTRACT_SUBVECTOR ISD node later. +define <4 x i32> @scalable_extract( %vec) { +; CHECK-LABEL: @scalable_extract( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( [[VEC:%.*]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %1 = call <4 x i32> @llvm.experimental.vector.extract.v4i32.nxv4i32( %vec, i64 0) + ret <4 x i32> %1 +} diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll new file mode 100644 index 00000000000000..413d8155787f16 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; llvm.experimental.vector.insert canonicalizes to shufflevector in the fixed case. In the +; scalable case, we lower to the INSERT_SUBVECTOR ISD node. 
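+;
+; As a sketch of the canonical form checked below: inserting <2 x i32> %sub at
+; index 2 of <8 x i32> %vec first widens the subvector with undef lanes and then
+; blends it into the original vector, e.g.
+;   %widened = shufflevector <2 x i32> %sub, <2 x i32> undef,
+;                            <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+;   %result  = shufflevector <8 x i32> %vec, <8 x i32> %widened,
+;                            <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>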
+ +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 %idx) +declare @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 %idx) + +; ============================================================================ ; +; Trivial cases +; ============================================================================ ; + +; An insert that entirely overwrites an with another is a +; nop. +define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) { +; CHECK-LABEL: @trivial_nop( +; CHECK-NEXT: ret <8 x i32> [[SUBVEC:%.*]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +; ============================================================================ ; +; Valid canonicalizations +; ============================================================================ ; + +define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_a( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_b(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_b( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_c(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_c( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 4) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_d( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 6) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_e( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> 
@llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_f( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_g( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_h( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 3) + ret <8 x i32> %1 +} + +; ============================================================================ ; +; Invalid canonicalizations +; ============================================================================ ; + +; Idx must be the be a constant multiple of the subvector's minimum vector +; length, otherwise the result is undefined. +define <8 x i32> @idx_not_constant_multiple(<8 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @idx_not_constant_multiple( +; CHECK-NEXT: ret <8 x i32> undef +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 2) + ret <8 x i32> %1 +} + +; If the insertion overruns the vector, the result is undefined. +define <8 x i32> @insert_overrun(<8 x i32> %vec, <8 x i32> %subvec) { +; CHECK-LABEL: @insert_overrun( +; CHECK-NEXT: ret <8 x i32> undef +; + %1 = call <8 x i32> @llvm.experimental.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 4) + ret <8 x i32> %1 +} + +; ============================================================================ ; +; Scalable cases +; ============================================================================ ; + +; Scalable insertions should not be canonicalized. This will be lowered to the +; INSERT_SUBVECTOR ISD node later. 
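+; (A shufflevector mask is a compile-time constant with one entry per result
+; lane, so no mask can express blending a <4 x i32> into a <vscale x 4 x i32>
+; whose lane count is unknown until run time; the intrinsic call is kept as-is.)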
+define @scalable_insert( %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @scalable_insert( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v4i32( [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.experimental.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 0) + ret %1 +} diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll index db9c6c9d862ac0..e107de672c8be3 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata-width.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll @@ -13,8 +13,7 @@ entry: for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - %0 = trunc i64 %indvars.iv to i32 - store i32 %0, i32* %arrayidx, align 4 + store i32 42, i32* %arrayidx, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %n @@ -25,7 +24,7 @@ for.end: ; preds = %for.body, %entry } ; CHECK-LABEL: @test2( -; CHECK: store <8 x i32> +; CHECK: store ; CHECK: ret void define void @test2(i32* nocapture %a, i32 %n) #0 { entry: @@ -35,8 +34,7 @@ entry: for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - %0 = trunc i64 %indvars.iv to i32 - store i32 %0, i32* %arrayidx, align 4 + store i32 42, i32* %arrayidx, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %n @@ -57,8 +55,7 @@ entry: for.body: ; preds = %entry, %for.body %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv - %0 = trunc i64 %indvars.iv to i32 - store i32 %0, i32* %arrayidx, align 4 + store i32 42, i32* %arrayidx, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 %exitcond = icmp eq i32 %lftr.wideiv, %n diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll new file mode 100644 index 00000000000000..692b6e5c4ce179 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1 +; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2 + +; CHECKUF1: for.body.preheader: +; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64 +; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2 +; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count + +; CHECKUF1: vector.ph: +; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2 +; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]] +; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf + +; CHECKUF1: vector.body: +; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index +; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast 
double* %[[IDXB]] to * +; CHECKUF1: %wide.load = load , * %[[IDXB_CAST]], align 8, !alias.scope !0 +; CHECKUF1: %[[FADD:.*]] = fadd %wide.load, shufflevector ( insertelement ( undef, double 1.000000e+00, i32 0), undef, zeroinitializer) +; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index +; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to * +; CHECKUF1: store %[[FADD]], * %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0 +; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2 +; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]] +; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec +; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5 + + +; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2). +; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4. + +; CHECKUF2: for.body.preheader: +; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64 +; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3 +; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count + +; CHECKUF2: vector.ph: +; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3 +; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]] +; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf + +; CHECKUF2: vector.body: +; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index +; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to * +; CHECKUF2: %wide.load = load , * %[[IDXB_CAST]], align 8, !alias.scope !0 +; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64 +; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]] +; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to * +; CHECKUF2: %wide.load{{[0-9]+}} = load , * %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0 +; CHECKUF2: %[[FADD:.*]] = fadd %wide.load, shufflevector ( insertelement ( undef, double 1.000000e+00, i32 0), undef, zeroinitializer) +; CHECKUF2: %[[FADD_NEXT:.*]] = fadd %wide.load{{[0-9]+}}, shufflevector ( insertelement ( undef, double 1.000000e+00, i32 0), undef, zeroinitializer) +; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index +; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to * +; CHECKUF2: store %[[FADD]], * %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0 +; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() +; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2 +; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64 +; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]] +; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to * +; CHECKUF2: store %[[FADD_NEXT]], * %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0 +; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() +; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3 +; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]] +; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, 
%n.vec +; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5 + +define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) { +entry: + %cmp7 = icmp sgt i32 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 8 + %add = fadd double %0, 1.000000e+00 + %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv + store double %add, double* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1 +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} diff --git a/llvm/test/Verifier/extract-vector-mismatched-element-types.ll b/llvm/test/Verifier/extract-vector-mismatched-element-types.ll new file mode 100644 index 00000000000000..b8a10854938646 --- /dev/null +++ b/llvm/test/Verifier/extract-vector-mismatched-element-types.ll @@ -0,0 +1,9 @@ +; RUN: not opt -verify -S < %s 2>&1 >/dev/null | FileCheck %s + +; CHECK: experimental_vector_extract result must have the same element type as the input vector. +define <16 x i16> @invalid_mismatched_element_types( %vec) nounwind { + %retval = call <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv16i8( %vec, i64 0) + ret <16 x i16> %retval +} + +declare <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv16i8(, i64) diff --git a/llvm/test/Verifier/insert-vector-mismatched-element-types.ll b/llvm/test/Verifier/insert-vector-mismatched-element-types.ll new file mode 100644 index 00000000000000..84376f5a3ec752 --- /dev/null +++ b/llvm/test/Verifier/insert-vector-mismatched-element-types.ll @@ -0,0 +1,9 @@ +; RUN: not opt -verify -S < %s 2>&1 >/dev/null | FileCheck %s + +; CHECK: experimental_vector_insert parameters must have the same element type. +define @invalid_mismatched_element_types( %vec, <4 x i16> %subvec) nounwind { + %retval = call @llvm.experimental.vector.insert.nxv16i8.v4i16( %vec, <4 x i16> %subvec, i64 0) + ret %retval +} + +declare @llvm.experimental.vector.insert.nxv16i8.v4i16(, <4 x i16>, i64) diff --git a/mlir/examples/toy/Ch2/mlir/Dialect.cpp b/mlir/examples/toy/Ch2/mlir/Dialect.cpp index 0e715c9aafa4cc..e4391e3b6fa909 100644 --- a/mlir/examples/toy/Ch2/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch2/mlir/Dialect.cpp @@ -191,7 +191,7 @@ void MulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, static mlir::LogicalResult verify(ReturnOp op) { // We know that the parent operation is a function, because of the 'HasParent' // trait attached to the operation definition. - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); /// ReturnOps can only have a single optional operand. 
if (op.getNumOperands() > 1) diff --git a/mlir/examples/toy/Ch3/mlir/Dialect.cpp b/mlir/examples/toy/Ch3/mlir/Dialect.cpp index 0e715c9aafa4cc..e4391e3b6fa909 100644 --- a/mlir/examples/toy/Ch3/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch3/mlir/Dialect.cpp @@ -191,7 +191,7 @@ void MulOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, static mlir::LogicalResult verify(ReturnOp op) { // We know that the parent operation is a function, because of the 'HasParent' // trait attached to the operation definition. - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); /// ReturnOps can only have a single optional operand. if (op.getNumOperands() > 1) diff --git a/mlir/examples/toy/Ch4/mlir/Dialect.cpp b/mlir/examples/toy/Ch4/mlir/Dialect.cpp index 09875f7bdac198..0a3ec29b570740 100644 --- a/mlir/examples/toy/Ch4/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch4/mlir/Dialect.cpp @@ -246,7 +246,7 @@ void GenericCallOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, /// Return the callee of the generic call operation, this is required by the /// call interface. CallInterfaceCallable GenericCallOp::getCallableForCallee() { - return getAttrOfType("callee"); + return (*this)->getAttrOfType("callee"); } /// Get the argument operands to the called function, this is required by the @@ -272,7 +272,7 @@ void MulOp::inferShapes() { getResult().setType(getOperand(0).getType()); } static mlir::LogicalResult verify(ReturnOp op) { // We know that the parent operation is a function, because of the 'HasParent' // trait attached to the operation definition. - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); /// ReturnOps can only have a single optional operand. if (op.getNumOperands() > 1) diff --git a/mlir/examples/toy/Ch5/mlir/Dialect.cpp b/mlir/examples/toy/Ch5/mlir/Dialect.cpp index 045673e67f7961..d4356f61f83bea 100644 --- a/mlir/examples/toy/Ch5/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch5/mlir/Dialect.cpp @@ -246,7 +246,7 @@ void GenericCallOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, /// Return the callee of the generic call operation, this is required by the /// call interface. CallInterfaceCallable GenericCallOp::getCallableForCallee() { - return getAttrOfType("callee"); + return (*this)->getAttrOfType("callee"); } /// Get the argument operands to the called function, this is required by the @@ -272,7 +272,7 @@ void MulOp::inferShapes() { getResult().setType(getOperand(0).getType()); } static mlir::LogicalResult verify(ReturnOp op) { // We know that the parent operation is a function, because of the 'HasParent' // trait attached to the operation definition. - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); /// ReturnOps can only have a single optional operand. if (op.getNumOperands() > 1) diff --git a/mlir/examples/toy/Ch6/mlir/Dialect.cpp b/mlir/examples/toy/Ch6/mlir/Dialect.cpp index 045673e67f7961..d4356f61f83bea 100644 --- a/mlir/examples/toy/Ch6/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch6/mlir/Dialect.cpp @@ -246,7 +246,7 @@ void GenericCallOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, /// Return the callee of the generic call operation, this is required by the /// call interface. 
CallInterfaceCallable GenericCallOp::getCallableForCallee() { - return getAttrOfType("callee"); + return (*this)->getAttrOfType("callee"); } /// Get the argument operands to the called function, this is required by the @@ -272,7 +272,7 @@ void MulOp::inferShapes() { getResult().setType(getOperand(0).getType()); } static mlir::LogicalResult verify(ReturnOp op) { // We know that the parent operation is a function, because of the 'HasParent' // trait attached to the operation definition. - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); /// ReturnOps can only have a single optional operand. if (op.getNumOperands() > 1) diff --git a/mlir/examples/toy/Ch7/mlir/Dialect.cpp b/mlir/examples/toy/Ch7/mlir/Dialect.cpp index 032d7a61fb904f..5c87e5a886fbdc 100644 --- a/mlir/examples/toy/Ch7/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch7/mlir/Dialect.cpp @@ -298,7 +298,7 @@ void GenericCallOp::build(mlir::OpBuilder &builder, mlir::OperationState &state, /// Return the callee of the generic call operation, this is required by the /// call interface. CallInterfaceCallable GenericCallOp::getCallableForCallee() { - return getAttrOfType("callee"); + return (*this)->getAttrOfType("callee"); } /// Get the argument operands to the called function, this is required by the @@ -324,7 +324,7 @@ void MulOp::inferShapes() { getResult().setType(getOperand(0).getType()); } static mlir::LogicalResult verify(ReturnOp op) { // We know that the parent operation is a function, because of the 'HasParent' // trait attached to the operation definition. - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); /// ReturnOps can only have a single optional operand. if (op.getNumOperands() > 1) diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 33c00ca9b22c02..953a2d5c282c91 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -207,8 +207,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">, /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. /// it is intended to be launched from host. bool isKernel() { - return getAttrOfType(GPUDialect::getKernelFuncAttrName()) != - nullptr; + return (*this)->getAttrOfType( + GPUDialect::getKernelFuncAttrName()) != nullptr; } /// Change the type of this function in place. This is an extremely @@ -223,8 +223,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">, /// Returns the number of buffers located in the workgroup memory. 
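Editorial aside, not part of the patch: the rewrite repeated throughout these hunks relies on OpState::operator-> returning the underlying mlir::Operation*, so attribute queries go straight to Operation instead of through the deprecated forwarding helpers on OpState. A minimal sketch of the before/after spelling; OpTy and the "callee" attribute name are illustrative.

#include "mlir/IR/Operation.h"

template <typename OpTy>
mlir::FlatSymbolRefAttr getCalleeAttrSketch(OpTy op) {
  // Old, deprecated spelling (forwarder on OpState):
  //   op.getAttrOfType<mlir::FlatSymbolRefAttr>("callee")
  // New spelling, directly on the Operation; `template` is only required here
  // because OpTy is a template parameter, as in the AwaitOp lowering below.
  return op->template getAttrOfType<mlir::FlatSymbolRefAttr>("callee");
}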
unsigned getNumWorkgroupAttributions() { - return getAttrOfType(getNumWorkgroupAttributionsAttrName()) - .getInt(); + return (*this)->getAttrOfType( + getNumWorkgroupAttributionsAttrName()).getInt(); } /// Returns a list of block arguments that correspond to buffers located in diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index e4dee53560bd3d..807ea8826ef842 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -766,7 +766,7 @@ def LLVM_LLVMFuncOp Block *addEntryBlock(); LLVMType getType() { - return getAttrOfType(getTypeAttrName()) + return (*this)->getAttrOfType(getTypeAttrName()) .getValue().cast(); } bool isVarArg() { diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 96b111f7c50828..1f9b860eb52eb5 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -107,7 +107,7 @@ def NVVM_ShflBflyOp : let parser = [{ return parseNVVMShflSyncBflyOp(parser, result); }]; let printer = [{ printNVVMIntrinsicOp(p, this->getOperation()); }]; let verifier = [{ - if (!getAttrOfType("return_value_and_is_valid")) + if (!(*this)->getAttrOfType("return_value_and_is_valid")) return success(); auto type = getType().cast(); if (!type.isStructTy() || type.getStructNumElements() != 2 || diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h index e2b9470f19a785..7302bd486657cc 100644 --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h @@ -59,7 +59,9 @@ class ConstantFloatOp : public ConstantOp { static void build(OpBuilder &builder, OperationState &result, const APFloat &value, FloatType type); - APFloat getValue() { return getAttrOfType("value").getValue(); } + APFloat getValue() { + return (*this)->getAttrOfType("value").getValue(); + } static bool classof(Operation *op); }; @@ -81,7 +83,9 @@ class ConstantIntOp : public ConstantOp { static void build(OpBuilder &builder, OperationState &result, int64_t value, Type type); - int64_t getValue() { return getAttrOfType("value").getInt(); } + int64_t getValue() { + return (*this)->getAttrOfType("value").getInt(); + } static bool classof(Operation *op); }; @@ -98,7 +102,9 @@ class ConstantIndexOp : public ConstantOp { /// Build a constant int op producing an index. static void build(OpBuilder &builder, OperationState &result, int64_t value); - int64_t getValue() { return getAttrOfType("value").getInt(); } + int64_t getValue() { + return (*this)->getAttrOfType("value").getInt(); + } static bool classof(Operation *op); }; @@ -159,8 +165,8 @@ class DmaStartOp } // Returns the source memref indices for this DMA operation. operand_range getSrcIndices() { - return {getOperation()->operand_begin() + 1, - getOperation()->operand_begin() + 1 + getSrcMemRefRank()}; + return {(*this)->operand_begin() + 1, + (*this)->operand_begin() + 1 + getSrcMemRefRank()}; } // Returns the destination MemRefType for this DMA operations. @@ -178,8 +184,8 @@ class DmaStartOp // Returns the destination memref indices for this DMA operation. 
operand_range getDstIndices() { - return {getOperation()->operand_begin() + 1 + getSrcMemRefRank() + 1, - getOperation()->operand_begin() + 1 + getSrcMemRefRank() + 1 + + return {(*this)->operand_begin() + 1 + getSrcMemRefRank() + 1, + (*this)->operand_begin() + 1 + getSrcMemRefRank() + 1 + getDstMemRefRank()}; } @@ -201,9 +207,8 @@ class DmaStartOp operand_range getTagIndices() { unsigned tagIndexStartPos = 1 + getSrcMemRefRank() + 1 + getDstMemRefRank() + 1 + 1; - return {getOperation()->operand_begin() + tagIndexStartPos, - getOperation()->operand_begin() + tagIndexStartPos + - getTagMemRefRank()}; + return {(*this)->operand_begin() + tagIndexStartPos, + (*this)->operand_begin() + tagIndexStartPos + getTagMemRefRank()}; } /// Returns true if this is a DMA from a faster memory space to a slower one. @@ -279,8 +284,8 @@ class DmaWaitOp // Returns the tag memref index for this DMA operation. operand_range getTagIndices() { - return {getOperation()->operand_begin() + 1, - getOperation()->operand_begin() + 1 + getTagMemRefRank()}; + return {(*this)->operand_begin() + 1, + (*this)->operand_begin() + 1 + getTagMemRefRank()}; } // Returns the rank (number of indices) of the tag memref. diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td index 441cff497ed26f..5368880a7cb134 100644 --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td @@ -820,7 +820,7 @@ def CallOp : Std_Op<"call", /// Return the callee of this operation. CallInterfaceCallable getCallableForCallee() { - return getAttrOfType("callee"); + return (*this)->getAttrOfType("callee"); } }]; @@ -1040,8 +1040,8 @@ def CmpFOp : Std_Op<"cmpf", static CmpFPredicate getPredicateByName(StringRef name); CmpFPredicate getPredicate() { - return (CmpFPredicate)getAttrOfType(getPredicateAttrName()) - .getInt(); + return (CmpFPredicate)(*this)->getAttrOfType( + getPredicateAttrName()).getInt(); } }]; @@ -1162,8 +1162,8 @@ def CmpIOp : Std_Op<"cmpi", static CmpIPredicate getPredicateByName(StringRef name); CmpIPredicate getPredicate() { - return (CmpIPredicate)getAttrOfType(getPredicateAttrName()) - .getInt(); + return (CmpIPredicate)(*this)->getAttrOfType( + getPredicateAttrName()).getInt(); } }]; diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td index 76b98c1f3f36ca..de77e3b034830f 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td @@ -1937,7 +1937,8 @@ def Vector_TupleGetOp : return getResult().getType().cast(); } int64_t getIndex() { - return getAttrOfType("index").getValue().getSExtValue(); + auto index = (*this)->getAttrOfType("index"); + return index.getValue().getSExtValue(); } static StringRef getIndexAttrName() { return "index"; } }]; diff --git a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp index 361bfa2b6fade4..75c23d37a53a54 100644 --- a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp +++ b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp @@ -466,7 +466,7 @@ static void addSuspensionPoint(CoroMachinery coro, Value coroState, // Note that this is not reversible transformation. 
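Editorial aside, not part of the patch: the DMA ops above carve ranges out of the flat operand list with iterator arithmetic on the underlying Operation. A generic sketch of that shape; the offsets are illustrative, whereas the real ops derive them from the memref ranks.

#include "mlir/IR/Operation.h"

// Equivalent in spirit to getSrcIndices/getDstIndices/getTagIndices, but with
// explicit offsets instead of rank-derived ones.
static mlir::Operation::operand_range
getOperandSliceSketch(mlir::Operation *op, unsigned first, unsigned count) {
  return {op->operand_begin() + first, op->operand_begin() + first + count};
}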
static std::pair outlineExecuteOp(SymbolTable &symbolTable, ExecuteOp execute) { - ModuleOp module = execute.getParentOfType(); + ModuleOp module = execute->getParentOfType(); MLIRContext *ctx = module.getContext(); Location loc = execute.getLoc(); @@ -727,7 +727,7 @@ class AwaitOpLoweringBase : public ConversionPattern { return failure(); // Check if await operation is inside the outlined coroutine function. - auto func = await.template getParentOfType(); + auto func = await->template getParentOfType(); auto outlined = outlinedFunctions.find(func); const bool isInCoroutine = outlined != outlinedFunctions.end(); diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp index 810511194f682b..3b4b39e57d557c 100644 --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -587,7 +587,8 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( launchOp, launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); - auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); + auto binaryAttr = + kernelModule->getAttrOfType(gpuBinaryAnnotation); if (!binaryAttr) { kernelModule.emitOpError() << "missing " << gpuBinaryAnnotation << " attribute"; diff --git a/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp b/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp index 16f30c3fe02617..355bced96ae750 100644 --- a/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp +++ b/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp @@ -186,7 +186,7 @@ void VulkanLaunchFuncToVulkanCallsPass::collectSPIRVAttributes( // Check that `kSPIRVBinary` and `kSPIRVEntryPoint` are present in attributes // for the given vulkan launch call. auto spirvBlobAttr = - vulkanLaunchCallOp.getAttrOfType(kSPIRVBlobAttrName); + vulkanLaunchCallOp->getAttrOfType(kSPIRVBlobAttrName); if (!spirvBlobAttr) { vulkanLaunchCallOp.emitError() << "missing " << kSPIRVBlobAttrName << " attribute"; @@ -194,7 +194,7 @@ void VulkanLaunchFuncToVulkanCallsPass::collectSPIRVAttributes( } auto spirvEntryPointNameAttr = - vulkanLaunchCallOp.getAttrOfType(kSPIRVEntryPointAttrName); + vulkanLaunchCallOp->getAttrOfType(kSPIRVEntryPointAttrName); if (!spirvEntryPointNameAttr) { vulkanLaunchCallOp.emitError() << "missing " << kSPIRVEntryPointAttrName << " attribute"; diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp index b7b4e7aab859c4..39b0d62c76453a 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp @@ -381,7 +381,7 @@ static LogicalResult processParallelLoop( // TODO: Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential ArrayAttr mapping = - parallelOp.getAttrOfType(gpu::getMappingAttrName()); + parallelOp->getAttrOfType(gpu::getMappingAttrName()); // TODO: Support reductions. 
if (!mapping || parallelOp.getNumResults() != 0) @@ -390,7 +390,7 @@ static LogicalResult processParallelLoop( Location loc = parallelOp.getLoc(); auto launchIndependent = [&launchOp](Value val) { - return val.getParentRegion()->isAncestor(launchOp.getParentRegion()); + return val.getParentRegion()->isAncestor(launchOp->getParentRegion()); }; auto ensureLaunchIndependent = [&rewriter, @@ -568,7 +568,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, PatternRewriter &rewriter) const { // We can only transform starting at the outer-most loop. Launches inside of // parallel loops are not supported. - if (auto parentLoop = parallelOp.getParentOfType()) + if (auto parentLoop = parallelOp->getParentOfType()) return failure(); // Create a launch operation. We start with bound one for all grid/block // sizes. Those will be refined later as we discover them from mappings. diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp index 906ee8bafbbb55..3adb02af15c604 100644 --- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp +++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp @@ -66,7 +66,7 @@ static void insertOpenMPParallel(FuncOp func) { SmallVector topLevelParallelOps; func.walk([&topLevelParallelOps](scf::ParallelOp parallelOp) { // Ignore ops that are already within OpenMP parallel construct. - if (!parallelOp.getParentOfType()) + if (!parallelOp->getParentOfType()) topLevelParallelOps.push_back(parallelOp); }); @@ -87,7 +87,7 @@ static LogicalResult applyPatterns(FuncOp func) { ConversionTarget target(*func.getContext()); target.addIllegalOp(); target.addDynamicallyLegalOp( - [](scf::YieldOp op) { return !isa(op.getParentOp()); }); + [](scf::YieldOp op) { return !isa(op->getParentOp()); }); target.addLegalDialect(); OwningRewritePatternList patterns; diff --git a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp index 1030f0dbd288b2..754e1937a7833c 100644 --- a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp +++ b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp @@ -269,11 +269,11 @@ LogicalResult TerminatorOpConversion::matchAndRewrite( // VariableOp created during lowering of the parent region. if (!operands.empty()) { auto loc = terminatorOp.getLoc(); - auto &allocas = scfToSPIRVContext->outputVars[terminatorOp.getParentOp()]; + auto &allocas = scfToSPIRVContext->outputVars[terminatorOp->getParentOp()]; assert(allocas.size() == operands.size()); for (unsigned i = 0, e = operands.size(); i < e; i++) rewriter.create(loc, allocas[i], operands[i]); - if (isa(terminatorOp.getParentOp())) { + if (isa(terminatorOp->getParentOp())) { // For loops we also need to update the branch jumping back to the header. auto br = cast(rewriter.getInsertionBlock()->getTerminator()); diff --git a/mlir/lib/Conversion/SPIRVToLLVM/ConvertLaunchFuncToLLVMCalls.cpp b/mlir/lib/Conversion/SPIRVToLLVM/ConvertLaunchFuncToLLVMCalls.cpp index f83f72d1d10ebe..b01c443ddc779f 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/ConvertLaunchFuncToLLVMCalls.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/ConvertLaunchFuncToLLVMCalls.cpp @@ -52,7 +52,7 @@ static std::string bindingName() { /// i -> (0, i) /// which is implemented under `LowerABIAttributesPass`. 
static unsigned calculateGlobalIndex(spirv::GlobalVariableOp op) { - IntegerAttr binding = op.getAttrOfType(bindingName()); + IntegerAttr binding = op->getAttrOfType(bindingName()); return binding.getInt(); } @@ -75,8 +75,8 @@ static std::string createGlobalVariableWithBindName(spirv::GlobalVariableOp op, StringRef kernelModuleName) { IntegerAttr descriptorSet = - op.getAttrOfType(descriptorSetName()); - IntegerAttr binding = op.getAttrOfType(bindingName()); + op->getAttrOfType(descriptorSetName()); + IntegerAttr binding = op->getAttrOfType(bindingName()); return llvm::formatv("{0}_{1}_descriptor_set{2}_binding{3}", kernelModuleName.str(), op.sym_name().str(), std::to_string(descriptorSet.getInt()), @@ -87,8 +87,8 @@ createGlobalVariableWithBindName(spirv::GlobalVariableOp op, /// and a binding number. static bool hasDescriptorSetAndBinding(spirv::GlobalVariableOp op) { IntegerAttr descriptorSet = - op.getAttrOfType(descriptorSetName()); - IntegerAttr binding = op.getAttrOfType(bindingName()); + op->getAttrOfType(descriptorSetName()); + IntegerAttr binding = op->getAttrOfType(bindingName()); return descriptorSet && binding; } @@ -155,7 +155,7 @@ class GPULaunchLowering : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { auto *op = launchOp.getOperation(); MLIRContext *context = rewriter.getContext(); - auto module = launchOp.getParentOfType(); + auto module = launchOp->getParentOfType(); // Get the SPIR-V module that represents the gpu kernel module. The module // is named: diff --git a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp index f81ae9e9c1d519..5309daec5e523e 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp @@ -647,7 +647,7 @@ class ExecutionModePattern // First, create the global struct's name that would be associated with // this entry point's execution mode. We set it to be: // __spv__{SPIR-V module name}_{function name}_execution_mode_info - ModuleOp module = op.getParentOfType(); + ModuleOp module = op->getParentOfType(); std::string moduleName; if (module.getName().hasValue()) moduleName = "_" + module.getName().getValue().str(); @@ -1530,8 +1530,9 @@ void mlir::encodeBindAttribute(ModuleOp module) { auto spvModules = module.getOps(); for (auto spvModule : spvModules) { spvModule.walk([&](spirv::GlobalVariableOp op) { - IntegerAttr descriptorSet = op.getAttrOfType(kDescriptorSet); - IntegerAttr binding = op.getAttrOfType(kBinding); + IntegerAttr descriptorSet = + op->getAttrOfType(kDescriptorSet); + IntegerAttr binding = op->getAttrOfType(kBinding); // For every global variable in the module, get the ones with descriptor // set and binding numbers. if (descriptorSet && binding) { diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp index 17a06546329700..2f6856dc275a76 100644 --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -1349,7 +1349,7 @@ struct FuncOpConversionBase : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const { // Convert the original function arguments. They are converted using the // LLVMTypeConverter provided to this legalization pattern. 
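Editorial aside, not part of the patch: several of the lowerings in this area first climb to the enclosing module and then resolve or declare a runtime function there. A minimal sketch of that lookup, reusing the "free" symbol that the dealloc lowering below queries; treat the helper and the include set (taken from this revision) as illustrative.

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Module.h"

static mlir::LLVM::LLVMFuncOp lookupFreeDeclSketch(mlir::Operation *op) {
  // Walk up to the closest ModuleOp, then do a symbol-table lookup in it;
  // returns a null op when no `free` declaration exists yet.
  auto module = op->getParentOfType<mlir::ModuleOp>();
  return module.lookupSymbol<mlir::LLVM::LLVMFuncOp>("free");
}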
- auto varargsAttr = funcOp.getAttrOfType("std.varargs"); + auto varargsAttr = funcOp->getAttrOfType("std.varargs"); TypeConverter::SignatureConversion result(funcOp.getNumArguments()); auto llvmType = getTypeConverter()->convertFunctionSignature( funcOp.getType(), varargsAttr && varargsAttr.getValue(), result); @@ -1407,7 +1407,7 @@ struct FuncOpConversion : public FuncOpConversionBase { return failure(); if (getTypeConverter()->getOptions().emitCWrappers || - funcOp.getAttrOfType(kEmitIfaceAttrName)) { + funcOp->getAttrOfType(kEmitIfaceAttrName)) { if (newFuncOp.isExternal()) wrapExternalFunction(rewriter, funcOp.getLoc(), *getTypeConverter(), funcOp, newFuncOp); @@ -1717,7 +1717,7 @@ struct AssertOpLowering : public ConvertOpToLLVMPattern { AssertOp::Adaptor transformed(operands); // Insert the `abort` declaration if necessary. - auto module = op.getParentOfType(); + auto module = op->getParentOfType(); auto abortFunc = module.lookupSymbol("abort"); if (!abortFunc) { OpBuilder::InsertionGuard guard(rewriter); @@ -2056,7 +2056,7 @@ struct AllocOpLowering : public AllocLikeOpLowering { Type elementPtrType = this->getElementPtrType(memRefType); Value allocatedPtr = createAllocCall(loc, "malloc", elementPtrType, {sizeBytes}, - allocOp.getParentOfType(), rewriter); + allocOp->getParentOfType(), rewriter); Value alignedPtr = allocatedPtr; if (alignment) { @@ -2138,7 +2138,7 @@ struct AlignedAllocOpLowering : public AllocLikeOpLowering { Type elementPtrType = this->getElementPtrType(memRefType); Value allocatedPtr = createAllocCall( loc, "aligned_alloc", elementPtrType, {allocAlignment, sizeBytes}, - allocOp.getParentOfType(), rewriter); + allocOp->getParentOfType(), rewriter); return std::make_tuple(allocatedPtr, allocatedPtr); } @@ -2363,11 +2363,11 @@ struct DeallocOpLowering : public ConvertOpToLLVMPattern { // Insert the `free` declaration if it is not already present. auto freeFunc = - op.getParentOfType().lookupSymbol("free"); + op->getParentOfType().lookupSymbol("free"); if (!freeFunc) { OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart( - op.getParentOfType().getBody()); + op->getParentOfType().getBody()); freeFunc = rewriter.create( rewriter.getUnknownLoc(), "free", LLVM::LLVMType::getFunctionTy(getVoidType(), getVoidPtrType(), diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp index 1c7aec1f55a3ef..cd5079f50425b0 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp +++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp @@ -280,7 +280,7 @@ class AllocOpPattern final : public SPIRVOpLowering { // Insert spv.globalVariable for this allocation. Operation *parent = - SymbolTable::getNearestSymbolTable(operation.getParentOp()); + SymbolTable::getNearestSymbolTable(operation->getParentOp()); if (!parent) return failure(); Location loc = operation.getLoc(); @@ -868,9 +868,9 @@ IntLoadOpPattern::matchAndRewrite(LoadOp loadOp, ArrayRef operands, srcBits, dstBits, rewriter); Value spvLoadOp = rewriter.create( loc, dstType, adjustedPtr, - loadOp.getAttrOfType( + loadOp->getAttrOfType( spirv::attributeName()), - loadOp.getAttrOfType("alignment")); + loadOp->getAttrOfType("alignment")); // Shift the bits to the rightmost. 
// ____XXXX________ -> ____________XXXX diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 7c3d1b762d7ee3..005db18c54e51d 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -1670,7 +1670,7 @@ void AffineForOp::setLowerBound(ValueRange lbOperands, AffineMap map) { newOperands.append(ubOperands.begin(), ubOperands.end()); auto iterOperands = getIterOperands(); newOperands.append(iterOperands.begin(), iterOperands.end()); - getOperation()->setOperands(newOperands); + (*this)->setOperands(newOperands); setAttr(getLowerBoundAttrName(), AffineMapAttr::get(map)); } @@ -1683,7 +1683,7 @@ void AffineForOp::setUpperBound(ValueRange ubOperands, AffineMap map) { newOperands.append(ubOperands.begin(), ubOperands.end()); auto iterOperands = getIterOperands(); newOperands.append(iterOperands.begin(), iterOperands.end()); - getOperation()->setOperands(newOperands); + (*this)->setOperands(newOperands); setAttr(getUpperBoundAttrName(), AffineMapAttr::get(map)); } @@ -1902,7 +1902,7 @@ struct SimplifyDeadElse : public OpRewritePattern { static LogicalResult verify(AffineIfOp op) { // Verify that we have a condition attribute. auto conditionAttr = - op.getAttrOfType(op.getConditionAttrName()); + op->getAttrOfType(op.getConditionAttrName()); if (!conditionAttr) return op.emitOpError( "requires an integer set attribute named 'condition'"); @@ -1975,7 +1975,7 @@ static ParseResult parseAffineIfOp(OpAsmParser &parser, static void print(OpAsmPrinter &p, AffineIfOp op) { auto conditionAttr = - op.getAttrOfType(op.getConditionAttrName()); + op->getAttrOfType(op.getConditionAttrName()); p << "affine.if " << conditionAttr; printDimAndSymbolList(op.operand_begin(), op.operand_end(), conditionAttr.getValue().getNumDims(), p); @@ -1999,7 +1999,9 @@ static void print(OpAsmPrinter &p, AffineIfOp op) { } IntegerSet AffineIfOp::getIntegerSet() { - return getAttrOfType(getConditionAttrName()).getValue(); + return (*this) + ->getAttrOfType(getConditionAttrName()) + .getValue(); } void AffineIfOp::setIntegerSet(IntegerSet newSet) { setAttr(getConditionAttrName(), IntegerSetAttr::get(newSet)); @@ -2007,7 +2009,7 @@ void AffineIfOp::setIntegerSet(IntegerSet newSet) { void AffineIfOp::setConditional(IntegerSet set, ValueRange operands) { setIntegerSet(set); - getOperation()->setOperands(operands); + (*this)->setOperands(operands); } void AffineIfOp::build(OpBuilder &builder, OperationState &result, @@ -2120,7 +2122,7 @@ static ParseResult parseAffineLoadOp(OpAsmParser &parser, static void print(OpAsmPrinter &p, AffineLoadOp op) { p << "affine.load " << op.getMemRef() << '['; if (AffineMapAttr mapAttr = - op.getAttrOfType(op.getMapAttrName())) + op->getAttrOfType(op.getMapAttrName())) p.printAffineMapOfSSAIds(mapAttr, op.getMapOperands()); p << ']'; p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{op.getMapAttrName()}); @@ -2163,7 +2165,7 @@ LogicalResult verify(AffineLoadOp op) { if (failed(verifyMemoryOpIndexing( op.getOperation(), - op.getAttrOfType(op.getMapAttrName()), + op->getAttrOfType(op.getMapAttrName()), op.getMapOperands(), memrefType, /*numIndexOperands=*/op.getNumOperands() - 1))) return failure(); @@ -2236,7 +2238,7 @@ static void print(OpAsmPrinter &p, AffineStoreOp op) { p << "affine.store " << op.getValueToStore(); p << ", " << op.getMemRef() << '['; if (AffineMapAttr mapAttr = - op.getAttrOfType(op.getMapAttrName())) + op->getAttrOfType(op.getMapAttrName())) p.printAffineMapOfSSAIds(mapAttr, 
op.getMapOperands()); p << ']'; p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{op.getMapAttrName()}); @@ -2252,7 +2254,7 @@ LogicalResult verify(AffineStoreOp op) { if (failed(verifyMemoryOpIndexing( op.getOperation(), - op.getAttrOfType(op.getMapAttrName()), + op->getAttrOfType(op.getMapAttrName()), op.getMapOperands(), memrefType, /*numIndexOperands=*/op.getNumOperands() - 2))) return failure(); @@ -2438,7 +2440,7 @@ static ParseResult parseAffinePrefetchOp(OpAsmParser &parser, static void print(OpAsmPrinter &p, AffinePrefetchOp op) { p << AffinePrefetchOp::getOperationName() << " " << op.memref() << '['; - AffineMapAttr mapAttr = op.getAttrOfType(op.getMapAttrName()); + AffineMapAttr mapAttr = op->getAttrOfType(op.getMapAttrName()); if (mapAttr) { SmallVector operands(op.getMapOperands()); p.printAffineMapOfSSAIds(mapAttr, operands); @@ -2454,7 +2456,7 @@ static void print(OpAsmPrinter &p, AffinePrefetchOp op) { } static LogicalResult verify(AffinePrefetchOp op) { - auto mapAttr = op.getAttrOfType(op.getMapAttrName()); + auto mapAttr = op->getAttrOfType(op.getMapAttrName()); if (mapAttr) { AffineMap map = mapAttr.getValue(); if (map.getNumResults() != op.getMemRefType().getRank()) @@ -2624,7 +2626,7 @@ void AffineParallelOp::setLowerBounds(ValueRange lbOperands, AffineMap map) { SmallVector newOperands(lbOperands); newOperands.append(ubOperands.begin(), ubOperands.end()); - getOperation()->setOperands(newOperands); + (*this)->setOperands(newOperands); lowerBoundsMapAttr(AffineMapAttr::get(map)); } @@ -2636,7 +2638,7 @@ void AffineParallelOp::setUpperBounds(ValueRange ubOperands, AffineMap map) { SmallVector newOperands(getLowerBoundsOperands()); newOperands.append(ubOperands.begin(), ubOperands.end()); - getOperation()->setOperands(newOperands); + (*this)->setOperands(newOperands); upperBoundsMapAttr(AffineMapAttr::get(map)); } @@ -2880,7 +2882,7 @@ static ParseResult parseAffineParallelOp(OpAsmParser &parser, //===----------------------------------------------------------------------===// static LogicalResult verify(AffineYieldOp op) { - auto *parentOp = op.getParentOp(); + auto *parentOp = op->getParentOp(); auto results = parentOp->getResults(); auto operands = op.getOperands(); @@ -2960,7 +2962,7 @@ static ParseResult parseAffineVectorLoadOp(OpAsmParser &parser, static void print(OpAsmPrinter &p, AffineVectorLoadOp op) { p << "affine.vector_load " << op.getMemRef() << '['; if (AffineMapAttr mapAttr = - op.getAttrOfType(op.getMapAttrName())) + op->getAttrOfType(op.getMapAttrName())) p.printAffineMapOfSSAIds(mapAttr, op.getMapOperands()); p << ']'; p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{op.getMapAttrName()}); @@ -2981,7 +2983,7 @@ static LogicalResult verify(AffineVectorLoadOp op) { MemRefType memrefType = op.getMemRefType(); if (failed(verifyMemoryOpIndexing( op.getOperation(), - op.getAttrOfType(op.getMapAttrName()), + op->getAttrOfType(op.getMapAttrName()), op.getMapOperands(), memrefType, /*numIndexOperands=*/op.getNumOperands() - 1))) return failure(); @@ -3048,7 +3050,7 @@ static void print(OpAsmPrinter &p, AffineVectorStoreOp op) { p << "affine.vector_store " << op.getValueToStore(); p << ", " << op.getMemRef() << '['; if (AffineMapAttr mapAttr = - op.getAttrOfType(op.getMapAttrName())) + op->getAttrOfType(op.getMapAttrName())) p.printAffineMapOfSSAIds(mapAttr, op.getMapOperands()); p << ']'; p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{op.getMapAttrName()}); @@ -3059,7 +3061,7 @@ static LogicalResult verify(AffineVectorStoreOp op) { 
MemRefType memrefType = op.getMemRefType(); if (failed(verifyMemoryOpIndexing( op.getOperation(), - op.getAttrOfType(op.getMapAttrName()), + op->getAttrOfType(op.getMapAttrName()), op.getMapOperands(), memrefType, /*numIndexOperands=*/op.getNumOperands() - 2))) return failure(); diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index e5f5a6d8998fd5..b57b2908d63507 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -221,7 +221,7 @@ LogicalResult mlir::hoistAffineIfOp(AffineIfOp ifOp, bool *folded) { // Canonicalize to remove dead else blocks (happens whenever an 'if' moves up // a sequence of affine.fors that are all perfectly nested). applyPatternsAndFoldGreedily( - hoistedIfOp.getParentWithTrait(), + hoistedIfOp->getParentWithTrait(), frozenPatterns); return success(); diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp index 1e84ba3418bb1b..36b3393118c3ad 100644 --- a/mlir/lib/Dialect/Async/IR/Async.cpp +++ b/mlir/lib/Dialect/Async/IR/Async.cpp @@ -102,7 +102,7 @@ Type ValueType::getValueType() { return getImpl()->valueType; } static LogicalResult verify(YieldOp op) { // Get the underlying value types from async values returned from the // parent `async.execute` operation. - auto executeOp = op.getParentOfType(); + auto executeOp = op->getParentOfType(); auto types = llvm::map_range(executeOp.results(), [](const OpResult &result) { return result.getType().cast().getValueType(); }); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index e9d8009fb2e9dc..ee66ede112146a 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -80,13 +80,13 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult { // Ignore launches that are nested more or less deep than functions in the // module we are currently checking. - if (!launchOp.getParentOp() || - launchOp.getParentOp()->getParentOp() != module) + if (!launchOp->getParentOp() || + launchOp->getParentOp()->getParentOp() != module) return success(); // Ignore launch ops with missing attributes here. The errors will be // reported by the verifiers of those ops. 
- if (!launchOp.getAttrOfType( + if (!launchOp->getAttrOfType( LaunchFuncOp::getKernelAttrName())) return success(); @@ -434,7 +434,7 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result, result.addOperands({gridSize.x, gridSize.y, gridSize.z, blockSize.x, blockSize.y, blockSize.z}); result.addOperands(kernelOperands); - auto kernelModule = kernelFunc.getParentOfType(); + auto kernelModule = kernelFunc->getParentOfType(); auto kernelSymbol = builder.getSymbolRefAttr( kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())}); result.addAttribute(getKernelAttrName(), kernelSymbol); @@ -470,16 +470,17 @@ KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() { } static LogicalResult verify(LaunchFuncOp op) { - auto module = op.getParentOfType(); + auto module = op->getParentOfType(); if (!module) return op.emitOpError("expected to belong to a module"); - if (!module.getAttrOfType(GPUDialect::getContainerModuleAttrName())) + if (!module->getAttrOfType( + GPUDialect::getContainerModuleAttrName())) return op.emitOpError( "expected the closest surrounding module to have the '" + GPUDialect::getContainerModuleAttrName() + "' attribute"); - auto kernelAttr = op.getAttrOfType(op.getKernelAttrName()); + auto kernelAttr = op->getAttrOfType(op.getKernelAttrName()); if (!kernelAttr) return op.emitOpError("symbol reference attribute '" + op.getKernelAttrName() + "' must be specified"); @@ -522,7 +523,7 @@ static void printLaunchFuncOperands(OpAsmPrinter &printer, Operation *, /// workgroup memory. BlockArgument GPUFuncOp::addWorkgroupAttribution(Type type) { auto attrName = getNumWorkgroupAttributionsAttrName(); - auto attr = getAttrOfType(attrName); + auto attr = (*this)->getAttrOfType(attrName); setAttr(attrName, IntegerAttr::get(attr.getType(), attr.getValue() + 1)); return getBody().insertArgument(getType().getNumInputs() + attr.getInt(), type); @@ -777,7 +778,7 @@ static ParseResult parseReturnOp(OpAsmParser &parser, OperationState &result) { } static LogicalResult verify(gpu::ReturnOp returnOp) { - GPUFuncOp function = returnOp.getParentOfType(); + GPUFuncOp function = returnOp->getParentOfType(); FunctionType funType = function.getType(); diff --git a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp index f2c7010be29189..eaa777c380604b 100644 --- a/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp @@ -140,7 +140,7 @@ struct GpuAsyncRegionPass::DeferWaitCallback { ~DeferWaitCallback() { for (size_t i = 0; i < worklist.size(); ++i) { auto waitOp = worklist[i]; - auto executeOp = waitOp.getParentOfType(); + auto executeOp = waitOp->getParentOfType(); auto numDependencies = waitOp.asyncDependencies().size(); // Erase `gpu.wait` and return async dependencies from region instead. 
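Editorial aside, not part of the patch: a condensed sketch of the launch-site checks in the verify(LaunchFuncOp) hunk above, spelled with the Operation-level accessors this patch switches to. The error messages are abbreviated, the helper is illustrative, and the includes assume the GPU dialect header layout of this revision.

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/IR/Module.h"

static mlir::LogicalResult verifyLaunchSiteSketch(mlir::Operation *launchOp) {
  auto module = launchOp->getParentOfType<mlir::ModuleOp>();
  if (!module)
    return launchOp->emitOpError("expected to belong to a module");
  // The surrounding module must carry the gpu.container_module unit attribute.
  if (!module->getAttrOfType<mlir::UnitAttr>(
          mlir::gpu::GPUDialect::getContainerModuleAttrName()))
    return launchOp->emitOpError("missing container-module attribute");
  return mlir::success();
}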
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index c7be304236c366..ac5fe8c9458d34 100644 --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -243,7 +243,7 @@ class GpuKernelOutliningPass auto funcWalkResult = func.walk([&](gpu::LaunchOp op) { llvm::SetVector operands; std::string kernelFnName = - Twine(op.getParentOfType().getName(), "_kernel").str(); + Twine(op->getParentOfType().getName(), "_kernel").str(); // Pull in instructions that can be sunk if (failed(sinkOperationsIntoLaunchOp(op))) diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp index b953bad6762760..43a27db9df6924 100644 --- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -123,7 +123,7 @@ static void mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid) { // Do not try to add a mapping to already mapped loops or nested loops. if (parallelOp.getAttr(getMappingAttrName()) || - ((mappingLevel == MapGrid) && parallelOp.getParentOfType())) + ((mappingLevel == MapGrid) && parallelOp->getParentOfType())) return; MLIRContext *ctx = parallelOp.getContext(); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index f481b702822b6f..4c48dccad096fa 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -457,7 +457,7 @@ static ParseResult parseInvokeOp(OpAsmParser &parser, OperationState &result) { static LogicalResult verify(LandingpadOp op) { Value value; - if (LLVMFuncOp func = op.getParentOfType()) { + if (LLVMFuncOp func = op->getParentOfType()) { if (!func.personality().hasValue()) return op.emitError( "llvm.landingpad needs to be in a function with a personality"); @@ -985,11 +985,13 @@ static OpTy lookupSymbolInModule(Operation *parent, StringRef name) { } GlobalOp AddressOfOp::getGlobal() { - return lookupSymbolInModule(getParentOp(), global_name()); + return lookupSymbolInModule((*this)->getParentOp(), + global_name()); } LLVMFuncOp AddressOfOp::getFunction() { - return lookupSymbolInModule(getParentOp(), global_name()); + return lookupSymbolInModule((*this)->getParentOp(), + global_name()); } static LogicalResult verify(AddressOfOp op) { @@ -1203,7 +1205,7 @@ static LogicalResult verify(GlobalOp op) { if (!LLVMPointerType::isValidElementType(op.getType())) return op.emitOpError( "expects type to be a valid element type for an LLVM pointer"); - if (op.getParentOp() && !satisfiesLLVMModule(op.getParentOp())) + if (op->getParentOp() && !satisfiesLLVMModule(op->getParentOp())) return op.emitOpError("must appear at the module level"); if (auto strAttr = op.getValueOrNull().dyn_cast_or_null()) { diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 19c42c1e990cb8..707ff7c1b089ba 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -109,8 +109,8 @@ static LogicalResult verify(MmaOp op) { "s or 8 floats"); } - auto alayout = op.getAttrOfType("alayout"); - auto blayout = op.getAttrOfType("blayout"); + auto alayout = op->getAttrOfType("alayout"); + auto blayout = op->getAttrOfType("blayout"); if (!(alayout && blayout) || !(alayout.getValue() == "row" || alayout.getValue() == "col") || diff --git 
a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 92668be811f687..9d7148fe68dd4b 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1139,7 +1139,7 @@ static LogicalResult verifyYield(linalg::YieldOp op, } static LogicalResult verify(linalg::YieldOp op) { - auto *parentOp = op.getParentOp(); + auto *parentOp = op->getParentOp(); if (parentOp->getNumRegions() != 1 || parentOp->getRegion(0).empty()) return op.emitOpError("expected single non-empty parent region"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp index bc86dcd9e05011..652a036838ede7 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/CodegenStrategy.cpp @@ -67,7 +67,7 @@ void mlir::linalg::CodegenStrategy::transform(FuncOp func) const { // Post staged patterns transforms //===--------------------------------------------------------------------===// - ModuleOp module = func.getParentOfType(); + ModuleOp module = func->getParentOfType(); // Programmatic splitting of slow/fast path vector transfers. OwningRewritePatternList patterns; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp index 9aeb39e6b565ab..9e7e7efdd1361f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -89,8 +89,8 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) { func.walk([&](vector::TransferReadOp transferRead) { LLVM_DEBUG(DBGS() << "Candidate for hoisting: " << *transferRead.getOperation() << "\n"); - auto loop = dyn_cast(transferRead.getParentOp()); - LLVM_DEBUG(DBGS() << "Parent op: " << *transferRead.getParentOp() + auto loop = dyn_cast(transferRead->getParentOp()); + LLVM_DEBUG(DBGS() << "Parent op: " << *transferRead->getParentOp() << "\n"); if (!loop) return WalkResult::advance(); diff --git a/mlir/lib/Dialect/PDL/IR/PDL.cpp b/mlir/lib/Dialect/PDL/IR/PDL.cpp index 951c76246b1247..49f7f273af19f8 100644 --- a/mlir/lib/Dialect/PDL/IR/PDL.cpp +++ b/mlir/lib/Dialect/PDL/IR/PDL.cpp @@ -97,7 +97,7 @@ static LogicalResult verify(AttributeOp op) { Value attrType = op.type(); Optional attrValue = op.value(); - if (!attrValue && isa(op.getParentOp())) + if (!attrValue && isa(op->getParentOp())) return op.emitOpError("expected constant value when specified within a " "`pdl.rewrite`"); if (attrValue && attrType) @@ -273,7 +273,7 @@ static LogicalResult verifyResultTypesAreInferrable(OperationOp op, } static LogicalResult verify(OperationOp op) { - bool isWithinRewrite = isa(op.getParentOp()); + bool isWithinRewrite = isa(op->getParentOp()); if (isWithinRewrite && !op.name()) return op.emitOpError("must have an operation name when nested within " "a `pdl.rewrite`"); diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp index 5bff924b8e7df7..1ea0571cf69024 100644 --- a/mlir/lib/Dialect/SCF/SCF.cpp +++ b/mlir/lib/Dialect/SCF/SCF.cpp @@ -1172,7 +1172,7 @@ static void print(OpAsmPrinter &p, ReduceOp op) { static LogicalResult verify(ReduceReturnOp op) { // The type of the return value should be the same type as the type of the // operand of the enclosing ReduceOp. 
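Editorial aside, not part of the patch: the terminator verifiers touched here (linalg.yield above, scf.reduce.return just below, shape.yield later on) share one shape, fetch the parent through the Operation and compare against it. A stripped-down sketch with an illustrative diagnostic.

#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

static mlir::LogicalResult verifyYieldLikeSketch(mlir::Operation *yieldOp) {
  mlir::Operation *parent = yieldOp->getParentOp();
  if (!parent || parent->getNumResults() != yieldOp->getNumOperands())
    return yieldOp->emitOpError("operand count does not match parent results");
  return mlir::success();
}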
- auto reduceOp = cast(op.getParentOp()); + auto reduceOp = cast(op->getParentOp()); Type reduceType = reduceOp.operand().getType(); if (reduceType != op.result().getType()) return op.emitOpError() << "needs to have type " << reduceType diff --git a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp index bf9f797a118ea1..9197375cc237b6 100644 --- a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp @@ -144,7 +144,7 @@ void mlir::scf::populateSCFStructuralTypeConversionsAndLegality( target.addDynamicallyLegalOp([&](scf::YieldOp op) { // We only have conversions for a subset of ops that use scf.yield // terminators. - if (!isa(op.getParentOp())) + if (!isa(op->getParentOp())) return true; return typeConverter.isLegal(op.getOperandTypes()); }); diff --git a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp index a451c167aeea33..c30c823792ae2b 100644 --- a/mlir/lib/Dialect/SCF/Transforms/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/Utils.cpp @@ -89,7 +89,7 @@ void mlir::outlineIfOp(OpBuilder &b, scf::IfOp ifOp, FuncOp *thenFn, // Outline before current function. OpBuilder::InsertionGuard g(b); - b.setInsertionPoint(ifOp.getParentOfType()); + b.setInsertionPoint(ifOp->getParentOfType()); llvm::SetVector captures; getUsedValuesDefinedAbove(ifOrElseRegion, captures); diff --git a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp index 10260b8029c6a4..756e318ac39939 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp @@ -523,7 +523,7 @@ static spirv::GlobalVariableOp getBuiltinVariable(Block &body, // Look through all global variables in the given `body` block and check if // there is a spv.globalVariable that has the same `builtin` attribute. 
for (auto varOp : body.getOps()) { - if (auto builtinAttr = varOp.getAttrOfType( + if (auto builtinAttr = varOp->getAttrOfType( spirv::SPIRVDialect::getAttributeName( spirv::Decoration::BuiltIn))) { auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue()); diff --git a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp index 19e0f98f26e96b..03e416e9544140 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp @@ -1046,7 +1046,7 @@ void spirv::AddressOfOp::build(OpBuilder &builder, OperationState &state, static LogicalResult verify(spirv::AddressOfOp addressOfOp) { auto varOp = dyn_cast_or_null( - SymbolTable::lookupNearestSymbolFrom(addressOfOp.getParentOp(), + SymbolTable::lookupNearestSymbolFrom(addressOfOp->getParentOp(), addressOfOp.variable())); if (!varOp) { return addressOfOp.emitOpError("expected spv.globalVariable symbol"); @@ -1849,7 +1849,7 @@ static LogicalResult verify(spirv::FunctionCallOp functionCallOp) { auto funcOp = dyn_cast_or_null(SymbolTable::lookupNearestSymbolFrom( - functionCallOp.getParentOp(), fnName)); + functionCallOp->getParentOp(), fnName)); if (!funcOp) { return functionCallOp.emitOpError("callee function '") << fnName << "' not found in nearest symbol table"; @@ -1898,7 +1898,7 @@ static LogicalResult verify(spirv::FunctionCallOp functionCallOp) { } CallInterfaceCallable spirv::FunctionCallOp::getCallableForCallee() { - return getAttrOfType(kCallee); + return (*this)->getAttrOfType(kCallee); } Operation::operand_range spirv::FunctionCallOp::getArgOperands() { @@ -2005,9 +2005,9 @@ static LogicalResult verify(spirv::GlobalVariableOp varOp) { } if (auto init = - varOp.getAttrOfType(kInitializerAttrName)) { + varOp->getAttrOfType(kInitializerAttrName)) { Operation *initOp = SymbolTable::lookupNearestSymbolFrom( - varOp.getParentOp(), init.getValue()); + varOp->getParentOp(), init.getValue()); // TODO: Currently only variable initialization with specialization // constants and other variables is supported. They could be normal // constants in the module scope as well. @@ -2066,7 +2066,7 @@ static LogicalResult verify(spirv::GroupNonUniformBroadcastOp broadcastOp) { // SPIR-V spec: "Before version 1.5, Id must come from a // constant instruction. 
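Editorial aside, not part of the patch: the SPIR-V verifiers above (AddressOf, FunctionCall, GlobalVariable initializers, SpecConstantComposite) all share one resolution step, looking a symbol up relative to the op's parent. A generic sketch; the dyn_cast to the concrete op kind is left to the caller.

#include "mlir/IR/SymbolTable.h"

static mlir::Operation *resolveNearestSymbolSketch(mlir::Operation *op,
                                                   llvm::StringRef symbol) {
  // Start at the parent so the lookup covers the op's own enclosing symbol
  // table; returns nullptr when the symbol is not found.
  return mlir::SymbolTable::lookupNearestSymbolFrom(op->getParentOp(), symbol);
}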
auto targetEnv = spirv::getDefaultTargetEnv(broadcastOp.getContext()); - if (auto spirvModule = broadcastOp.getParentOfType()) + if (auto spirvModule = broadcastOp->getParentOfType()) targetEnv = spirv::lookupTargetEnvOrDefault(spirvModule); if (targetEnv.getVersion() < spirv::Version::V_1_5) { @@ -2407,12 +2407,12 @@ void spirv::LoopOp::addEntryAndMergeBlock() { //===----------------------------------------------------------------------===// static LogicalResult verify(spirv::MergeOp mergeOp) { - auto *parentOp = mergeOp.getParentOp(); + auto *parentOp = mergeOp->getParentOp(); if (!parentOp || !isa(parentOp)) return mergeOp.emitOpError( "expected parent op to be 'spv.selection' or 'spv.loop'"); - Block &parentLastBlock = mergeOp.getParentRegion()->back(); + Block &parentLastBlock = mergeOp->getParentRegion()->back(); if (mergeOp.getOperation() != parentLastBlock.getTerminator()) return mergeOp.emitOpError( "can only be used in the last block of 'spv.selection' or 'spv.loop'"); @@ -2577,7 +2577,7 @@ static LogicalResult verify(spirv::ModuleOp moduleOp) { static LogicalResult verify(spirv::ReferenceOfOp referenceOfOp) { auto *specConstSym = SymbolTable::lookupNearestSymbolFrom( - referenceOfOp.getParentOp(), referenceOfOp.spec_const()); + referenceOfOp->getParentOp(), referenceOfOp.spec_const()); Type constType; auto specConstOp = dyn_cast_or_null(specConstSym); @@ -2792,13 +2792,13 @@ static ParseResult parseSpecConstantOp(OpAsmParser &parser, static void print(spirv::SpecConstantOp constOp, OpAsmPrinter &printer) { printer << spirv::SpecConstantOp::getOperationName() << ' '; printer.printSymbolName(constOp.sym_name()); - if (auto specID = constOp.getAttrOfType(kSpecIdAttrName)) + if (auto specID = constOp->getAttrOfType(kSpecIdAttrName)) printer << ' ' << kSpecIdAttrName << '(' << specID.getInt() << ')'; printer << " = " << constOp.default_value(); } static LogicalResult verify(spirv::SpecConstantOp constOp) { - if (auto specID = constOp.getAttrOfType(kSpecIdAttrName)) + if (auto specID = constOp->getAttrOfType(kSpecIdAttrName)) if (specID.getValue().isNegative()) return constOp.emitOpError("SpecId cannot be negative"); @@ -3383,7 +3383,7 @@ static LogicalResult verify(spirv::SpecConstantCompositeOp constOp) { auto constituentSpecConstOp = dyn_cast(SymbolTable::lookupNearestSymbolFrom( - constOp.getParentOp(), constituent.getValue())); + constOp->getParentOp(), constituent.getValue())); if (constituentSpecConstOp.default_value().getType() != cType.getElementType(index)) @@ -3400,7 +3400,7 @@ static LogicalResult verify(spirv::SpecConstantCompositeOp constOp) { //===----------------------------------------------------------------------===// static LogicalResult verify(spirv::YieldOp yieldOp) { - Operation *parentOp = yieldOp.getParentOp(); + Operation *parentOp = yieldOp->getParentOp(); if (!parentOp || !isa(parentOp)) return yieldOp.emitOpError( diff --git a/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp index b7236c17c00040..691cf973f06cbc 100644 --- a/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp @@ -638,8 +638,8 @@ void Serializer::processExtension() { } void Serializer::processMemoryModel() { - uint32_t mm = module.getAttrOfType("memory_model").getInt(); - uint32_t am = module.getAttrOfType("addressing_model").getInt(); + uint32_t mm = module->getAttrOfType("memory_model").getInt(); + uint32_t am = module->getAttrOfType("addressing_model").getInt(); 
encodeInstructionInto(memoryModel, spirv::Opcode::OpMemoryModel, {am, mm}); } @@ -656,7 +656,7 @@ LogicalResult Serializer::processSpecConstantOp(spirv::SpecConstantOp op) { if (auto resultID = prepareConstantScalar(op.getLoc(), op.default_value(), /*isSpec=*/true)) { // Emit the OpDecorate instruction for SpecId. - if (auto specID = op.getAttrOfType("spec_id")) { + if (auto specID = op->getAttrOfType("spec_id")) { auto val = static_cast(specID.getInt()); emitDecoration(resultID, spirv::Decoration::SpecId, {val}); } @@ -1973,7 +1973,7 @@ Serializer::processOp(spirv::ControlBarrierOp op) { SmallVector operands; for (auto argName : argNames) { - auto argIntAttr = op.getAttrOfType(argName); + auto argIntAttr = op->getAttrOfType(argName); auto operand = prepareConstantInt(op.getLoc(), argIntAttr); if (!operand) { return failure(); @@ -2020,7 +2020,7 @@ Serializer::processOp(spirv::MemoryBarrierOp op) { SmallVector operands; for (auto argName : argNames) { - auto argIntAttr = op.getAttrOfType(argName); + auto argIntAttr = op->getAttrOfType(argName); auto operand = prepareConstantInt(op.getLoc(), argIntAttr); if (!operand) { return failure(); diff --git a/mlir/lib/Dialect/SPIRV/Transforms/DecorateSPIRVCompositeTypeLayoutPass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/DecorateSPIRVCompositeTypeLayoutPass.cpp index 53160427cf39c2..282c6bbfb65194 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/DecorateSPIRVCompositeTypeLayoutPass.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/DecorateSPIRVCompositeTypeLayoutPass.cpp @@ -63,7 +63,7 @@ class SPIRVAddressOfOpLayoutInfoDecoration LogicalResult matchAndRewrite(spirv::AddressOfOp op, PatternRewriter &rewriter) const override { - auto spirvModule = op.getParentOfType(); + auto spirvModule = op->getParentOfType(); auto varName = op.variable(); auto varOp = spirvModule.lookupSymbol(varName); diff --git a/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp index 24679e4d523094..12f0152079ccbe 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp @@ -27,7 +27,7 @@ static spirv::GlobalVariableOp createGlobalVarForEntryPointArgument(OpBuilder &builder, spirv::FuncOp funcOp, unsigned argIndex, spirv::InterfaceVarABIAttr abiInfo) { - auto spirvModule = funcOp.getParentOfType(); + auto spirvModule = funcOp->getParentOfType(); if (!spirvModule) return nullptr; @@ -70,7 +70,7 @@ createGlobalVarForEntryPointArgument(OpBuilder &builder, spirv::FuncOp funcOp, static LogicalResult getInterfaceVariables(spirv::FuncOp funcOp, SmallVectorImpl &interfaceVars) { - auto module = funcOp.getParentOfType(); + auto module = funcOp->getParentOfType(); if (!module) { return failure(); } @@ -108,13 +108,13 @@ static LogicalResult lowerEntryPointABIAttr(spirv::FuncOp funcOp, OpBuilder &builder) { auto entryPointAttrName = spirv::getEntryPointABIAttrName(); auto entryPointAttr = - funcOp.getAttrOfType(entryPointAttrName); + funcOp->getAttrOfType(entryPointAttrName); if (!entryPointAttr) { return failure(); } OpBuilder::InsertionGuard moduleInsertionGuard(builder); - auto spirvModule = funcOp.getParentOfType(); + auto spirvModule = funcOp->getParentOfType(); builder.setInsertionPoint(spirvModule.body().front().getTerminator()); // Adds the spv.EntryPointOp after collecting all the interface variables @@ -169,7 +169,7 @@ class LowerABIAttributesPass final LogicalResult ProcessInterfaceVarABI::matchAndRewrite( spirv::FuncOp funcOp, 
ArrayRef operands, ConversionPatternRewriter &rewriter) const { - if (!funcOp.getAttrOfType( + if (!funcOp->getAttrOfType( spirv::getEntryPointABIAttrName())) { // TODO: Non-entry point functions are not handled. return failure(); @@ -271,7 +271,7 @@ void LowerABIAttributesPass::runOnOperation() { SmallVector entryPointFns; auto entryPointAttrName = spirv::getEntryPointABIAttrName(); module.walk([&](spirv::FuncOp funcOp) { - if (funcOp.getAttrOfType(entryPointAttrName)) { + if (funcOp->getAttrOfType(entryPointAttrName)) { entryPointFns.push_back(funcOp); } }); diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index acb35b916f7e75..44f897cbf505b9 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -832,7 +832,7 @@ void SizeToIndexOp::getCanonicalizationPatterns( //===----------------------------------------------------------------------===// static LogicalResult verify(shape::YieldOp op) { - auto *parentOp = op.getParentOp(); + auto *parentOp = op->getParentOp(); auto results = parentOp->getResults(); auto operands = op.getOperands(); diff --git a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp index 49af5d7ce9a203..6190ff351ebf48 100644 --- a/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp +++ b/mlir/lib/Dialect/Shape/Transforms/ShapeToShapeLowering.cpp @@ -34,7 +34,7 @@ NumElementsOpConverter::matchAndRewrite(NumElementsOp op, PatternRewriter &rewriter) const { auto loc = op.getLoc(); Type valueType = op.getResult().getType(); - Value init = op.getDialect() + Value init = op->getDialect() ->materializeConstant(rewriter, rewriter.getIndexAttr(1), valueType, loc) ->getResult(0); diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index 300b8a42656c8c..34c3da9b5eca04 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -287,7 +287,7 @@ static LogicalResult verify(AllocOp op) { return verifyAllocLikeOp(op); } static LogicalResult verify(AllocaOp op) { // An alloca op needs to have an ancestor with an allocation scope trait. - if (!op.getParentWithTrait()) + if (!op->getParentWithTrait()) return op.emitOpError( "requires an ancestor op with AutomaticAllocationScope trait"); @@ -547,7 +547,7 @@ static void print(OpAsmPrinter &p, GenericAtomicRMWOp op) { //===----------------------------------------------------------------------===// static LogicalResult verify(AtomicYieldOp op) { - Type parentType = op.getParentOp()->getResultTypes().front(); + Type parentType = op->getParentOp()->getResultTypes().front(); Type resultType = op.result().getType(); if (parentType != resultType) return op.emitOpError() << "types mismatch between yield op: " << resultType @@ -660,9 +660,7 @@ Block *BranchOp::getDest() { return getSuccessor(); } void BranchOp::setDest(Block *block) { return setSuccessor(block); } -void BranchOp::eraseOperand(unsigned index) { - getOperation()->eraseOperand(index); -} +void BranchOp::eraseOperand(unsigned index) { (*this)->eraseOperand(index); } void BranchOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { @@ -684,7 +682,7 @@ Block *BranchOp::getSuccessorForOperands(ArrayRef) { return dest(); } LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { // Check that the callee attribute was specified. 
- auto fnAttr = getAttrOfType("callee"); + auto fnAttr = (*this)->getAttrOfType("callee"); if (!fnAttr) return emitOpError("requires a 'callee' symbol reference attribute"); FuncOp fn = symbolTable.lookupNearestSymbolFrom(*this, fnAttr); @@ -1176,7 +1174,7 @@ static LogicalResult verify(ConstantOp &op) { // Try to find the referenced function. auto fn = - op.getParentOfType().lookupSymbol(fnAttr.getValue()); + op->getParentOfType().lookupSymbol(fnAttr.getValue()); if (!fn) return op.emitOpError() << "reference to undefined function '" << fnAttr.getValue() << "'"; @@ -2626,7 +2624,7 @@ OpFoldResult RankOp::fold(ArrayRef operands) { //===----------------------------------------------------------------------===// static LogicalResult verify(ReturnOp op) { - auto function = cast(op.getParentOp()); + auto function = cast(op->getParentOp()); // The operand number and types must match the function signature. const auto &results = function.getType().getResults(); diff --git a/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp index 8c00cadf7032aa..b7de983dd3b1cc 100644 --- a/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransferOpTransforms.cpp @@ -190,7 +190,7 @@ void TransferOptimization::storeToLoadForwarding(vector::TransferReadOp read) { if (lastwrite == nullptr) return; - Region *topRegion = lastwrite.getParentRegion(); + Region *topRegion = lastwrite->getParentRegion(); Operation *readAncestor = findAncestorOpInRegion(topRegion, read); assert(readAncestor && "read op should be recursively part of the top region"); diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp index f15bfe90132686..f7bfc66c3fdce6 100644 --- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp @@ -2417,7 +2417,7 @@ LogicalResult mlir::vector::splitFullAndPartialTransfer( // Top of the function `alloc` for transient storage. Value alloc; { - FuncOp funcOp = xferOp.getParentOfType(); + FuncOp funcOp = xferOp->getParentOfType(); OpBuilder::InsertionGuard guard(b); b.setInsertionPointToStart(&funcOp.getRegion().front()); auto shape = xferOp.getVectorType().getShape(); diff --git a/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp index 92c0cb52a395e6..c091c72c7702f2 100644 --- a/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp @@ -86,7 +86,8 @@ mlir::translateModuleToROCDLIR(Operation *m, llvm::LLVMContext &llvmContext, // 2. Insert amdgpu-flat-workgroup-size(1, 1024) attribute. for (auto func : ModuleTranslation::getModuleBody(m).getOps()) { - if (!func.getAttrOfType(gpu::GPUDialect::getKernelFuncAttrName())) + if (!func->getAttrOfType( + gpu::GPUDialect::getKernelFuncAttrName())) continue; auto *llvmFunc = llvmModule->getFunction(func.getName()); diff --git a/mlir/lib/Transforms/Inliner.cpp b/mlir/lib/Transforms/Inliner.cpp index 6f144e7a0e833f..64c7ca86dc1e38 100644 --- a/mlir/lib/Transforms/Inliner.cpp +++ b/mlir/lib/Transforms/Inliner.cpp @@ -414,7 +414,7 @@ static bool shouldInline(ResolvedCall &resolvedCall) { // Don't allow inlining if the target is an ancestor of the call. This // prevents inlining recursively. if (resolvedCall.targetNode->getCallableRegion()->isAncestor( - resolvedCall.call.getParentRegion())) + resolvedCall.call->getParentRegion())) return false; // Otherwise, inline. 
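Editorial aside, not part of the patch: both the vector-transfer splitting and the loop-promotion hunks in this area materialize values at the top of the enclosing function, reached via op->getParentOfType<FuncOp>(). A small sketch of that insertion pattern; the constant being created and the include set (per this revision) are illustrative.

#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"

static mlir::Value createEntryBlockIndexSketch(mlir::OpBuilder &b,
                                               mlir::Operation *op,
                                               int64_t value) {
  auto func = op->getParentOfType<mlir::FuncOp>();
  // Temporarily hop to the function entry block, then restore the builder.
  mlir::OpBuilder::InsertionGuard guard(b);
  b.setInsertionPointToStart(&func.getBody().front());
  return b.create<mlir::ConstantIndexOp>(op->getLoc(), value).getResult();
}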
diff --git a/mlir/lib/Transforms/LoopCoalescing.cpp b/mlir/lib/Transforms/LoopCoalescing.cpp index 59b2fb9333dabc..ed7bff36321e1b 100644 --- a/mlir/lib/Transforms/LoopCoalescing.cpp +++ b/mlir/lib/Transforms/LoopCoalescing.cpp @@ -25,7 +25,7 @@ struct LoopCoalescingPass : public LoopCoalescingBase { func.walk([](scf::ForOp op) { // Ignore nested loops. - if (op.getParentOfType()) + if (op->getParentOfType()) return; SmallVector loops; diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp index 8b1e2fa630ec66..7d18de076e4bf2 100644 --- a/mlir/lib/Transforms/Utils/InliningUtils.cpp +++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp @@ -329,7 +329,7 @@ LogicalResult mlir::inlineCall(InlinerInterface &interface, // Builder used for any conversion operations that need to be materialized. OpBuilder castBuilder(call); Location castLoc = call.getLoc(); - auto *callInterface = interface.getInterfaceFor(call.getDialect()); + const auto *callInterface = interface.getInterfaceFor(call->getDialect()); // Map the provided call operands to the arguments of the region. BlockAndValueMapping mapper; diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp index 77b6744b07a6a9..9759300f2e42ca 100644 --- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp @@ -402,7 +402,7 @@ void mlir::fuseLoops(AffineForOp srcForOp, AffineForOp dstForOp, bool mlir::getLoopNestStats(AffineForOp forOpRoot, LoopNestStats *stats) { auto walkResult = forOpRoot.walk([&](AffineForOp forOp) { auto *childForOp = forOp.getOperation(); - auto *parentForOp = forOp.getParentOp(); + auto *parentForOp = forOp->getParentOp(); if (!llvm::isa(parentForOp)) { if (!isa(parentForOp)) { LLVM_DEBUG(llvm::dbgs() << "Expected parent AffineForOp"); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index bfbc2211e06172..1f549ca9efda11 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -163,7 +163,7 @@ LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) { auto *parentBlock = forOp->getBlock(); if (!iv.use_empty()) { if (forOp.hasConstantLowerBound()) { - OpBuilder topBuilder(forOp.getParentOfType().getBody()); + OpBuilder topBuilder(forOp->getParentOfType().getBody()); auto constOp = topBuilder.create( forOp.getLoc(), forOp.getConstantLowerBound()); iv.replaceAllUsesWith(constOp); @@ -1484,7 +1484,7 @@ mlir::isPerfectlyNested(ArrayRef loops) { auto enclosingLoop = loops.front(); for (auto loop : loops.drop_front()) { - auto parentForOp = dyn_cast(loop.getParentOp()); + auto parentForOp = dyn_cast(loop->getParentOp()); // parentForOp's body should be just this loop and the terminator. if (parentForOp != enclosingLoop || !hasTwoElements(parentForOp.getBody())) return false; @@ -3073,7 +3073,7 @@ mlir::separateFullTiles(MutableArrayRef inputNest, // Each successive for op has to be nested in the other. 
auto prevLoop = firstLoop; for (auto loop : inputNest.drop_front(1)) { - assert(loop.getParentOp() == prevLoop && "input not contiguously nested"); + assert(loop->getParentOp() == prevLoop && "input not contiguously nested"); prevLoop = loop; } diff --git a/mlir/test/lib/Dialect/Shape/TestShapeFunctions.cpp b/mlir/test/lib/Dialect/Shape/TestShapeFunctions.cpp index 688f24e5ec47c8..b7127c5edf326f 100644 --- a/mlir/test/lib/Dialect/Shape/TestShapeFunctions.cpp +++ b/mlir/test/lib/Dialect/Shape/TestShapeFunctions.cpp @@ -57,7 +57,7 @@ void ReportShapeFnPass::runOnOperation() { module.getBodyRegion().walk([&](FuncOp func) { // Skip ops in the shape function library. - if (isa(func.getParentOp())) + if (isa(func->getParentOp())) return; func.walk([&](Operation *op) { remarkShapeFn(op); }); diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 4e2c5b5c6a95ee..c7e1b7f48f4378 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -726,7 +726,7 @@ struct TestResource : public SideEffects::Resource::Base { void SideEffectOp::getEffects( SmallVectorImpl &effects) { // Check for an effects attribute on the op instance. - ArrayAttr effectsAttr = getAttrOfType("effects"); + ArrayAttr effectsAttr = (*this)->getAttrOfType("effects"); if (!effectsAttr) return; @@ -761,7 +761,7 @@ void SideEffectOp::getEffects( void SideEffectOp::getEffects( SmallVectorImpl &effects) { - auto effectsAttr = getAttrOfType("effect_parameter"); + auto effectsAttr = (*this)->getAttrOfType("effect_parameter"); if (!effectsAttr) return; diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 7547500b2835d7..1579e53e5277e8 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -321,7 +321,7 @@ def ConversionCallOp : TEST_Op<"conversion_call_op", /// Return the callee of this operation. CallInterfaceCallable getCallableForCallee() { - return getAttrOfType("callee"); + return (*this)->getAttrOfType("callee"); } }]; } diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 437a72176cf6e9..5aab1bbfe46363 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -899,7 +899,7 @@ struct TestMergeSingleBlockOps matchAndRewrite(SingleBlockImplicitTerminatorOp op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { SingleBlockImplicitTerminatorOp parentOp = - op.getParentOfType(); + op->getParentOfType(); if (!parentOp) return failure(); Block &innerBlock = op.region().front(); @@ -936,14 +936,14 @@ struct TestMergeBlocksPatternDriver /// Only allow `test.br` within test.merge_blocks op. target.addDynamicallyLegalOp([&](TestBranchOp op) -> bool { - return op.getParentOfType(); + return op->getParentOfType(); }); /// Expect that all nested test.SingleBlockImplicitTerminator ops are /// inlined. 
target.addDynamicallyLegalOp( [&](SingleBlockImplicitTerminatorOp op) -> bool { - return !op.getParentOfType(); + return !op->getParentOfType(); }); DenseSet unlegalizedOps; diff --git a/mlir/test/lib/IR/TestFunc.cpp b/mlir/test/lib/IR/TestFunc.cpp index cdb4663e0b3675..10326ebe9631d0 100644 --- a/mlir/test/lib/IR/TestFunc.cpp +++ b/mlir/test/lib/IR/TestFunc.cpp @@ -68,7 +68,7 @@ struct TestFuncSetType SymbolTable symbolTable(module); for (FuncOp func : module.getOps()) { - auto sym = func.getAttrOfType("test.set_type_from"); + auto sym = func->getAttrOfType("test.set_type_from"); if (!sym) continue; func.setType(symbolTable.lookup(sym.getValue()).getType()); diff --git a/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp index cd4d91f60bc28e..4872dcc6e77c53 100644 --- a/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp +++ b/mlir/test/lib/Transforms/TestAffineLoopParametricTiling.cpp @@ -33,7 +33,7 @@ static void checkIfTilingParametersExist(ArrayRef band) { assert(!band.empty() && "no loops in input band"); AffineForOp topLoop = band[0]; - if (FuncOp funcOp = dyn_cast(topLoop.getParentOp())) + if (FuncOp funcOp = dyn_cast(topLoop->getParentOp())) assert(funcOp.getNumArguments() >= band.size() && "Too few tile sizes"); } @@ -44,7 +44,7 @@ static void checkIfTilingParametersExist(ArrayRef band) { static void getTilingParameters(ArrayRef band, SmallVectorImpl &tilingParameters) { AffineForOp topLoop = band[0]; - Region *funcOpRegion = topLoop.getParentRegion(); + Region *funcOpRegion = topLoop->getParentRegion(); unsigned nestDepth = band.size(); for (BlockArgument blockArgument : diff --git a/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp index 1591a7435c72a4..046fad43c3bf96 100644 --- a/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp +++ b/mlir/test/lib/Transforms/TestLinalgFusionTransforms.cpp @@ -192,7 +192,7 @@ struct TestLinalgGreedyFusion pm.addPass(createLoopInvariantCodeMotionPass()); pm.addPass(createCanonicalizerPass()); pm.addPass(createCSEPass()); - LogicalResult res = pm.run(getFunction().getParentOfType()); + LogicalResult res = pm.run(getFunction()->getParentOfType()); if (failed(res)) this->signalPassFailure(); } diff --git a/mlir/test/lib/Transforms/TestLoopMapping.cpp b/mlir/test/lib/Transforms/TestLoopMapping.cpp index d373b3933128ad..591fac32698f61 100644 --- a/mlir/test/lib/Transforms/TestLoopMapping.cpp +++ b/mlir/test/lib/Transforms/TestLoopMapping.cpp @@ -43,7 +43,7 @@ class TestLoopMappingPass func.walk([&processorIds, &numProcessors](scf::ForOp op) { // Ignore nested loops. - if (op.getParentRegion()->getParentOfType()) + if (op->getParentRegion()->getParentOfType()) return; mapLoopToProcessorIds(op, processorIds, numProcessors); }); diff --git a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp b/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp index 26491018233542..095a3f1e7ef49a 100644 --- a/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp +++ b/mlir/test/lib/Transforms/TestLoopParametricTiling.cpp @@ -35,7 +35,7 @@ class SimpleParametricLoopTilingPass FuncOp func = getFunction(); func.walk([this](scf::ForOp op) { // Ignore nested loops. 
- if (op.getParentRegion()->getParentOfType()) + if (op->getParentRegion()->getParentOfType()) return; extractFixedOuterLoops(op, sizes); }); diff --git a/mlir/test/mlir-tblgen/op-attribute.td b/mlir/test/mlir-tblgen/op-attribute.td index 171b5f5757782e..e170d9f9fe0acc 100644 --- a/mlir/test/mlir-tblgen/op-attribute.td +++ b/mlir/test/mlir-tblgen/op-attribute.td @@ -71,11 +71,11 @@ def AOp : NS_Op<"a_op", []> { // --- // DEF: void AOp::aAttrAttr(some-attr-kind attr) { -// DEF-NEXT: this->getOperation()->setAttr("aAttr", attr); +// DEF-NEXT: (*this)->setAttr("aAttr", attr); // DEF: void AOp::bAttrAttr(some-attr-kind attr) { -// DEF-NEXT: this->getOperation()->setAttr("bAttr", attr); +// DEF-NEXT: (*this)->setAttr("bAttr", attr); // DEF: void AOp::cAttrAttr(some-attr-kind attr) { -// DEF-NEXT: this->getOperation()->setAttr("cAttr", attr); +// DEF-NEXT: (*this)->setAttr("cAttr", attr); // Test build methods // --- diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index ccfb13fa34363f..04bc10d338d7ca 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -90,7 +90,8 @@ const char *adapterSegmentSizeAttrInitCode = R"( auto sizeAttr = odsAttrs.get("{0}").cast<::mlir::DenseIntElementsAttr>(); )"; const char *opSegmentSizeAttrInitCode = R"( - auto sizeAttr = getAttrOfType<::mlir::DenseIntElementsAttr>("{0}"); + auto sizeAttr = + getOperation()->getAttrOfType<::mlir::DenseIntElementsAttr>("{0}"); )"; const char *attrSizedSegmentValueRangeCalcCode = R"( unsigned start = 0; @@ -614,7 +615,7 @@ void OpEmitter::genAttrSetters() { if (!method) return; auto &body = method->body(); - body << " this->getOperation()->setAttr(\"" << name << "\", attr);"; + body << " (*this)->setAttr(\"" << name << "\", attr);"; }; for (auto &namedAttr : op.getAttributes()) {