diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index 7e07e9d065b37d..65772715095f20 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -51,8 +51,8 @@ template <typename T> struct Located {
       : Range(Range), Value(std::move(Value)) {}

   llvm::SMRange Range;
-  T &operator->() { return Value; }
-  const T &operator->() const { return Value; }
+  T *operator->() { return &Value; }
+  const T *operator->() const { return &Value; }
   T &operator*() { return Value; }
   const T &operator*() const { return Value; }
diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
index af8fe91431c884..87d48adc7f2791 100644
--- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h
@@ -56,14 +56,14 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate {

     /// Called when a virtual register is no longer used. Return false to defer
     /// its deletion from LiveIntervals.
-    virtual bool LRE_CanEraseVirtReg(unsigned) { return true; }
+    virtual bool LRE_CanEraseVirtReg(Register) { return true; }

     /// Called before shrinking the live range of a virtual register.
-    virtual void LRE_WillShrinkVirtReg(unsigned) {}
+    virtual void LRE_WillShrinkVirtReg(Register) {}

     /// Called after cloning a virtual register.
    /// This is used for new registers representing connected components of Old.
-    virtual void LRE_DidCloneVirtReg(unsigned New, unsigned Old) {}
+    virtual void LRE_DidCloneVirtReg(Register New, Register Old) {}
   };

 private:
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 5d5a6efab22077..68fc129cc0eda9 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1737,6 +1737,21 @@ class TargetInstrInfo : public MCInstrInfo {
     return 5;
   }

+  /// Return the maximum number of alias checks permitted on memory operands.
+  /// For instructions with more than one memory operand, the alias check on a
+  /// single MachineInstr pair has quadratic overhead and results in
+  /// unacceptable performance in the worst case. This limit clamps the number
+  /// of checks performed, which is usually the product of the memory-operand
+  /// counts of the MachineInstr pair being checked. For instance, for two
+  /// MachineInstrs with 4 and 5 memory operands respectively, a total of 20
+  /// checks would be required; with this limit set to 16, their alias check
+  /// is skipped and they are conservatively assumed to alias. We limit the
+  /// product rather than each individual instruction because targets may have
+  /// special MachineInstrs with a considerably high number of memory
+  /// operands, such as `ldm` on ARM; a per-MachineInstr limit would impose
+  /// either too much overhead or too rigid a restriction.
+  virtual unsigned getMemOperandAACheckLimit() const { return 16; }
+
   /// Return an array that contains the ids of the target indices (used for the
   /// TargetIndex machine operand) and their names.
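A quick standalone sketch of the budgeting scheme this limit feeds (it is consumed by the MachineInstr::mayAlias rewrite further down). Everything here is illustrative: mayAliasPair, MemOpsA/MemOpsB, and CheckLimit are stand-ins for the per-pair AA query, the instructions' memory-operand lists, and TII->getMemOperandAACheckLimit().

    #include <cstddef>
    #include <vector>

    // Toy per-pair query: here two "operands" alias iff they name the same
    // underlying object. The real code asks AliasAnalysis instead.
    static bool mayAliasPair(int ObjA, int ObjB) { return ObjA == ObjB; }

    // Sketch: the cost of the pairwise walk is the product of the operand
    // counts, so the product (not each count) is what gets clamped.
    static bool mayAlias(const std::vector<int> &MemOpsA,
                         const std::vector<int> &MemOpsB,
                         size_t CheckLimit /* e.g. 16 */) {
      // No memory operands means "may access anything": stay conservative.
      if (MemOpsA.empty() || MemOpsB.empty())
        return true;
      // E.g. 4 * 5 = 20 checks > 16: give up and conservatively say "alias".
      if (MemOpsA.size() * MemOpsB.size() > CheckLimit)
        return true;
      // The instructions are disjoint only if every pair is disjoint.
      for (int A : MemOpsA)
        for (int B : MemOpsB)
          if (mayAliasPair(A, B))
            return true;
      return false;
    }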
/// diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 07f7c948d040a8..975d51c4cd1310 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -153,7 +153,7 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate { unsigned Original); bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot); void hoistAllSpills(); - void LRE_DidCloneVirtReg(unsigned, unsigned) override; + void LRE_DidCloneVirtReg(Register, Register) override; }; class InlineSpiller : public Spiller { @@ -1551,7 +1551,7 @@ void HoistSpillHelper::hoistAllSpills() { /// For VirtReg clone, the \p New register should have the same physreg or /// stackslot as the \p old register. -void HoistSpillHelper::LRE_DidCloneVirtReg(unsigned New, unsigned Old) { +void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) { if (VRM.hasPhys(Old)) VRM.assignVirt2Phys(New, VRM.getPhys(Old)); else if (VRM.getStackSlot(Old) != VirtRegMap::NO_STACK_SLOT) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index d45c53b8181681..fd658cdb41b911 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1276,81 +1276,96 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other, if (TII->areMemAccessesTriviallyDisjoint(*this, Other)) return false; - // FIXME: Need to handle multiple memory operands to support all targets. - if (!hasOneMemOperand() || !Other.hasOneMemOperand()) + // Memory operations without memory operands may access anything. Be + // conservative and assume `MayAlias`. + if (memoperands_empty() || Other.memoperands_empty()) return true; - MachineMemOperand *MMOa = *memoperands_begin(); - MachineMemOperand *MMOb = *Other.memoperands_begin(); - - // The following interface to AA is fashioned after DAGCombiner::isAlias - // and operates with MachineMemOperand offset with some important - // assumptions: - // - LLVM fundamentally assumes flat address spaces. - // - MachineOperand offset can *only* result from legalization and - // cannot affect queries other than the trivial case of overlap - // checking. - // - These offsets never wrap and never step outside - // of allocated objects. - // - There should never be any negative offsets here. - // - // FIXME: Modify API to hide this math from "user" - // Even before we go to AA we can reason locally about some - // memory objects. It can save compile time, and possibly catch some - // corner cases not currently covered. - - int64_t OffsetA = MMOa->getOffset(); - int64_t OffsetB = MMOb->getOffset(); - int64_t MinOffset = std::min(OffsetA, OffsetB); - - uint64_t WidthA = MMOa->getSize(); - uint64_t WidthB = MMOb->getSize(); - bool KnownWidthA = WidthA != MemoryLocation::UnknownSize; - bool KnownWidthB = WidthB != MemoryLocation::UnknownSize; - - const Value *ValA = MMOa->getValue(); - const Value *ValB = MMOb->getValue(); - bool SameVal = (ValA && ValB && (ValA == ValB)); - if (!SameVal) { - const PseudoSourceValue *PSVa = MMOa->getPseudoValue(); - const PseudoSourceValue *PSVb = MMOb->getPseudoValue(); - if (PSVa && ValB && !PSVa->mayAlias(&MFI)) - return false; - if (PSVb && ValA && !PSVb->mayAlias(&MFI)) - return false; - if (PSVa && PSVb && (PSVa == PSVb)) - SameVal = true; - } + // Skip if there are too many memory operands. 
+  auto NumChecks = getNumMemOperands() * Other.getNumMemOperands();
+  if (NumChecks > TII->getMemOperandAACheckLimit())
+    return true;
+
+  auto HasAlias = [MFI, AA, UseTBAA](const MachineMemOperand *MMOa,
+                                     const MachineMemOperand *MMOb) {
+    // The following interface to AA is fashioned after DAGCombiner::isAlias
+    // and operates with MachineMemOperand offset with some important
+    // assumptions:
+    //   - LLVM fundamentally assumes flat address spaces.
+    //   - MachineOperand offset can *only* result from legalization and
+    //     cannot affect queries other than the trivial case of overlap
+    //     checking.
+    //   - These offsets never wrap and never step outside
+    //     of allocated objects.
+    //   - There should never be any negative offsets here.
+    //
+    // FIXME: Modify API to hide this math from "user"
+    // Even before we go to AA we can reason locally about some
+    // memory objects. It can save compile time, and possibly catch some
+    // corner cases not currently covered.
+
+    int64_t OffsetA = MMOa->getOffset();
+    int64_t OffsetB = MMOb->getOffset();
+    int64_t MinOffset = std::min(OffsetA, OffsetB);
+
+    uint64_t WidthA = MMOa->getSize();
+    uint64_t WidthB = MMOb->getSize();
+    bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
+    bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
+
+    const Value *ValA = MMOa->getValue();
+    const Value *ValB = MMOb->getValue();
+    bool SameVal = (ValA && ValB && (ValA == ValB));
+    if (!SameVal) {
+      const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
+      const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
+      if (PSVa && ValB && !PSVa->mayAlias(&MFI))
+        return false;
+      if (PSVb && ValA && !PSVb->mayAlias(&MFI))
+        return false;
+      if (PSVa && PSVb && (PSVa == PSVb))
+        SameVal = true;
+    }
+
+    if (SameVal) {
+      if (!KnownWidthA || !KnownWidthB)
+        return true;
+      int64_t MaxOffset = std::max(OffsetA, OffsetB);
+      int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
+      return (MinOffset + LowWidth > MaxOffset);
+    }
-  if (SameVal) {
-    if (!KnownWidthA || !KnownWidthB)
+    if (!AA)
       return true;
-    int64_t MaxOffset = std::max(OffsetA, OffsetB);
-    int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
-    return (MinOffset + LowWidth > MaxOffset);
-  }
-  if (!AA)
-    return true;
+    if (!ValA || !ValB)
+      return true;
-  if (!ValA || !ValB)
-    return true;
+    assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
+    assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
-  assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
-  assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
+    int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset
+                                   : MemoryLocation::UnknownSize;
+    int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset
+                                   : MemoryLocation::UnknownSize;
-  int64_t OverlapA = KnownWidthA ? WidthA + OffsetA - MinOffset
-                                 : MemoryLocation::UnknownSize;
-  int64_t OverlapB = KnownWidthB ? WidthB + OffsetB - MinOffset
-                                 : MemoryLocation::UnknownSize;
+    AliasResult AAResult =
+        AA->alias(MemoryLocation(ValA, OverlapA,
+                                 UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
+                  MemoryLocation(ValB, OverlapB,
+                                 UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
-  AliasResult AAResult = AA->alias(
-      MemoryLocation(ValA, OverlapA,
-                     UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
-      MemoryLocation(ValB, OverlapB,
-                     UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
+    return (AAResult != NoAlias);
+  };
-  return (AAResult != NoAlias);
+  // Check each pair of memory operands from both instructions; the two
+  // instructions are known not to alias only if no pair of operands may
+  // alias.
+ for (auto *MMOa : memoperands()) + for (auto *MMOb : Other.memoperands()) + if (HasAlias(MMOa, MMOb)) + return true; + + return false; } /// hasOrderedMemoryRef - Return true if this instruction may have an ordered diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp index d49a64b3f141bb..aa749ca43e74fc 100644 --- a/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/llvm/lib/CodeGen/RegAllocBase.cpp @@ -73,7 +73,7 @@ void RegAllocBase::seedLiveRegs() { NamedRegionTimer T("seed", "Seed Live Regs", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned Reg = Register::index2VirtReg(i); + Register Reg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(Reg)) continue; enqueue(&LIS->getInterval(Reg)); @@ -143,7 +143,7 @@ void RegAllocBase::allocatePhysRegs() { if (AvailablePhysReg) Matrix->assign(*VirtReg, AvailablePhysReg); - for (unsigned Reg : SplitVRegs) { + for (Register Reg : SplitVRegs) { assert(LIS->hasInterval(Reg)); LiveInterval *SplitVirtReg = &LIS->getInterval(Reg); diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp index f96dc13132ed0c..8f2cb48c5d69b7 100644 --- a/llvm/lib/CodeGen/RegAllocBasic.cpp +++ b/llvm/lib/CodeGen/RegAllocBasic.cpp @@ -72,8 +72,8 @@ class RABasic : public MachineFunctionPass, // selectOrSplit(). BitVector UsableRegs; - bool LRE_CanEraseVirtReg(unsigned) override; - void LRE_WillShrinkVirtReg(unsigned) override; + bool LRE_CanEraseVirtReg(Register) override; + void LRE_WillShrinkVirtReg(Register) override; public: RABasic(); @@ -146,7 +146,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false, false) -bool RABasic::LRE_CanEraseVirtReg(unsigned VirtReg) { +bool RABasic::LRE_CanEraseVirtReg(Register VirtReg) { LiveInterval &LI = LIS->getInterval(VirtReg); if (VRM->hasPhys(VirtReg)) { Matrix->unassign(LI); @@ -161,7 +161,7 @@ bool RABasic::LRE_CanEraseVirtReg(unsigned VirtReg) { return false; } -void RABasic::LRE_WillShrinkVirtReg(unsigned VirtReg) { +void RABasic::LRE_WillShrinkVirtReg(Register VirtReg) { if (!VRM->hasPhys(VirtReg)) return; diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index ecb9a5a2c53aea..166414e4ffa178 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -441,9 +441,9 @@ class RAGreedy : public MachineFunctionPass, MCRegister selectOrSplitImpl(LiveInterval &, SmallVectorImpl &, SmallVirtRegSet &, unsigned = 0); - bool LRE_CanEraseVirtReg(unsigned) override; - void LRE_WillShrinkVirtReg(unsigned) override; - void LRE_DidCloneVirtReg(unsigned, unsigned) override; + bool LRE_CanEraseVirtReg(Register) override; + void LRE_WillShrinkVirtReg(Register) override; + void LRE_DidCloneVirtReg(Register, Register) override; void enqueue(PQueue &CurQueue, LiveInterval *LI); LiveInterval *dequeue(PQueue &CurQueue); @@ -470,9 +470,9 @@ class RAGreedy : public MachineFunctionPass, bool canEvictInterferenceInRange(LiveInterval &VirtReg, MCRegister PhysReg, SlotIndex Start, SlotIndex End, EvictionCost &MaxCost); - unsigned getCheapestEvicteeWeight(const AllocationOrder &Order, - LiveInterval &VirtReg, SlotIndex Start, - SlotIndex End, float *BestEvictWeight); + MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order, + LiveInterval &VirtReg, SlotIndex Start, + SlotIndex End, float *BestEvictWeight); void evictInterference(LiveInterval &, MCRegister, 
SmallVectorImpl &); bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg, @@ -499,9 +499,10 @@ class RAGreedy : public MachineFunctionPass, SmallVectorImpl &NewVRegs); /// Check other options before using a callee-saved register for the first /// time. - unsigned tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, - Register PhysReg, unsigned &CostPerUseLimit, - SmallVectorImpl &NewVRegs); + MCRegister tryAssignCSRFirstTime(LiveInterval &VirtReg, + AllocationOrder &Order, MCRegister PhysReg, + unsigned &CostPerUseLimit, + SmallVectorImpl &NewVRegs); void initializeCSRCost(); unsigned tryBlockSplit(LiveInterval&, AllocationOrder&, SmallVectorImpl&); @@ -536,7 +537,7 @@ class RAGreedy : public MachineFunctionPass, using HintsInfo = SmallVector; BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister); - void collectHintInfo(unsigned, HintsInfo &); + void collectHintInfo(Register, HintsInfo &); bool isUnusedCalleeSavedReg(MCRegister PhysReg) const; @@ -633,7 +634,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { // LiveRangeEdit delegate methods //===----------------------------------------------------------------------===// -bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { +bool RAGreedy::LRE_CanEraseVirtReg(Register VirtReg) { LiveInterval &LI = LIS->getInterval(VirtReg); if (VRM->hasPhys(VirtReg)) { Matrix->unassign(LI); @@ -648,7 +649,7 @@ bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) { return false; } -void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) { +void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) { if (!VRM->hasPhys(VirtReg)) return; @@ -658,7 +659,7 @@ void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) { enqueue(&LI); } -void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) { +void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) { // Cloning a register we haven't even heard about yet? Just ignore it. if (!ExtraRegInfo.inBounds(Old)) return; @@ -684,9 +685,8 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { // Prioritize live ranges by size, assigning larger ranges first. // The queue holds (size, reg) pairs. 
const unsigned Size = LI->getSize(); - const unsigned Reg = LI->reg(); - assert(Register::isVirtualRegister(Reg) && - "Can only enqueue virtual registers"); + const Register Reg = LI->reg(); + assert(Reg.isVirtual() && "Can only enqueue virtual registers"); unsigned Prio; ExtraRegInfo.grow(Reg); @@ -1026,17 +1026,17 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg, /// \param BestEvictweight The eviction cost of that eviction /// \return The PhysReg which is the best candidate for eviction and the /// eviction cost in BestEvictweight -unsigned RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order, - LiveInterval &VirtReg, - SlotIndex Start, SlotIndex End, - float *BestEvictweight) { +MCRegister RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order, + LiveInterval &VirtReg, + SlotIndex Start, SlotIndex End, + float *BestEvictweight) { EvictionCost BestEvictCost; BestEvictCost.setMax(); BestEvictCost.MaxWeight = VirtReg.weight(); - unsigned BestEvicteePhys = 0; + MCRegister BestEvicteePhys; // Go over all physical registers and find the best candidate for eviction - for (auto PhysReg : Order.getOrder()) { + for (MCRegister PhysReg : Order.getOrder()) { if (!canEvictInterferenceInRange(VirtReg, PhysReg, Start, End, BestEvictCost)) @@ -1498,7 +1498,7 @@ bool RAGreedy::splitCanCauseEvictionChain(Register Evictee, return false; float MaxWeight = 0; - unsigned FutureEvictedPhysReg = + MCRegister FutureEvictedPhysReg = getCheapestEvicteeWeight(Order, LIS->getInterval(Evictee), Cand.Intf.first(), Cand.Intf.last(), &MaxWeight); @@ -1559,7 +1559,7 @@ bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit, // Check if the local interval will evict a cheaper interval. float CheapestEvictWeight = 0; - unsigned FutureEvictedPhysReg = getCheapestEvicteeWeight( + MCRegister FutureEvictedPhysReg = getCheapestEvicteeWeight( Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(), Cand.Intf.last(), &CheapestEvictWeight); @@ -1688,7 +1688,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, // Isolate even single instructions when dealing with a proper sub-class. // That guarantees register class inflation for the stack interval because it // is all copies. - unsigned Reg = SA->getParent().reg(); + Register Reg = SA->getParent().reg(); bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)); // First handle all the blocks with uses. @@ -2051,7 +2051,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, /// Get the number of allocatable registers that match the constraints of \p Reg /// on \p MI and that are also in \p SuperRC. static unsigned getNumAllocatableRegsForConstraints( - const MachineInstr *MI, unsigned Reg, const TargetRegisterClass *SuperRC, + const MachineInstr *MI, Register Reg, const TargetRegisterClass *SuperRC, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, const RegisterClassInfo &RCI) { assert(SuperRC && "Invalid register class"); @@ -2791,11 +2791,10 @@ MCRegister RAGreedy::selectOrSplit(LiveInterval &VirtReg, /// Spilling a live range in the cold path can have lower cost than using /// the CSR for the first time. Returns the physical register if we decide /// to use the CSR; otherwise return 0. 
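An orientation note for these register-allocator hunks, which mostly migrate raw unsigned register numbers to the typed wrappers. A minimal sketch of the llvm::Register API the patch relies on; the describe() helper itself is made up for illustration:

    #include "llvm/CodeGen/Register.h"
    using namespace llvm;

    // Register keeps the old 'unsigned' encoding but makes intent explicit.
    void describe(Register Reg) {
      if (Reg.isVirtual()) {
        // virtReg2Index/index2VirtReg round-trip the dense virtual-register
        // index, as used by seedLiveRegs() in RegAllocBase.cpp above.
        unsigned Idx = Register::virtReg2Index(Reg);
        (void)Idx;
      } else if (Reg.isPhysical()) {
        // asMCReg() is the checked narrowing used in collectHintInfo() below;
        // it would assert on a virtual register.
        MCRegister Phys = Reg.asMCReg();
        (void)Phys;
      }
      // A default-constructed Register or MCRegister means "no register" and
      // converts to false, hence the bare 'MCRegister BestEvicteePhys;' above.
    }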
-unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
-                                         AllocationOrder &Order,
-                                         Register PhysReg,
-                                         unsigned &CostPerUseLimit,
-                                         SmallVectorImpl<Register> &NewVRegs) {
+MCRegister
+RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order,
+                                MCRegister PhysReg, unsigned &CostPerUseLimit,
+                                SmallVectorImpl<Register> &NewVRegs) {
   if (getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) {
     // We choose spill over using the CSR for the first time if the spill cost
     // is lower than CSRCost.
@@ -2860,7 +2859,7 @@ void RAGreedy::initializeCSRCost() {
 /// Collect the hint info for \p Reg.
 /// The results are stored into \p Out.
 /// \p Out is not cleared before being populated.
-void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
+void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
   for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
     if (!Instr.isFullCopy())
       continue;
@@ -2872,9 +2871,8 @@ void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
       continue;
     }
     // Get the current assignment.
-    Register OtherPhysReg = Register::isPhysicalRegister(OtherReg)
-                                ? OtherReg
-                                : Register(VRM->getPhys(OtherReg));
+    MCRegister OtherPhysReg =
+        OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
     // Push the collected information.
     Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
                            OtherPhysReg));
@@ -2906,10 +2904,10 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
   // We have a broken hint, check if it is possible to fix it by
   // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
   // some register and PhysReg may be available for the other live-ranges.
-  SmallSet<unsigned, 4> Visited;
+  SmallSet<Register, 4> Visited;
   SmallVector RecoloringCandidates;
   HintsInfo Info;
-  unsigned Reg = VirtReg.reg();
+  Register Reg = VirtReg.reg();
   MCRegister PhysReg = VRM->getPhys(Reg);
   // Start the recoloring algorithm from the input live-interval, then
   // it will propagate to the ones that are copy-related with it.
@@ -3030,7 +3028,8 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
   // First try assigning a free register.
   auto Order =
       AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix);
-  if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs, FixedRegisters)) {
+  if (MCRegister PhysReg =
+          tryAssign(VirtReg, Order, NewVRegs, FixedRegisters)) {
     // If VirtReg got an assignment, the eviction info is no longre relevant.
    LastEvicted.clearEvicteeInfo(VirtReg.reg());
     // When NewVRegs is not empty, we may have made decisions such as evicting
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 4d610abc3dfe5f..bb2db85a090bb8 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -146,12 +146,6 @@ class RegAllocPBQP : public MachineFunctionPass {
   }

 private:
-  using LI2NodeMap = std::map;
-  using Node2LIMap = std::vector;
-  using AllowedSet = std::vector;
-  using AllowedSetMap = std::vector;
-  using RegPair = std::pair;
-  using CoalesceMap = std::map;
   using RegSet = std::set;

   char *customPassID;
@@ -660,8 +654,9 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
       spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
       Worklist.insert(Worklist.end(), NewVRegs.begin(), NewVRegs.end());
       continue;
-    } else
-      VRegAllowedMap[VReg] = std::move(VRegAllowed);
+    }
+
+    VRegAllowedMap[VReg.id()] = std::move(VRegAllowed);
   }

   for (auto &KV : VRegAllowedMap) {
@@ -774,7 +769,7 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
     if (PReg == 0) {
       const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg());
       const ArrayRef<MCPhysReg> RawPRegOrder = RC.getRawAllocationOrder(MF);
-      for (unsigned CandidateReg : RawPRegOrder) {
+      for (MCRegister CandidateReg : RawPRegOrder) {
         if (!VRM.getRegInfo().isReserved(CandidateReg)) {
           PReg = CandidateReg;
           break;
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index 7258feafb7e909..2b3715d02e9bdf 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -373,6 +373,36 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
 // before visiting the memcpy block (which will contain the lifetime start
 // for "b" then it will appear that 'b' has a degenerate lifetime.
 //
+// Handling Windows exceptions with LifetimeStartOnFirstUse:
+// -----------------
+//
+// There was a bug when using LifetimeStartOnFirstUse on win32:
+//    class Type1 {
+//    ...
+//    ~Type1(){ write memory;}
+//    }
+//    ...
+//    try{
+//    Type1 V
+//    ...
+//    } catch (Type2 X){
+//    ...
+//    }
+// For variable X in catch(X), we put the slot holding pX=&(&X) into
+// ConservativeSlots to prevent using LifetimeStartOnFirstUse, because pX
+// may be merged with object V, whose destructor writes memory after pX has
+// already been implicitly written. All of this happens inside the C++ EH
+// runtime (through CxxThrowException), so it cannot readily be checked at the IR level.
+//
+// The load of pX, which has no corresponding store in the IR, is usually
+// the first load MI in the EH pad, something like:
+//    bb.x.catch.i (landing-pad, ehfunclet-entry):
+//    ; predecessors: %bb...
+//      successors: %bb...
+//      %n:gr32 = MOV32rm %stack.pX ...
+//      ...
+// The Type2** slot %stack.pX will only ever be written by the EH runtime,
+// so we check StoreSlots to screen such slots out.

 namespace {
@@ -434,6 +464,9 @@ class StackColoring : public MachineFunctionPass {
   /// slots lifetime-start-on-first-use is disabled).
   BitVector ConservativeSlots;

+  /// Record the FI slots referenced by a 'may write to memory' instruction.
+  BitVector StoreSlots;
+
   /// Number of iterations taken during data flow analysis.
  unsigned NumIterations;
@@ -629,10 +662,13 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
   InterestingSlots.resize(NumSlot);
   ConservativeSlots.clear();
   ConservativeSlots.resize(NumSlot);
+  StoreSlots.clear();
+  StoreSlots.resize(NumSlot);

   // number of start and end lifetime ops for each slot
   SmallVector NumStartLifetimes(NumSlot, 0);
   SmallVector NumEndLifetimes(NumSlot, 0);
+  SmallVector NumLoadInCatchPad(NumSlot, 0);

   // Step 1: collect markers and populate the "InterestingSlots"
   // and "ConservativeSlots" sets.
@@ -687,6 +723,13 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
         if (! BetweenStartEnd.test(Slot)) {
           ConservativeSlots.set(Slot);
         }
+        // Here we check StoreSlots to screen such catch-object slots out. For
+        // more information, please refer to "Handling Windows exceptions with
+        // LifetimeStartOnFirstUse" at the head of this file.
+        if (MI.mayStore())
+          StoreSlots.set(Slot);
+        if (MF->getWinEHFuncInfo() && MBB->isEHPad() && MI.mayLoad())
+          NumLoadInCatchPad[Slot] += 1;
       }
     }
   }
@@ -697,11 +740,14 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
     return 0;
   }

-  // PR27903: slots with multiple start or end lifetime ops are not
+  // 1) PR27903: slots with multiple start or end lifetime ops are not
   // safe to enable for "lifetime-start-on-first-use".
-  for (unsigned slot = 0; slot < NumSlot; ++slot)
-    if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1)
+  // 2) Slots for variable X in catch(X) on Windows are not safe either.
+  for (unsigned slot = 0; slot < NumSlot; ++slot) {
+    if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1 ||
+        (NumLoadInCatchPad[slot] > 1 && !StoreSlots.test(slot)))
       ConservativeSlots.set(slot);
+  }

   LLVM_DEBUG(dumpBV("Conservative slots", ConservativeSlots));

   // Step 2: compute begin/end sets for each block
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 1e0be249e525ae..560e362b074b58 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -94,6 +94,14 @@ def adjust_icmp_imm : GICombineRule <

 def icmp_lowering : GICombineGroup<[adjust_icmp_imm]>;

+def extractvecelt_pairwise_add_matchdata : GIDefMatchData<"std::tuple<unsigned, LLT, Register>">;
+def extractvecelt_pairwise_add : GICombineRule<
+  (defs root:$root, extractvecelt_pairwise_add_matchdata:$matchinfo),
+  (match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
+         [{ return matchExtractVecEltPairwiseAdd(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyExtractVecEltPairwiseAdd(*${root}, MRI, B, ${matchinfo}); }])
+>;
+
 // Post-legalization combines which should happen at all optimization levels.
 // (E.g. ones that facilitate matching for the selector) For example, matching
 // pseudos.
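Before the registration hunk below: roughly the IR shape that this new combine (mirroring SDAG's performExtractVectorEltCombine) rewrites; the function name is made up for illustration:

    define float @pairwise_fadd(<2 x float> %v) {
      ; Move lane 1 into lane 0, add to the original vector, take lane 0:
      %rev = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
      %sum = fadd <2 x float> %v, %rev
      %r = extractelement <2 x float> %sum, i64 0
      ; After the combine this is extract(0) + extract(1) on %v, which the new
      ; FADDPv2i32p pattern below then selects to a single 'faddp s0, v0.2s'.
      ret float %r
    }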
@@ -110,6 +118,7 @@ def AArch64PostLegalizerCombinerHelper
                        [copy_prop, erase_undef_store, combines_for_extload,
                         sext_trunc_sextload, hoist_logic_op_with_same_opcode_hands,
-                        and_trivial_mask, xor_of_and_with_same_reg]> {
+                        and_trivial_mask, xor_of_and_with_same_reg,
+                        extractvecelt_pairwise_add]> {
   let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index 79b563e345a80c..1bd9ce25125d9b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -135,4 +135,9 @@ def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
-def : GINodeEquiv;
\ No newline at end of file
+def : GINodeEquiv;
+
+// These are patterns that we only use for GlobalISel via the importer.
+def : Pat<(f32 (fadd (vector_extract (v2f32 FPR64:$Rn), (i64 0)),
+                     (vector_extract (v2f32 FPR64:$Rn), (i64 1)))),
+          (f32 (FADDPv2i32p (v2f32 FPR64:$Rn)))>;
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 4f3938852a4007..17520ded4ba73b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -24,8 +24,11 @@
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/Debug.h"

@@ -33,6 +36,74 @@

 using namespace llvm;

+/// This combine tries to do what performExtractVectorEltCombine does in SDAG.
+/// Rewrite for pairwise fadd pattern
+///   (s32 (g_extract_vector_elt
+///           (g_fadd (vXs32 Other)
+///                   (g_vector_shuffle (vXs32 Other) undef <1,X,...> )) 0))
+/// ->
+///   (s32 (g_fadd (g_extract_vector_elt (vXs32 Other) 0)
+///                (g_extract_vector_elt (vXs32 Other) 1))
+bool matchExtractVecEltPairwiseAdd(
+    MachineInstr &MI, MachineRegisterInfo &MRI,
+    std::tuple<unsigned, LLT, Register> &MatchInfo) {
+  Register Src1 = MI.getOperand(1).getReg();
+  Register Src2 = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  auto Cst = getConstantVRegValWithLookThrough(Src2, MRI);
+  if (!Cst || Cst->Value != 0)
+    return false;
+  // SDAG also checks for FullFP16, but this looks to be beneficial anyway.
+
+  // Now check for an fadd operation. TODO: expand this for integer add?
+  auto *FAddMI = getOpcodeDef(TargetOpcode::G_FADD, Src1, MRI);
+  if (!FAddMI)
+    return false;
+
+  // If we add support for integer add, we must restrict these types to s64.
+  unsigned DstSize = DstTy.getSizeInBits();
+  if (DstSize != 16 && DstSize != 32 && DstSize != 64)
+    return false;
+
+  Register Src1Op1 = FAddMI->getOperand(1).getReg();
+  Register Src1Op2 = FAddMI->getOperand(2).getReg();
+  MachineInstr *Shuffle =
+      getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op2, MRI);
+  MachineInstr *Other = MRI.getVRegDef(Src1Op1);
+  if (!Shuffle) {
+    Shuffle = getOpcodeDef(TargetOpcode::G_SHUFFLE_VECTOR, Src1Op1, MRI);
+    Other = MRI.getVRegDef(Src1Op2);
+  }
+
+  // We're looking for a shuffle that moves the second element to index 0.
+  if (Shuffle && Shuffle->getOperand(3).getShuffleMask()[0] == 1 &&
+      Other == MRI.getVRegDef(Shuffle->getOperand(1).getReg())) {
+    std::get<0>(MatchInfo) = TargetOpcode::G_FADD;
+    std::get<1>(MatchInfo) = DstTy;
+    std::get<2>(MatchInfo) = Other->getOperand(0).getReg();
+    return true;
+  }
+  return false;
+}
+
+bool applyExtractVecEltPairwiseAdd(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+    std::tuple<unsigned, LLT, Register> &MatchInfo) {
+  unsigned Opc = std::get<0>(MatchInfo);
+  assert(Opc == TargetOpcode::G_FADD && "Unexpected opcode!");
+  // We want to generate two extracts of elements 0 and 1, and add them.
+  LLT Ty = std::get<1>(MatchInfo);
+  Register Src = std::get<2>(MatchInfo);
+  LLT s64 = LLT::scalar(64);
+  B.setInstrAndDebugLoc(MI);
+  auto Elt0 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 0));
+  auto Elt1 = B.buildExtractVectorElement(Ty, Src, B.buildConstant(s64, 1));
+  B.buildInstr(Opc, {MI.getOperand(0).getReg()}, {Elt0, Elt1});
+  MI.eraseFromParent();
+  return true;
+}
+
 #define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AArch64GenPostLegalizeGICombiner.inc"
 #undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extractvec-faddp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extractvec-faddp.mir
new file mode 100644
index 00000000000000..790634563068a0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extractvec-faddp.mir
@@ -0,0 +1,188 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: f64_faddp
+alignment: 4
+legalized: true
+tracksRegLiveness: true
+liveins:
+  - { reg: '$q0' }
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: f64_faddp
+    ; CHECK: liveins: $q0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C]](s64)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C1]](s64)
+    ; CHECK: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[EVEC]], [[EVEC1]]
+    ; CHECK: $d0 = COPY [[FADD]](s64)
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:_(<2 x s64>) = COPY $q0
+    %2:_(<2 x s64>) = G_IMPLICIT_DEF
+    %5:_(s64) = G_CONSTANT i64 0
+    %1:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(1, undef)
+    %3:_(<2 x s64>) = G_FADD %1, %0
+    %4:_(s64) = G_EXTRACT_VECTOR_ELT %3(<2 x s64>), %5(s64)
+    $d0 = COPY %4(s64)
+    RET_ReallyLR implicit $d0
+
+...
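As the NOTE line in this new test says, the CHECK lines are autogenerated; when the combine's output changes they are regenerated rather than hand-edited. A typical invocation looks like this (assuming a local build directory named build):

    $ llvm/utils/update_mir_test_checks.py --llc-binary build/bin/llc \
        llvm/test/CodeGen/AArch64/GlobalISel/postlegalizercombiner-extractvec-faddp.mir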
+--- +name: f64_faddp_commuted +alignment: 4 +legalized: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: f64_faddp_commuted + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C]](s64) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s64>), [[C1]](s64) + ; CHECK: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[EVEC]], [[EVEC1]] + ; CHECK: $d0 = COPY [[FADD]](s64) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<2 x s64>) = COPY $q0 + %2:_(<2 x s64>) = G_IMPLICIT_DEF + %5:_(s64) = G_CONSTANT i64 0 + %1:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(1, undef) + %3:_(<2 x s64>) = G_FADD %0, %1 + %4:_(s64) = G_EXTRACT_VECTOR_ELT %3(<2 x s64>), %5(s64) + $d0 = COPY %4(s64) + RET_ReallyLR implicit $d0 + +... +--- +name: f32_faddp +alignment: 4 +legalized: true +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: f32_faddp + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C1]](s64) + ; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[EVEC]], [[EVEC1]] + ; CHECK: $s0 = COPY [[FADD]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %0:_(<2 x s32>) = COPY $d0 + %2:_(<2 x s32>) = G_IMPLICIT_DEF + %5:_(s64) = G_CONSTANT i64 0 + %1:_(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(1, undef) + %3:_(<2 x s32>) = G_FADD %1, %0 + %4:_(s32) = G_EXTRACT_VECTOR_ELT %3(<2 x s32>), %5(s64) + $s0 = COPY %4(s32) + RET_ReallyLR implicit $s0 + +... +--- +name: f32_faddp_commuted +alignment: 4 +legalized: true +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: f32_faddp_commuted + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C1]](s64) + ; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[EVEC]], [[EVEC1]] + ; CHECK: $s0 = COPY [[FADD]](s32) + ; CHECK: RET_ReallyLR implicit $s0 + %0:_(<2 x s32>) = COPY $d0 + %2:_(<2 x s32>) = G_IMPLICIT_DEF + %5:_(s64) = G_CONSTANT i64 0 + %1:_(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %2, shufflemask(1, undef) + %3:_(<2 x s32>) = G_FADD %0, %1 + %4:_(s32) = G_EXTRACT_VECTOR_ELT %3(<2 x s32>), %5(s64) + $s0 = COPY %4(s32) + RET_ReallyLR implicit $s0 + +... 
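For context, the kind of source that bottoms out in this pattern is an ordinary horizontal add of a two-lane vector. A small C++ sketch using ACLE NEON intrinsics, compiled for AArch64 at -O2; the function name is illustrative:

    #include <arm_neon.h>

    // Summing both lanes of a float32x2_t is exactly the extract(0) + extract(1)
    // shape the combine produces, so it can now select to 'faddp s0, v0.2s'.
    float horizontal_add(float32x2_t V) {
      return vget_lane_f32(V, 0) + vget_lane_f32(V, 1);
    }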
+--- +name: wrong_extract_idx +alignment: 4 +legalized: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: wrong_extract_idx + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s64>), [[DEF]], shufflemask(1, undef) + ; CHECK: [[FADD:%[0-9]+]]:_(<2 x s64>) = G_FADD [[SHUF]], [[COPY]] + ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[FADD]](<2 x s64>), [[C]](s64) + ; CHECK: $d0 = COPY [[EVEC]](s64) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<2 x s64>) = COPY $q0 + %2:_(<2 x s64>) = G_IMPLICIT_DEF + %5:_(s64) = G_CONSTANT i64 1 + %1:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(1, undef) + %3:_(<2 x s64>) = G_FADD %1, %0 + %4:_(s64) = G_EXTRACT_VECTOR_ELT %3(<2 x s64>), %5(s64) + $d0 = COPY %4(s64) + RET_ReallyLR implicit $d0 + +... +--- +name: wrong_shuffle_mask +alignment: 4 +legalized: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: wrong_shuffle_mask + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s64>), [[DEF]], shufflemask(0, undef) + ; CHECK: [[FADD:%[0-9]+]]:_(<2 x s64>) = G_FADD [[SHUF]], [[COPY]] + ; CHECK: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[FADD]](<2 x s64>), [[C]](s64) + ; CHECK: $d0 = COPY [[EVEC]](s64) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<2 x s64>) = COPY $q0 + %2:_(<2 x s64>) = G_IMPLICIT_DEF + %5:_(s64) = G_CONSTANT i64 0 + %1:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(<2 x s64>), %2, shufflemask(0, undef) + %3:_(<2 x s64>) = G_FADD %1, %0 + %4:_(s64) = G_EXTRACT_VECTOR_ELT %3(<2 x s64>), %5(s64) + $d0 = COPY %4(s64) + RET_ReallyLR implicit $d0 + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-faddp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-faddp.mir new file mode 100644 index 00000000000000..770630851d1b2b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-faddp.mir @@ -0,0 +1,62 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=instruction-select %s -o - | FileCheck %s +--- +name: f64_faddp +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +frameInfo: + maxAlignment: 1 +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: f64_faddp + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK: [[FADDPv2i64p:%[0-9]+]]:fpr64 = FADDPv2i64p [[COPY]] + ; CHECK: $d0 = COPY [[FADDPv2i64p]] + ; CHECK: RET_ReallyLR implicit $d0 + %0:fpr(<2 x s64>) = COPY $q0 + %6:gpr(s64) = G_CONSTANT i64 0 + %7:fpr(s64) = G_EXTRACT_VECTOR_ELT %0(<2 x s64>), %6(s64) + %8:gpr(s64) = G_CONSTANT i64 1 + %9:fpr(s64) = G_EXTRACT_VECTOR_ELT %0(<2 x s64>), %8(s64) + %4:fpr(s64) = G_FADD %7, %9 + $d0 = COPY %4(s64) + RET_ReallyLR implicit $d0 + +... 
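For reference, the two instructions the selection tests in this file expect are the scalar pairwise-add forms of FADDP, roughly:

    faddp   d0, v0.2d    // FADDPv2i64p: d0 = v0.d[0] + v0.d[1]
    faddp   s0, v0.2s    // FADDPv2i32p: s0 = v0.s[0] + v0.s[1]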
+--- +name: f32_faddp +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +frameInfo: + maxAlignment: 1 +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: f32_faddp + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK: [[FADDPv2i32p:%[0-9]+]]:fpr32 = FADDPv2i32p [[COPY]] + ; CHECK: $s0 = COPY [[FADDPv2i32p]] + ; CHECK: RET_ReallyLR implicit $s0 + %0:fpr(<2 x s32>) = COPY $d0 + %6:gpr(s64) = G_CONSTANT i64 0 + %7:fpr(s32) = G_EXTRACT_VECTOR_ELT %0(<2 x s32>), %6(s64) + %8:gpr(s64) = G_CONSTANT i64 1 + %9:fpr(s32) = G_EXTRACT_VECTOR_ELT %0(<2 x s32>), %8(s64) + %4:fpr(s32) = G_FADD %7, %9 + $s0 = COPY %4(s32) + RET_ReallyLR implicit $s0 + +... diff --git a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll index 6850846fec0686..3a768d0e3f9b49 100644 --- a/llvm/test/CodeGen/AArch64/merge-store-dependency.ll +++ b/llvm/test/CodeGen/AArch64/merge-store-dependency.ll @@ -19,11 +19,11 @@ define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg ; A53-NEXT: mov x19, x8 ; A53-NEXT: mov w0, w1 ; A53-NEXT: mov w9, #256 +; A53-NEXT: stp x2, x3, [x8, #32] +; A53-NEXT: mov x2, x8 ; A53-NEXT: str q0, [x19, #16]! ; A53-NEXT: str w1, [x19] ; A53-NEXT: mov w1, #4 -; A53-NEXT: stp x2, x3, [x8, #32] -; A53-NEXT: mov x2, x8 ; A53-NEXT: str q0, [x8] ; A53-NEXT: strh w9, [x8, #24] ; A53-NEXT: str wzr, [x8, #20] diff --git a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll index 9942d6df99a4b8..693f33553591ab 100644 --- a/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll +++ b/llvm/test/CodeGen/ARM/big-endian-neon-fp16-bitconv.ll @@ -503,12 +503,12 @@ define void @conv_v8f16_to_i128( <8 x half> %a, i128* %store ) { ; CHECK-NEXT: vmov.32 r3, d16[1] ; CHECK-NEXT: vmov.32 r1, d16[0] ; CHECK-NEXT: subs r12, r12, #1 +; CHECK-NEXT: str r12, [r0, #12] ; CHECK-NEXT: sbcs r2, r2, #0 +; CHECK-NEXT: str r2, [r0, #8] ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: sbc r1, r1, #0 ; CHECK-NEXT: stm r0, {r1, r3} -; CHECK-NEXT: str r2, [r0, #8] -; CHECK-NEXT: str r12, [r0, #12] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index a43f564951e93d..4fe8877aa8bd46 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1094,6 +1094,7 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: ldrd r11, r8, [r12, #24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 ; CHECK-NEXT: vldrw.u32 q0, [r5], #32 +; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill ; CHECK-NEXT: vldrw.u32 q1, [r5, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r7 ; CHECK-NEXT: vldrw.u32 q6, [r5, #-24] @@ -1105,13 +1106,12 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: vfma.f32 q0, q4, r6 ; CHECK-NEXT: vldrw.u32 q3, [r5, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r3 -; CHECK-NEXT: vldrw.u32 q1, [r5, #-4] -; CHECK-NEXT: vfma.f32 q0, q2, lr ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vfma.f32 q0, q2, lr +; CHECK-NEXT: vldrw.u32 q1, [r5, #-4] ; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill -; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: blo .LBB16_7 ; 
CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index e7d6a7323bc1e3..0fe26fbc475346 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -168,16 +168,14 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov q1, q4 ; CHECK-NEXT: vmov s1, r7 ; CHECK-NEXT: vmov.32 q1[1], r6 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: vmov.32 q1[2], r5 ; CHECK-NEXT: vmov.32 q5[0], r7 +; CHECK-NEXT: vmov.32 q1[2], r5 +; CHECK-NEXT: vmov s9, r4 ; CHECK-NEXT: vmov.32 q1[3], r4 -; CHECK-NEXT: strd r0, r10, [sp, #24] +; CHECK-NEXT: vdup.32 q6, r7 ; CHECK-NEXT: vstrw.32 q1, [sp, #76] ; CHECK-NEXT: vmov q1, q5 -; CHECK-NEXT: vmov s9, r4 ; CHECK-NEXT: vmov.32 q1[1], r7 -; CHECK-NEXT: vdup.32 q6, r7 ; CHECK-NEXT: vmov.f32 s2, s1 ; CHECK-NEXT: vmov.f32 s8, s0 ; CHECK-NEXT: vmov.32 q1[2], r6 @@ -185,6 +183,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov q7, q6 ; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: mov.w r8, #4 +; CHECK-NEXT: mov.w r10, #0 ; CHECK-NEXT: vmov.32 q1[3], r4 ; CHECK-NEXT: vmov.32 q3[0], r4 ; CHECK-NEXT: vmov.32 q7[1], r4 @@ -192,6 +191,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov.f32 s11, s3 ; CHECK-NEXT: movs r1, #64 ; CHECK-NEXT: strh.w r8, [sp, #390] +; CHECK-NEXT: strd r0, r10, [sp, #24] ; CHECK-NEXT: vstrw.32 q0, [sp, #44] ; CHECK-NEXT: str r0, [r0] ; CHECK-NEXT: vstrw.32 q2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index 600c5279ca9173..1ae74c1738c79f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -24,8 +24,8 @@ define void @vst3_v2i32(<2 x i32> *%src, <6 x i32> *%dst) { ; CHECK-NEXT: vmov.f32 s9, s6 ; CHECK-NEXT: vmov.f32 s10, s0 ; CHECK-NEXT: vmov.f32 s11, s5 -; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: strd r2, r0, [r1, #16] ; CHECK-NEXT: pop {r4, pc} entry: %s1 = getelementptr <2 x i32>, <2 x i32>* %src, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll index ac1c814b838ea7..f57c9226179b5e 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -8,17 +8,17 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: .pad #44 ; THUMBV7-NEXT: sub sp, #44 -; THUMBV7-NEXT: ldrd r4, r7, [sp, #88] -; THUMBV7-NEXT: mov r5, r3 ; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill ; THUMBV7-NEXT: movs r0, #0 -; THUMBV7-NEXT: strd r4, r7, [sp] -; THUMBV7-NEXT: mov r1, r3 +; THUMBV7-NEXT: ldrd r4, r7, [sp, #88] +; THUMBV7-NEXT: mov r5, r3 ; THUMBV7-NEXT: strd r0, r0, [sp, #8] +; THUMBV7-NEXT: mov r1, r3 ; THUMBV7-NEXT: mov r6, r2 ; THUMBV7-NEXT: mov r0, r2 ; THUMBV7-NEXT: movs r2, #0 ; THUMBV7-NEXT: movs r3, #0 +; THUMBV7-NEXT: strd r4, r7, [sp] ; THUMBV7-NEXT: bl __multi3 ; THUMBV7-NEXT: strd r1, r0, [sp, #32] @ 8-byte Folded Spill ; THUMBV7-NEXT: strd r3, r2, [sp, #24] @ 8-byte Folded Spill diff --git a/llvm/test/CodeGen/X86/pr48064.mir b/llvm/test/CodeGen/X86/pr48064.mir new file mode 100644 index 00000000000000..8ddfdec9b5903a --- /dev/null +++ b/llvm/test/CodeGen/X86/pr48064.mir @@ -0,0 +1,435 @@ +# RUN: llc -mtriple="i386-pc-windows-msvc" -run-pass=stack-coloring %s -o - | FileCheck %s + +# There is a 
problem with the exception handler that we found on Windows when
+# LifetimeStartOnFirstUse=true is enabled by default for stack coloring. Take
+# the following case as an example:
+#
+#// Compile with "clang-cl -m32 -O2 -EHs test.cpp"
+#__attribute__((noinline,nothrow,weak)) void escape(int *p) { }
+#struct object {
+#  int i;
+#  object() {
+#    i = 1;
+#  }
+#  ~object() {
+#    // If "object" and "exp" are assigned to the same slot,
+#    // this assignment will corrupt "exp".
+#    i = 9999;
+#    escape(&i);
+#  }
+#};
+#inline void throwit() { throw 999; }
+#
+#volatile int v;
+#inline void func() {
+#  try {
+#    object o;
+#    throwit();
+#  }
+#  // "exp" is written by the OS when the "throw" occurs.
+#  // Then the destructor is called, and its store
+#  // clobbers the value of "exp".
+#  // The dereference of "exp" (with value 9999) causes a crash.
+#  // All of this is done in the runtime library, so it is hard to see at
+#  // the IR level.
+#  catch (int &exp) {
+#    v = exp;
+#  }
+#}
+#
+#int main() {
+#  func();
+#  return 0;
+#}
+
+## Make sure that o.i does not get merged with exp.i
+# CHECK: stack:
+# CHECK: id: 2, name: o.i, type: default, offset: 0, size: 4, alignment: 4,
+# CHECK: id: 3, name: exp.i, type: default, offset: 0, size: 4, alignment: 4,
+
+## Make sure that %stack.3.exp.i is not replaced with %stack.2.o.i
+# CHECK: bb.3.catch.i (landing-pad, ehfunclet-entry):
+# CHECK: %7:gr32 = MOV32rm %stack.3.exp.i, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.exp.i)
+
+--- |
+  ; ModuleID = 'test-pre-stc.mir'
+  source_filename = "test.cpp"
+  target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"
+
+  %rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+  %eh.CatchableType = type { i32, i8*, i32, i32, i32, i32, i8* }
+  %eh.CatchableTypeArray.1 = type { i32, [1 x %eh.CatchableType*] }
+  %eh.ThrowInfo = type { i32, i8*, i8*, i8* }
+  %CXXExceptionRegistration = type { i8*, %EHRegistrationNode, i32 }
+  %EHRegistrationNode = type { %EHRegistrationNode*, i8* }
+  %struct.object = type { i32 }
+
+  $"_R0H@8" = comdat any
+
+  $"_CT_R0H@84" = comdat any
+
+  $_CTA1H = comdat any
+
+  $_TI1H = comdat any
+
+  @v__3HC = dso_local global i32 0, align 4
+  @"_7type_info__6B@" = external constant i8*
+  @"_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"_7type_info__6B@", i8* null, [3 x i8] c".H\00" }, comdat
+  @"_CT_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i8* bitcast (%rtti.TypeDescriptor2* @"_R0H@8" to i8*), i32 0, i32 -1, i32 0, i32 4, i8* null }, section ".xdata", comdat
+  @_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x %eh.CatchableType*] [%eh.CatchableType* @"_CT_R0H@84"] }, section ".xdata", comdat
+  @_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i8* null, i8* null, i8* bitcast (%eh.CatchableTypeArray.1* @_CTA1H to i8*) }, section ".xdata", comdat
+
+  ; Function Attrs: noinline nounwind sspstrong
+  define weak dso_local void @"escape__YAXPAH@Z"(i32* %p) local_unnamed_addr #0 {
+  entry:
+    ret void
+  }
+
+  ; Function Attrs: norecurse sspstrong
+  define dso_local i32 @main() local_unnamed_addr #1 personality i32 (...)* @__CxxFrameHandler3 {
+  entry:
+    %0 = alloca %CXXExceptionRegistration, align 4
+    %1 = bitcast %CXXExceptionRegistration* %0 to i8*
+    call void @llvm.x86.seh.ehregnode(i8* %1)
+    %2 = call i8* @llvm.stacksave()
+    %3 = getelementptr inbounds %CXXExceptionRegistration, %CXXExceptionRegistration* %0, i32 0, i32 0
+    store i8* %2, i8** %3, align 4
+    %4 = getelementptr inbounds %CXXExceptionRegistration,
%CXXExceptionRegistration* %0, i32 0, i32 2 + store i32 -1, i32* %4, align 4 + %5 = getelementptr inbounds %CXXExceptionRegistration, %CXXExceptionRegistration* %0, i32 0, i32 1 + %6 = getelementptr inbounds %EHRegistrationNode, %EHRegistrationNode* %5, i32 0, i32 1 + store i8* bitcast (i32 (i8*, i8*, i8*, i8*)* @"__ehhandler$main" to i8*), i8** %6, align 4 + %7 = load %EHRegistrationNode*, %EHRegistrationNode* addrspace(257)* null, align 4 + %8 = getelementptr inbounds %EHRegistrationNode, %EHRegistrationNode* %5, i32 0, i32 0 + store %EHRegistrationNode* %7, %EHRegistrationNode** %8, align 4 + store %EHRegistrationNode* %5, %EHRegistrationNode* addrspace(257)* null, align 4 + %tmp.i.i = alloca i32, align 4 + %o.i = alloca %struct.object, align 4 + %zx = alloca i32*, align 4 + %exp.i = alloca i32*, align 4 + %9 = bitcast i32** %exp.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) + %10 = bitcast %struct.object* %o.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %10) #7 + %i.i.i1 = bitcast %struct.object* %o.i to i32* + store i32 1, i32* %i.i.i1, align 4 + %11 = bitcast i32* %tmp.i.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) + store i32 999, i32* %tmp.i.i, align 4 + %12 = getelementptr inbounds %CXXExceptionRegistration, %CXXExceptionRegistration* %0, i32 0, i32 2 + store i32 1, i32* %12, align 4 + invoke void @_CxxThrowException(i8* nonnull %11, %eh.ThrowInfo* nonnull @_TI1H) #8 + to label %.noexc.i unwind label %ehcleanup.i + + .noexc.i: ; preds = %entry + unreachable + + ehcleanup.i: ; preds = %entry + %13 = cleanuppad within none [] + %14 = bitcast %struct.object* %o.i to i32* + %15 = bitcast %struct.object* %o.i to i8* + store i32 9999, i32* %14, align 4 + call void @"escape__YAXPAH@Z"(i32* nonnull %14) #7 [ "funclet"(token %13) ] + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %15) #7 + cleanupret from %13 unwind label %catch.dispatch.i + + catch.dispatch.i: ; preds = %ehcleanup.i + %16 = catchswitch within none [label %catch.i] unwind to caller + + catch.i: ; preds = %catch.dispatch.i + %17 = catchpad within %16 [%rtti.TypeDescriptor2* @"_R0H@8", i32 8, i32** %exp.i] + %18 = load i32*, i32** %exp.i, align 4 + %19 = load i32, i32* %18, align 4 + store atomic volatile i32 %19, i32* @v__3HC release, align 4 + catchret from %17 to label %func__YAXXZ.exit + + func__YAXXZ.exit: ; preds = %catch.i + %20 = bitcast i32** %exp.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %20) + %21 = getelementptr inbounds %CXXExceptionRegistration, %CXXExceptionRegistration* %0, i32 0, i32 1 + %22 = getelementptr inbounds %EHRegistrationNode, %EHRegistrationNode* %21, i32 0, i32 0 + %23 = load %EHRegistrationNode*, %EHRegistrationNode** %22, align 4 + store %EHRegistrationNode* %23, %EHRegistrationNode* addrspace(257)* null, align 4 + ret i32 0 + } + + ; Function Attrs: argmemonly nofree nosync nounwind willreturn + declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + + ; Function Attrs: nofree + declare dso_local i32 @__CxxFrameHandler3(...) #3 + + ; Function Attrs: argmemonly nofree nosync nounwind willreturn + declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + + ; Function Attrs: nofree + declare dso_local x86_stdcallcc void @_CxxThrowException(i8*, %eh.ThrowInfo*) local_unnamed_addr #3 + + declare i32 @_setjmp3(i8*, i32, ...) 
+ + ; Function Attrs: nofree nosync nounwind willreturn + declare i8* @llvm.stacksave() #4 + + define internal i32 @"__ehhandler$main"(i8* %0, i8* %1, i8* %2, i8* %3) #5 { + entry: + %4 = call i8* @llvm.x86.seh.lsda(i8* bitcast (i32 ()* @main to i8*)) + %5 = tail call i32 bitcast (i32 (...)* @__CxxFrameHandler3 to i32 (i8*, i8*, i8*, i8*, i8*)*)(i8* inreg %4, i8* %0, i8* %1, i8* %2, i8* %3) + ret i32 %5 + } + + ; Function Attrs: nounwind readnone + declare i8* @llvm.x86.seh.lsda(i8*) #6 + + declare x86_stdcallcc void @__CxxLongjmpUnwind(i8*) + + ; Function Attrs: nounwind + declare void @llvm.x86.seh.ehregnode(i8*) #7 + + attributes #0 = { noinline nounwind sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { norecurse sspstrong "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #2 = { argmemonly nofree nosync nounwind willreturn } + attributes #3 = { nofree } + attributes #4 = { nofree nosync nounwind willreturn } + attributes #5 = { "safeseh" } + attributes #6 = { nounwind readnone } + attributes #7 = { nounwind } + attributes #8 = { noreturn } + + !llvm.linker.options = !{!0, !1, !2} + !llvm.module.flags = !{!3, !4} + !llvm.ident = !{!5} + + !0 = !{!"/DEFAULTLIB:libcmt.lib"} + !1 = !{!"/DEFAULTLIB:libmmt.lib"} + !2 = !{!"/DEFAULTLIB:oldnames.lib"} + !3 = !{i32 1, !"NumRegisterParameters", i32 0} + !4 = !{i32 1, !"wchar_size", i32 2} + !5 = !{!"Intel(R) oneAPI DPC++ Compiler Pro 2021.1 (YYYY.x.0.MMDD)"} + +... +--- +name: 'escape__YAXPAH@Z' +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: [] +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + RET 0 + +... 
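The screening that makes this test pass is the collectMarkers() change shown earlier. Condensed, with the names from that patch: a slot is forced conservative when it has multiple lifetime markers, or when it is loaded in a catch pad without any visible store, i.e. when it is written behind the compiler's back by the EH runtime:

    // Disable lifetime-start-on-first-use for a slot when either:
    //  1) it has multiple lifetime start/end markers (PR27903), or
    //  2) it is loaded in a catch pad but never visibly stored to -- on
    //     Windows the C++ EH runtime writes such catch-object slots directly.
    for (unsigned slot = 0; slot < NumSlot; ++slot)
      if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1 ||
          (NumLoadInCatchPad[slot] > 1 && !StoreSlots.test(slot)))
        ConservativeSlots.set(slot);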
+---
+name: main
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers:
+  - { id: 0, class: gr32, preferred-register: '' }
+  - { id: 1, class: gr32, preferred-register: '' }
+  - { id: 2, class: gr32, preferred-register: '' }
+  - { id: 3, class: gr32, preferred-register: '' }
+  - { id: 4, class: gr32, preferred-register: '' }
+  - { id: 5, class: gr32, preferred-register: '' }
+  - { id: 6, class: gr32, preferred-register: '' }
+  - { id: 7, class: gr32, preferred-register: '' }
+  - { id: 8, class: gr32, preferred-register: '' }
+  - { id: 9, class: gr32, preferred-register: '' }
+  - { id: 10, class: gr32, preferred-register: '' }
+liveins: []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: false
+  hasCalls: true
+  stackProtector: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, name: zx, type: default, offset: 0, size: 16, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: tmp.i.i, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: o.i, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: exp.i, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    successors: %bb.1(0x7ffff800), %bb.2(0x00000800)
+
+    %0:gr32 = COPY $esp
+    MOV32mr %stack.0.zx, 1, $noreg, 0, $noreg, %0 :: (store 4 into %ir.3)
+    MOV32mi %stack.0.zx, 1, $noreg, 12, $noreg, -1 :: (store 4 into %ir.4)
+    %1:gr32 = nuw LEA32r %stack.0.zx, 1, $noreg, 4, $noreg
+    MOV32mi %stack.0.zx, 1, $noreg, 8, $noreg, @"__ehhandler$main" :: (store 4 into %ir.6)
+    %2:gr32 = MOV32rm $noreg, 1, $noreg, 0, $fs :: (load 4 from `%EHRegistrationNode* addrspace(257)* null`, addrspace 257)
+    MOV32mr %stack.0.zx, 1, $noreg, 4, $noreg, killed %2 :: (store 4 into %ir.8)
+    MOV32mr $noreg, 1, $noreg, 0, $fs, killed %1 :: (store 4 into `%EHRegistrationNode* addrspace(257)* null`, addrspace 257)
+    MOV32mi %stack.2.o.i, 1, $noreg, 0, $noreg, 1 :: (store 4 into %ir.i.i.i1)
+    MOV32mi %stack.1.tmp.i.i, 1, $noreg, 0, $noreg, 999 :: (store 4 into %ir.tmp.i.i)
+    MOV32mi %stack.0.zx, 1, $noreg, 12, $noreg, 1 :: (store 4 into %ir.12)
+    ADJCALLSTACKDOWN32 8, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    %3:gr32 = COPY $esp
+    %4:gr32 = LEA32r %stack.1.tmp.i.i, 1, $noreg, 0, $noreg
+    MOV32mr %3, 1, $noreg, 0, $noreg, killed %4 :: (store 4 into stack)
+    MOV32mi %3, 1, $noreg, 4, $noreg, @_TI1H :: (store 4 into stack + 4)
+    CALLpcrel32 @_CxxThrowException, csr_noregs, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp
+    ADJCALLSTACKUP32 8, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    JMP_1 %bb.1
+
+  bb.1..noexc.i:
+    successors:
+
+  bb.2.ehcleanup.i (landing-pad, ehfunclet-entry):
+    successors: %bb.3(0x80000000)
+
+    MOV32mi %stack.2.o.i, 1, $noreg, 0, $noreg, 9999 :: (store 4 into %ir.14)
+    ADJCALLSTACKDOWN32 4, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    %5:gr32 = COPY $esp
+    %6:gr32 = LEA32r %stack.2.o.i, 1, $noreg, 0, $noreg
+    MOV32mr %5, 1, $noreg, 0, $noreg, killed %6 :: (store 4 into stack)
+    CALLpcrel32 @"escape__YAXPAH@Z", csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp
+    ADJCALLSTACKUP32 4, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    CLEANUPRET
+
+  bb.3.catch.i (landing-pad, ehfunclet-entry):
+    successors: %bb.4(0x80000000)
+
+    %7:gr32 = MOV32rm %stack.3.exp.i, 1, $noreg, 0, $noreg :: (dereferenceable load 4 from %ir.exp.i)
+    %8:gr32 = MOV32rm killed %7, 1, $noreg, 0, $noreg :: (load 4 from %ir.18)
+    MOV32mr $noreg, 1, $noreg, @v__3HC, $noreg, killed %8 :: (volatile store release 4 into @v__3HC)
+    CATCHRET %bb.4, %bb.0
+
+  bb.4.catch.i (landing-pad):
+    successors: %bb.5(0x80000000)
+
+    JMP_4 %bb.5
+
+  bb.5.func__YAXXZ.exit:
+    %9:gr32 = MOV32rm %stack.0.zx, 1, $noreg, 4, $noreg :: (dereferenceable load 4 from %ir.22)
+    MOV32mr $noreg, 1, $noreg, 0, $fs, killed %9 :: (store 4 into `%EHRegistrationNode* addrspace(257)* null`, addrspace 257)
+    %10:gr32 = MOV32r0 implicit-def dead $eflags
+    $eax = COPY %10
+    RET 0, $eax
+
+...
+---
+name: '__ehhandler$main'
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers:
+  - { id: 0, class: gr32, preferred-register: '' }
+liveins: []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+  - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default,
+      isImmutable: false, isAliased: false, callee-saved-register: '',
+      callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, type: default, offset: 4, size: 4, alignment: 4, stack-id: default,
+      isImmutable: false, isAliased: false, callee-saved-register: '',
+      callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, type: default, offset: 8, size: 4, alignment: 4, stack-id: default,
+      isImmutable: false, isAliased: false, callee-saved-register: '',
+      callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default,
+      isImmutable: false, isAliased: false, callee-saved-register: '',
+      callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+stack: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    %0:gr32 = MOV32ri
+    $eax = COPY %0
+    TCRETURNdi @__CxxFrameHandler3, 0, csr_32, implicit $esp, implicit $ssp, implicit $eax
+
+...
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold2.ll b/llvm/test/CodeGen/X86/store_op_load_fold2.ll
index 674b8d8f93842a..6f088772436ec5 100644
--- a/llvm/test/CodeGen/X86/store_op_load_fold2.ll
+++ b/llvm/test/CodeGen/X86/store_op_load_fold2.ll
@@ -17,13 +17,12 @@ cond_true2732.preheader:		; preds = %entry
 	store i64 %tmp2676.us.us, i64* %tmp2666
 	ret i32 0
 
-; INTEL: and {{e..}}, dword ptr [356]
-; INTEL: and dword ptr [360], {{e..}}
-; FIXME: mov dword ptr [356], {{e..}}
-; The above line comes out as 'mov 360, eax', but when the register is ecx it works?
+; INTEL-DAG: and {{e..}}, dword ptr [356]
+; INTEL-DAG: and dword ptr [360], {{e..}}
+; INTEL: mov dword ptr [356], {{e..}}
 
-; ATT: andl 356, %{{e..}}
-; ATT: andl %{{e..}}, 360
+; ATT-DAG: andl 356, %{{e..}}
+; ATT-DAG: andl %{{e..}}, 360
 ; ATT: movl %{{e..}}, 356
 }
 