From 75db7cf78ad5138e767b8d04c9a758009191ee0c Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Wed, 30 Sep 2020 10:55:51 +0100 Subject: [PATCH 1/7] [SVE][CodeGen] Legalisation of integer -> floating point conversions Splitting the operand of a scalable [S|U]INT_TO_FP results in a concat_vectors operation where the operands are unpacked FP scalable vectors (e.g. nxv2f32). This patch adds custom lowering of concat_vectors which checks that the number of operands is 2, and isel patterns to match concat_vectors of scalable FP types with uzp1. Reviewed By: efriedma, paulwalker-arm Differential Revision: https://reviews.llvm.org/D88033 --- .../Target/AArch64/AArch64ISelLowering.cpp | 18 ++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 8 + llvm/test/CodeGen/AArch64/sve-split-fcvt.ll | 141 ++++++++++++++++++ 3 files changed, 165 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0c8da4e20d7d05..d8072dbb856e48 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -990,7 +990,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a // splat of 0 or undef) once vector selects supported in SVE codegen. See // D68877 for more details. - for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); @@ -1018,7 +1017,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) { - setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); @@ -1035,6 +1034,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); @@ -3835,6 +3835,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); + case ISD::CONCAT_VECTORS: + return LowerCONCAT_VECTORS(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: @@ -9150,6 +9152,18 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return SDValue(); } +SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType().isScalableVector() && + isTypeLegal(Op.getValueType()) && + "Expected legal scalable vector type!"); + + if (isTypeLegal(Op.getOperand(0).getValueType()) && Op.getNumOperands() == 2) + return Op; + + return SDValue(); +} + SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a356f8390d2b3e..49e8ac86e0df7f 100644 --- 
a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1195,6 +1195,14 @@ multiclass sve_prefetch; + // Concatenate two floating point vectors. + def : Pat<(nxv4f16 (concat_vectors nxv2f16:$v1, nxv2f16:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + def : Pat<(nxv8f16 (concat_vectors nxv4f16:$v1, nxv4f16:$v2)), + (UZP1_ZZZ_H $v1, $v2)>; + def : Pat<(nxv4f32 (concat_vectors nxv2f32:$v1, nxv2f32:$v2)), + (UZP1_ZZZ_S $v1, $v2)>; + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>; defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>; defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>; diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll index fbd9beceaa1f02..41b3e0ee13e16c 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -95,3 +95,144 @@ define @fcvtzu_d_nxv4f32( %a) { %res = fptoui %a to ret %res } + +; SINT_TO_FP + +; Split operand +define @scvtf_s_nxv4i64( %a) { +; CHECK-LABEL: scvtf_s_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z1.s, p0/m, z1.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +define @scvtf_h_nxv8i64( %a) { +; CHECK-LABEL: scvtf_h_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z3.h, p0/m, z3.d +; CHECK-NEXT: scvtf z2.h, p0/m, z2.d +; CHECK-NEXT: scvtf z1.h, p0/m, z1.d +; CHECK-NEXT: scvtf z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +; Split result +define @scvtf_s_nxv16i8( %a) { +; CHECK-LABEL: scvtf_s_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sunpklo z1.h, z0.b +; CHECK-NEXT: sunpkhi z0.h, z0.b +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sunpklo z2.s, z1.h +; CHECK-NEXT: sunpkhi z1.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: scvtf z0.s, p0/m, z2.s +; CHECK-NEXT: scvtf z1.s, p0/m, z1.s +; CHECK-NEXT: scvtf z2.s, p0/m, z3.s +; CHECK-NEXT: scvtf z3.s, p0/m, z4.s +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +define @scvtf_d_nxv4i32( %a) { +; CHECK-LABEL: scvtf_d_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sunpkhi z2.d, z0.s +; CHECK-NEXT: scvtf z0.d, p0/m, z1.d +; CHECK-NEXT: scvtf z1.d, p0/m, z2.d +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +define @scvtf_d_nxv4i1( %a) { +; CHECK-LABEL: scvtf_d_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: zip1 p3.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: ptrue p2.d +; CHECK-NEXT: mov z0.d, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: scvtf z0.d, p2/m, z0.d +; CHECK-NEXT: scvtf z1.d, p2/m, z1.d +; CHECK-NEXT: ret + %res = sitofp %a to + ret %res +} + +; UINT_TO_FP + +; Split operand +define @ucvtf_s_nxv4i64( %a) { +; CHECK-LABEL: ucvtf_s_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} + +define @ucvtf_h_nxv8i64( %a) { +; CHECK-LABEL: ucvtf_h_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z3.h, p0/m, z3.d +; CHECK-NEXT: ucvtf z2.h, p0/m, z2.d 
+; CHECK-NEXT: ucvtf z1.h, p0/m, z1.d +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} + +; Split result +define @ucvtf_d_nxv4i32( %a) { +; CHECK-LABEL: ucvtf_d_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d +; CHECK-NEXT: ucvtf z1.d, p0/m, z2.d +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} + +define @ucvtf_d_nxv4i1( %a) { +; CHECK-LABEL: ucvtf_d_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: zip1 p3.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: ptrue p2.d +; CHECK-NEXT: mov z0.d, p3/z, #1 // =0x1 +; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-NEXT: ucvtf z0.d, p2/m, z0.d +; CHECK-NEXT: ucvtf z1.d, p2/m, z1.d +; CHECK-NEXT: ret + %res = uitofp %a to + ret %res +} From 38f625d0d1360b035271422bab922d22ed04d79a Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Thu, 1 Oct 2020 10:47:02 +0100 Subject: [PATCH 2/7] [ARM][LowOverheadLoops] Adjust Start insertion. Try to move the insertion point to become the terminator of the block, usually the preheader. Differential Revision: https://reviews.llvm.org/D88638 --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 63 ++++++++----------- .../lstp-insertion-position.mir | 41 ++++-------- .../LowOverheadLoops/mov-after-dlstp.mir | 15 ++--- .../Thumb2/LowOverheadLoops/mov-operand.ll | 11 ++-- .../move-def-before-start.mir | 23 ++----- .../LowOverheadLoops/move-start-after-def.mir | 23 ++----- .../Thumb2/LowOverheadLoops/reductions.ll | 4 +- .../CodeGen/Thumb2/mve-float16regloops.ll | 2 +- .../CodeGen/Thumb2/mve-float32regloops.ll | 6 +- llvm/test/CodeGen/Thumb2/mve-fma-loops.ll | 4 +- .../Thumb2/mve-gather-scatter-optimisation.ll | 2 +- .../Thumb2/mve-gather-scatter-ptr-address.ll | 2 +- .../Thumb2/mve-gather-scatter-tailpred.ll | 30 +++------ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll | 10 +-- .../test/CodeGen/Thumb2/mve-pred-threshold.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 14 ++--- 16 files changed, 92 insertions(+), 162 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index cd9c38752ad237..ac787a1674ab7a 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -644,47 +644,10 @@ bool LowOverheadLoop::ValidateTailPredicate() { return false; } - // The element count register maybe defined after InsertPt, in which case we - // need to try to move either InsertPt or the def so that the [w|d]lstp can - // use the value. 
- - if (StartInsertPt != StartInsertBB->end() && - !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) { - if (auto *ElemDef = RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) { - if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) { - ElemDef->removeFromParent(); - StartInsertBB->insert(StartInsertPt, ElemDef); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " - << *ElemDef); - } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) { - StartInsertPt->removeFromParent(); - StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), - &*StartInsertPt); - LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); - } else { - // If we fail to move an instruction and the element count is provided - // by a mov, use the mov operand if it will have the same value at the - // insertion point - MachineOperand Operand = ElemDef->getOperand(1); - if (isMovRegOpcode(ElemDef->getOpcode()) && - RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg()) == - RDA.getUniqueReachingMIDef(&*StartInsertPt, Operand.getReg())) { - TPNumElements = Operand; - NumElements = TPNumElements.getReg(); - } else { - LLVM_DEBUG(dbgs() - << "ARM Loops: Unable to move element count to loop " - << "start instruction.\n"); - return false; - } - } - } - } - // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect // world the [w|d]lstp instruction would be last instruction in the preheader // and so it would only affect instructions within the loop body. But due to - // scheduling, and/or the logic in this pass (above), the insertion point can + // scheduling, and/or the logic in this pass, the insertion point can // be moved earlier. So if the Loop Start isn't the last instruction in the // preheader, and if the initial element count is smaller than the vector // width, the Loop Start instruction will immediately generate one or more @@ -1091,12 +1054,36 @@ void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { return true; }; + // We know that we can define safely LR at InsertPt, but maybe we could + // push the insertion point to later on in the basic block. + auto TryAdjustInsertionPoint = [](MachineBasicBlock::iterator &InsertPt, + MachineInstr *Start, + ReachingDefAnalysis &RDA) { + + MachineBasicBlock *MBB = InsertPt->getParent(); + MachineBasicBlock::iterator FirstNonTerminator = + MBB->getFirstTerminator(); + unsigned CountReg = Start->getOperand(0).getReg(); + + // Get the latest possible insertion point and check whether the semantics + // will be maintained if Start was inserted there. 
+ if (FirstNonTerminator == MBB->end()) { + if (RDA.isReachingDefLiveOut(Start, CountReg) && + RDA.isReachingDefLiveOut(Start, ARM::LR)) + InsertPt = FirstNonTerminator; + } else if (RDA.hasSameReachingDef(Start, &*FirstNonTerminator, CountReg) && + RDA.hasSameReachingDef(Start, &*FirstNonTerminator, ARM::LR)) + InsertPt = FirstNonTerminator; + }; + if (!FindStartInsertionPoint(Start, Dec, StartInsertPt, StartInsertBB, RDA, ToRemove)) { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); Revert = true; return; } + + TryAdjustInsertionPoint(StartInsertPt, Start, RDA); Revert = !ValidateRanges(Start, End, BBUtils, ML); CannotTailPredicate = !ValidateTailPredicate(); } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir index 3e7c87de0282cb..e5131fd4e1b41e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lstp-insertion-position.mir @@ -153,25 +153,17 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv12, align 4) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1315, align 4) - ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: liveins: $lr, $q1, $r0, $r1 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv12, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1315, align 4) + ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q1 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* 
CC::al */, $noreg @@ -285,27 +277,18 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7 - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r3 = nuw nsw t2ADDrs killed renamable $r3, renamable $r2, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3 = tLDRpci %const.0, 14 /* CC::al */, $noreg :: (load 4 from constant-pool) - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r2, dead $cpsr = tLSRri killed renamable $r2, 2, 14 /* CC::al */, $noreg ; CHECK: $s4 = VMOVS killed $s0, 14 /* CC::al */, $noreg, implicit killed $q1, implicit-def $q1 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 2, implicit $vpr - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv13, align 4) - ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1416, align 4) - ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 1, killed renamable $vpr - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: liveins: $lr, $q1, $r0, $r1 + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.lsr.iv13, align 4) + ; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.lsr.iv1416, align 4) + ; CHECK: renamable $q1 = MVE_VFMAf32 killed renamable $q1, killed renamable $q2, killed renamable $q0, 0, killed $noreg + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: liveins: $q1 ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS renamable $s6, renamable $s7, 14 /* CC::al */, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir index 94e3e26c819d66..5bafc295a3eff5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -163,17 +163,14 @@ body: | ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr - ; CHECK: $r4 = tMOVr $lr, 14 /* CC::al */, $noreg + ; CHECK: $r4 = tMOVr killed $lr, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 ; CHECK: bb.1.do.body.i: ; CHECK: successors: 
%bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg - ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4) - ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q0 - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12 + ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4) + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.arm_mean_f32_mve.exit: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll index 1404075dce9014..12c6858c961b5d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -17,16 +17,13 @@ define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float ; CHECK-NEXT: add.w lr, r12, r3, lsr #2 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mov r4, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 -; CHECK-NEXT: vaddt.f32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vldrw.u32 q1, [r12], #16 +; CHECK-NEXT: vadd.f32 q0, q0, q1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 ; CHECK-NEXT: vadd.f32 s0, s3, s3 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir index ea3589f48fdb7b..005524b8788944 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -117,32 +117,21 @@ body: | ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2MOVi 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r12 = t2MOVr killed $r3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14 /* CC::al 
*/, $noreg, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al */, $noreg - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1) + ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1) + ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir index 0295acb67962d1..f7e0e699c75a1e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -117,32 +117,21 @@ body: | ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2MOVi 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = nuw t2ADDrs killed renamable $r12, renamable $r3, 11, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: $r12 = t2MOVr killed $r3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14 /* CC::al */, $noreg ; CHECK: renamable $r12 = t2LSRri killed renamable $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14 /* CC::al 
*/, $noreg - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep45, align 1) + ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 1, renamable $vpr :: (load 4 from %ir.scevgep23, align 1) + ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index b5cac5d6a3cf86..a0cdb822b370fd 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -451,9 +451,9 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(i8* nocapture read ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r6, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 @@ -686,8 +686,8 @@ define i32 @wrongop(%struct.date* nocapture readonly %pd) { ; CHECK-NEXT: mla r2, r4, r3, r2 ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q0, r3 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_6: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index d364eb97fff72a..f3db06e571caf4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -1156,8 +1156,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* noca ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 ; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index a43f564951e93d..6f9b001ea992be 
100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1116,8 +1116,8 @@ define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* noc ; CHECK-NEXT: @ %bb.5: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 ; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_6: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -1436,9 +1436,9 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biqu ; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: vmov.f32 s6, s12 ; CHECK-NEXT: vmov.f32 s10, s14 -; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: vmov.f32 s7, s12 ; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2] @@ -1589,8 +1589,8 @@ define arm_aapcs_vfpcc void @fms(float* nocapture readonly %pSrc1, float* nocapt ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB18_3 Depth 2 ; CHECK-NEXT: ldr r4, [r2] -; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: vdup.32 q0, r4 +; CHECK-NEXT: dls lr, r5 ; CHECK-NEXT: .LBB18_3: @ %while.body ; CHECK-NEXT: @ Parent Loop BB18_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll index 86cbec661f1f5c..68ebeaa830cb22 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll @@ -265,9 +265,9 @@ define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocaptur ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 @@ -529,9 +529,9 @@ define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: eor r12, r4, #-2147483648 ; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r4, #4 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index bba302d7fbcc0d..d158c85e401b8a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -709,12 +709,12 @@ define dso_local void @arm_mat_mult_q15(i16* noalias nocapture readonly %A, i16* ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 ; CHECK-NEXT: ldr r0, [sp, #112] ; CHECK-NEXT: sub.w lr, r11, r5 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mla r3, r0, r5, r1 ; CHECK-NEXT: add r5, r9 ; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r5, r0, r5, lsl #1 ; CHECK-NEXT: add.w r3, r6, r3, lsl #1 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll 
b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll index bfc64b8c8e261c..030fb3b91cf8fa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -556,8 +556,8 @@ define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalia ; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vldrw.u32 q0, [r2] ; CHECK-NEXT: adr r2, .LCPI9_1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll index 4054b75edd0ed0..a4a67512b7199f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -11,9 +11,9 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -231,17 +231,11 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: adr r7, .LCPI3_5 -; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: vmov.i32 q0, #0x8000 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: adr r6, .LCPI3_4 ; CHECK-NEXT: adr r5, .LCPI3_3 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: adr r4, .LCPI3_2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: adr.w r8, .LCPI3_1 @@ -274,22 +268,18 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q4, [r0, q0] +; CHECK-NEXT: vldrb.u32 q4, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q7, [r0, q0] +; CHECK-NEXT: vldrb.u32 q7, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q6, q7, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q1, [r0, q5] +; CHECK-NEXT: vldrb.u32 q1, [r0, q5] ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmul.i32 q3, q4, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload @@ -320,14 +310,12 @@ define void @justoffsets(i8* noalias nocapture readonly %r, i8* noalias nocaptur ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.32 q1, [r1, q0] +; CHECK-NEXT: vstrb.32 q1, [r1, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte 
Reload -; CHECK-NEXT: vpstt -; CHECK-NEXT: vstrbt.32 q2, [r1, q0] -; CHECK-NEXT: vstrbt.32 q6, [r1, q5] +; CHECK-NEXT: vstrb.32 q2, [r1, q0] +; CHECK-NEXT: vstrb.32 q6, [r1, q5] ; CHECK-NEXT: adds r1, #12 -; CHECK-NEXT: le lr, .LBB3_2 +; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #216 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index 0f3e893fd8017a..d67ccd9393cc40 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -257,13 +257,13 @@ define i8* @test(i8* nocapture readonly %input_row, i8* nocapture readonly %inpu ; CHECK-NEXT: ldr r3, [sp, #64] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r7, r11, r3, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB2_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -425,13 +425,13 @@ define i8* @test_optsize(i8* nocapture readonly %input_row, i8* nocapture readon ; CHECK-NEXT: ldr r3, [sp, #64] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r7, r11, r3, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrd r4, r3, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB3_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -735,13 +735,13 @@ define i8* @signext(i8* %input_row, i8* %input_col, i16 zeroext %output_ch, i16 ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r3, r9, r11, r0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: .LBB5_7: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB5_5 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -907,13 +907,13 @@ define i8* @signext_optsize(i8* %input_row, i8* %input_col, i16 zeroext %output_ ; CHECK-NEXT: ldr.w r11, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: mov r10, r12 ; CHECK-NEXT: mla r3, r9, r11, r0 ; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r0, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: dlstp.16 lr, r11 ; CHECK-NEXT: .LBB6_5: @ %for.body24 ; CHECK-NEXT: @ Parent Loop BB6_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -1120,7 +1120,6 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf ; CHECK-NEXT: ldr.w r1, [r1, r10, lsl #2] ; CHECK-NEXT: ldrd r6, r7, [r0, #32] ; CHECK-NEXT: ldr.w r3, [r3, r10, lsl #2] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: add.w r6, r6, r2, lsl #2 ; CHECK-NEXT: add.w r12, r12, r1, lsl #2 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte 
Reload @@ -1129,6 +1128,7 @@ define arm_aapcs_vfpcc void @_Z37_arm_radix4_butterfly_inverse_f32_mvePK21arm_cf ; CHECK-NEXT: add.w r1, r2, r11, lsl #2 ; CHECK-NEXT: add.w r8, r1, r11, lsl #2 ; CHECK-NEXT: add.w r9, r8, r11, lsl #2 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ Parent Loop BB7_6 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll index 12561d560309a0..35e02faa14e018 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -187,8 +187,8 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] @@ -480,8 +480,8 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float ; CHECK-NEXT: add.w lr, r2, r1, lsr #2 ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: eor r2, r1, #-2147483648 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index f586857f289f75..fdaea92c4329ce 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -36,8 +36,8 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: mov.w r10, #-1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r5, [r0] @@ -256,10 +256,10 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: adr r7, .LCPI1_1 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 ; CHECK-NEXT: vldrw.u32 q1, [r7] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: mov.w r3, #-1 ; CHECK-NEXT: mvn r9, #-2147483648 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 @@ -544,8 +544,8 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: vdup.32 q1, r7 ; CHECK-NEXT: mov.w r12, #-1 ; CHECK-NEXT: mvn r8, #-2147483648 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload @@ -773,8 +773,8 @@ define arm_aapcs_vfpcc void @usatmul_2_q31(i32* nocapture readonly %pSrcA, i32* ; CHECK-NEXT: add.w r11, r1, r5, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: add.w r12, r0, r5, lsl #2 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrd r4, r9, [r0] @@ -1617,8 +1617,8 @@ define arm_aapcs_vfpcc void 
@ssatmul_8t_q15(i16* nocapture readonly %pSrcA, i16* ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: vmov.i8 q3, #0xff -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB9_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload @@ -2842,7 +2842,6 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI18_2 @@ -2854,6 +2853,7 @@ define arm_aapcs_vfpcc void @ssatmul_16t_q7(i8* nocapture readonly %pSrcA, i8* n ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB18_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload @@ -3142,7 +3142,6 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8* ; CHECK-NEXT: vmov.i8 q2, #0x0 ; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: sub.w r12, r3, #1 -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] ; CHECK-NEXT: adr r4, .LCPI19_2 @@ -3154,6 +3153,7 @@ define arm_aapcs_vfpcc void @ssatmul_16ti_q7(i8* nocapture readonly %pSrcA, i8* ; CHECK-NEXT: vmov.i8 q3, #0xff ; CHECK-NEXT: vldrw.u32 q6, [r4] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB19_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload From 69acdfe075fa8eb18781f88f4d0cd1ea40fa6e48 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 1 Oct 2020 15:58:31 +0700 Subject: [PATCH 3/7] [SCEV] Prove implicaitons via AddRec start If we know that some predicate is true for AddRec and an invariant (w.r.t. this AddRec's loop), this fact is, in particular, true on the first iteration. We can try to prove the facts we need using the start value. The motivating example is proving things like ``` isImpliedCondOperands(>=, X, 0, {X,+,-1}, 0} ``` Differential Revision: https://reviews.llvm.org/D88208 Reviewed By: reames --- llvm/include/llvm/Analysis/ScalarEvolution.h | 31 +++++- llvm/lib/Analysis/ScalarEvolution.cpp | 103 +++++++++++++----- .../Analysis/ScalarEvolutionTest.cpp | 32 ++++++ 3 files changed, 134 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index febca473776aa6..158257a5aa9a10 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1677,23 +1677,30 @@ class ScalarEvolution { getPredecessorWithUniqueSuccessorForBB(const BasicBlock *BB) const; /// Test whether the condition described by Pred, LHS, and RHS is true - /// whenever the given FoundCondValue value evaluates to true. + /// whenever the given FoundCondValue value evaluates to true in given + /// Context. If Context is nullptr, then the found predicate is true + /// everywhere. 
bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - const Value *FoundCondValue, bool Inverse); + const Value *FoundCondValue, bool Inverse, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by FoundPred, FoundLHS, FoundRHS is - /// true. + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. bool isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS, - const SCEV *FoundRHS); + const SCEV *FoundRHS, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is - /// true. + /// true in given Context. If Context is nullptr, then the found predicate is + /// true everywhere. bool isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, - const SCEV *FoundRHS); + const SCEV *FoundRHS, + const Instruction *Context = nullptr); /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is @@ -1740,6 +1747,18 @@ class ScalarEvolution { const SCEV *FoundLHS, const SCEV *FoundRHS); + /// Test whether the condition described by Pred, LHS, and RHS is true + /// whenever the condition described by Pred, FoundLHS, and FoundRHS is + /// true. + /// + /// This routine tries to weaken the known condition basing on fact that + /// FoundLHS is an AddRec. + bool isImpliedCondOperandsViaAddRecStart(ICmpInst::Predicate Pred, + const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, + const SCEV *FoundRHS, + const Instruction *Context); + /// Test whether the condition described by Pred, LHS, and RHS is true /// whenever the condition described by Pred, FoundLHS, and FoundRHS is /// true. diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index e51b31673105cf..a3e454fefcf0fd 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -9549,15 +9549,16 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, // Try to prove (Pred, LHS, RHS) using isImpliedCond. 
auto ProveViaCond = [&](const Value *Condition, bool Inverse) { - if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse)) + const Instruction *Context = &BB->front(); + if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse, Context)) return true; if (ProvingStrictComparison) { if (!ProvedNonStrictComparison) - ProvedNonStrictComparison = - isImpliedCond(NonStrictPredicate, LHS, RHS, Condition, Inverse); + ProvedNonStrictComparison = isImpliedCond(NonStrictPredicate, LHS, RHS, + Condition, Inverse, Context); if (!ProvedNonEquality) - ProvedNonEquality = - isImpliedCond(ICmpInst::ICMP_NE, LHS, RHS, Condition, Inverse); + ProvedNonEquality = isImpliedCond(ICmpInst::ICMP_NE, LHS, RHS, + Condition, Inverse, Context); if (ProvedNonStrictComparison && ProvedNonEquality) return true; } @@ -9623,7 +9624,8 @@ bool ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L, bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, - const Value *FoundCondValue, bool Inverse) { + const Value *FoundCondValue, bool Inverse, + const Instruction *Context) { if (!PendingLoopPredicates.insert(FoundCondValue).second) return false; @@ -9634,12 +9636,16 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, if (const BinaryOperator *BO = dyn_cast(FoundCondValue)) { if (BO->getOpcode() == Instruction::And) { if (!Inverse) - return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || - isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); + return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse, + Context) || + isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse, + Context); } else if (BO->getOpcode() == Instruction::Or) { if (Inverse) - return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) || - isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse); + return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse, + Context) || + isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse, + Context); } } @@ -9657,14 +9663,14 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *FoundLHS = getSCEV(ICI->getOperand(0)); const SCEV *FoundRHS = getSCEV(ICI->getOperand(1)); - return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS); + return isImpliedCond(Pred, LHS, RHS, FoundPred, FoundLHS, FoundRHS, Context); } bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, ICmpInst::Predicate FoundPred, - const SCEV *FoundLHS, - const SCEV *FoundRHS) { + const SCEV *FoundLHS, const SCEV *FoundRHS, + const Instruction *Context) { // Balance the types. if (getTypeSizeInBits(LHS->getType()) < getTypeSizeInBits(FoundLHS->getType())) { @@ -9708,16 +9714,16 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, // Check whether the found predicate is the same as the desired predicate. if (FoundPred == Pred) - return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context); // Check whether swapping the found predicate makes it the same as the // desired predicate. 
if (ICmpInst::getSwappedPredicate(FoundPred) == Pred) { if (isa(RHS)) - return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS); + return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS, Context); else - return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred), - RHS, LHS, FoundLHS, FoundRHS); + return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred), RHS, + LHS, FoundLHS, FoundRHS, Context); } // Unsigned comparison is the same as signed comparison when both the operands @@ -9725,7 +9731,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, if (CmpInst::isUnsigned(FoundPred) && CmpInst::getSignedPredicate(FoundPred) == Pred && isKnownNonNegative(FoundLHS) && isKnownNonNegative(FoundRHS)) - return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS); + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context); // Check if we can make progress by sharpening ranges. if (FoundPred == ICmpInst::ICMP_NE && @@ -9762,8 +9768,8 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, case ICmpInst::ICMP_UGE: // We know V `Pred` SharperMin. If this implies LHS `Pred` // RHS, we're done. - if (isImpliedCondOperands(Pred, LHS, RHS, V, - getConstant(SharperMin))) + if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(SharperMin), + Context)) return true; LLVM_FALLTHROUGH; @@ -9778,7 +9784,8 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, // // If V `Pred` Min implies LHS `Pred` RHS, we're done. - if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min))) + if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min), + Context)) return true; break; @@ -9786,14 +9793,14 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_ULE: if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS, - LHS, V, getConstant(SharperMin))) + LHS, V, getConstant(SharperMin), Context)) return true; LLVM_FALLTHROUGH; case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_ULT: if (isImpliedCondOperands(CmpInst::getSwappedPredicate(Pred), RHS, - LHS, V, getConstant(Min))) + LHS, V, getConstant(Min), Context)) return true; break; @@ -9807,11 +9814,12 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, // Check whether the actual condition is beyond sufficient. if (FoundPred == ICmpInst::ICMP_EQ) if (ICmpInst::isTrueWhenEqual(Pred)) - if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS)) + if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS, Context)) return true; if (Pred == ICmpInst::ICMP_NE) if (!ICmpInst::isTrueWhenEqual(FoundPred)) - if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS)) + if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS, + Context)) return true; // Otherwise assume the worst. @@ -9890,6 +9898,44 @@ Optional ScalarEvolution::computeConstantDifference(const SCEV *More, return None; } +bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart( + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + const SCEV *FoundLHS, const SCEV *FoundRHS, const Instruction *Context) { + // Try to recognize the following pattern: + // + // FoundRHS = ... + // ... 
+ // loop: + // FoundLHS = {Start,+,W} + // context_bb: // Basic block from the same loop + // known(Pred, FoundLHS, FoundRHS) + // + // If some predicate is known in the context of a loop, it is also known on + // each iteration of this loop, including the first iteration. Therefore, in + // this case, `FoundLHS Pred FoundRHS` implies `Start Pred FoundRHS`. Try to + // prove the original pred using this fact. + if (!Context) + return false; + // Make sure AR varies in the context block. + if (auto *AR = dyn_cast(FoundLHS)) { + if (!AR->getLoop()->contains(Context->getParent())) + return false; + if (!isAvailableAtLoopEntry(FoundRHS, AR->getLoop())) + return false; + return isImpliedCondOperands(Pred, LHS, RHS, AR->getStart(), FoundRHS); + } + + if (auto *AR = dyn_cast(FoundRHS)) { + if (!AR->getLoop()->contains(Context)) + return false; + if (!isAvailableAtLoopEntry(FoundLHS, AR->getLoop())) + return false; + return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, AR->getStart()); + } + + return false; +} + bool ScalarEvolution::isImpliedCondOperandsViaNoOverflow( ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, const SCEV *FoundRHS) { @@ -10080,13 +10126,18 @@ bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred, bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const SCEV *FoundLHS, - const SCEV *FoundRHS) { + const SCEV *FoundRHS, + const Instruction *Context) { if (isImpliedCondOperandsViaRanges(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS)) return true; + if (isImpliedCondOperandsViaAddRecStart(Pred, LHS, RHS, FoundLHS, FoundRHS, + Context)) + return true; + return isImpliedCondOperandsHelper(Pred, LHS, RHS, FoundLHS, FoundRHS) || // ~x < ~y --> x > y diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index ff33495f22711f..e5ffc21fb66466 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -1251,4 +1251,36 @@ TEST_F(ScalarEvolutionsTest, SCEVgetExitLimitForGuardedLoop) { }); } +TEST_F(ScalarEvolutionsTest, ImpliedViaAddRecStart) { + LLVMContext C; + SMDiagnostic Err; + std::unique_ptr M = parseAssemblyString( + "define void @foo(i32* %p) { " + "entry: " + " %x = load i32, i32* %p, !range !0 " + " br label %loop " + "loop: " + " %iv = phi i32 [ %x, %entry], [%iv.next, %backedge] " + " %ne.check = icmp ne i32 %iv, 0 " + " br i1 %ne.check, label %backedge, label %exit " + "backedge: " + " %iv.next = add i32 %iv, -1 " + " br label %loop " + "exit:" + " ret void " + "} " + "!0 = !{i32 0, i32 2147483647}", + Err, C); + + ASSERT_TRUE(M && "Could not parse module?"); + ASSERT_TRUE(!verifyModule(*M) && "Must have been well formed!"); + + runWithSE(*M, "foo", [](Function &F, LoopInfo &LI, ScalarEvolution &SE) { + auto *X = SE.getSCEV(getInstructionByName(F, "x")); + auto *Context = getInstructionByName(F, "iv.next"); + EXPECT_TRUE(SE.isKnownPredicateAt(ICmpInst::ICMP_NE, X, + SE.getZero(X->getType()), Context)); + }); +} + } // end namespace llvm From a81b938b6dee0e1ed4dd44e7d59325d0aa4774cc Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Thu, 1 Oct 2020 06:57:35 -0400 Subject: [PATCH 4/7] [mlir][Linalg] Fix ASAN bug ``` LinalgTilingOptions &setTileSizes(ValueRange ts) ``` makes it all too easy to create stack-use-after-return errors. 
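A minimal, self-contained sketch of the failure mode (plain C++ with hypothetical names, not the actual MLIR types), assuming a std::function-returning setter analogous to setTileSizes:

```cpp
#include <functional>
#include <vector>

using TileSizeFn = std::function<std::vector<int>()>;

// Reference capture: the lambda keeps a reference to the parameter, which may
// point at a temporary that is destroyed when the caller's full expression ends.
TileSizeFn makeTileSizeFnByRef(const std::vector<int> &ts) {
  return [&] { return ts; }; // dangles once 'ts' referred to a dead temporary
}

// Value capture: the lambda owns its own copy of the sizes.
TileSizeFn makeTileSizeFnByCopy(std::vector<int> ts) {
  return [ts] { return ts; }; // safe
}

int main() {
  TileSizeFn bad = makeTileSizeFnByRef({4, 8});   // temporary {4, 8} dies here
  TileSizeFn good = makeTileSizeFnByCopy({4, 8}); // copy lives inside the lambda
  auto sizes = good();                            // returns {4, 8}
  (void)bad;   // invoking bad() would read freed stack memory (what ASAN flags)
  (void)sizes;
  return 0;
}
```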
In particular, c694588fc52a8845174fee06ad0bcfa338e87816 introduced one such issue. Instead just take a copy in the lambda and be done with it. --- mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index a7f8c31e226437..e47dafc9bf52b4 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -326,9 +326,9 @@ struct LinalgTilingOptions { /// Set the `tileSizeComputationFunction` to return the values `ts`. The /// values must not fold away when tiling. Otherwise, use a more robust /// `tileSizeComputationFunction`. - LinalgTilingOptions &setTileSizes(ValueRange ts) { - tileSizeComputationFunction = [&](OpBuilder &, Operation *) { - return SmallVector(ts.begin(), ts.end()); + LinalgTilingOptions &setTileSizes(SmallVector ts) { + tileSizeComputationFunction = [=](OpBuilder &, Operation *) { + return ts; }; return *this; } From fcf70e1e3b1d57d5fde6b99d0188d1b1774429af Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Thu, 1 Oct 2020 11:06:55 +0100 Subject: [PATCH 5/7] [SVE][CodeGen] Lower scalable fp_extend & fp_round operations This patch adds FP_EXTEND_MERGE_PASSTHRU & FP_ROUND_MERGE_PASSTHRU ISD nodes, used to lower scalable vector fp_extend/fp_round operations. fp_round has an additional argument, the 'trunc' flag, which is an integer of zero or one. This also fixes a warning introduced by the new tests added to sve-split-fcvt.ll, resulting from an implicit TypeSize -> uint64_t cast in SplitVecOp_FP_ROUND. Reviewed By: sdesmalen, paulwalker-arm Differential Revision: https://reviews.llvm.org/D88321 --- .../SelectionDAG/LegalizeVectorTypes.cpp | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- .../Target/AArch64/AArch64ISelLowering.cpp | 15 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + .../lib/Target/AArch64/AArch64InstrFormats.td | 7 + .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 88 +++++++---- llvm/lib/Target/AArch64/SVEInstrFormats.td | 26 ++++ llvm/test/CodeGen/AArch64/sve-fcvt.ll | 88 +++++++++++ llvm/test/CodeGen/AArch64/sve-split-fcvt.ll | 146 ++++++++++++++++++ 9 files changed, 340 insertions(+), 38 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 356eb1ce0964b1..0b3edc3416859d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2715,7 +2715,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { EVT InVT = Lo.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), - InVT.getVectorNumElements()); + InVT.getVectorElementCount()); if (N->isStrictFPOpcode()) { Lo = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other }, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b9362f1e762d3e..eef467d116b7ff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4613,8 +4613,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Operand.getValueType().isFloatingPoint() && "Invalid FP cast!"); if (Operand.getValueType() == VT) return Operand; // noop conversion. 
assert((!VT.isVector() || - VT.getVectorNumElements() == - Operand.getValueType().getVectorNumElements()) && + VT.getVectorElementCount() == + Operand.getValueType().getVectorElementCount()) && "Vector element count mismatch!"); assert(Operand.getValueType().bitsLT(VT) && "Invalid fpext node, dst < src!"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d8072dbb856e48..fb70b2d801da09 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -183,6 +183,8 @@ static bool isMergePassthruOpcode(unsigned Opc) { case AArch64ISD::FROUND_MERGE_PASSTHRU: case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: case AArch64ISD::FTRUNC_MERGE_PASSTHRU: + case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: + case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: case AArch64ISD::FCVTZU_MERGE_PASSTHRU: @@ -1052,6 +1054,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUNDEVEN, VT, Custom); setOperationAction(ISD::FTRUNC, VT, Custom); setOperationAction(ISD::FSQRT, VT, Custom); + setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); } setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); @@ -1580,6 +1584,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) @@ -2908,6 +2914,9 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); RTLIB::Libcall LC; @@ -2918,6 +2927,9 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType().isScalableVector()) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); EVT SrcVT = SrcVal.getValueType(); @@ -16003,7 +16015,8 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, SmallVector Operands = {Pg}; for (const SDValue &V : Op->op_values()) { - assert((isa(V) || V.getValueType().isScalableVector()) && + assert((!V.getValueType().isVector() || + V.getValueType().isScalableVector()) && "Only scalable vectors are supported!"); Operands.push_back(V); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 51391d309b4045..1b8f62e427dbb8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -105,6 +105,8 @@ enum NodeType : unsigned { FROUNDEVEN_MERGE_PASSTHRU, FSQRT_MERGE_PASSTHRU, FTRUNC_MERGE_PASSTHRU, + FP_ROUND_MERGE_PASSTHRU, + FP_EXTEND_MERGE_PASSTHRU, UINT_TO_FP_MERGE_PASSTHRU, SINT_TO_FP_MERGE_PASSTHRU, FCVTZU_MERGE_PASSTHRU, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 61155087cbe280..68dc477567a5d8 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -914,6 +914,13 @@ def imm0_1 : Operand, ImmLeaf, TImmLeaf { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand, ImmLeaf ]>; +def SDT_AArch64FCVTR : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVec<4>, + SDTCVecEltisVT<1,i1> +]>; + +def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64FCVTR>; +def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>; def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>; @@ -1178,6 +1185,11 @@ multiclass sve_prefetch; // Extract subvectors from FP SVE vectors + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), (UUNPKLO_ZZ_S ZPR:$Zs)>; def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), @@ -1400,40 +1412,48 @@ multiclass sve_prefetch; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; - defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, null_frag, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, null_frag, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>; - defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; - defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; - defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, 
nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; - defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; - defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, null_frag, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, null_frag, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>; - defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, null_frag, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, null_frag, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>; - defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; - defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; - defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; - defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; - defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; - defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVTZU_ZPmZ_HtoS : 
sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; - defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; - defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; - defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; - defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zdr<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, AArch64fcvtr_mt, nxv4f16, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, AArch64fcvte_mt, nxv4f32, nxv4i1, nxv4f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110010, "scvtf", ZPR16, ZPR16, null_frag, AArch64scvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010100, "scvtf", ZPR32, ZPR32, null_frag, AArch64scvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1010101, "ucvtf", ZPR32, ZPR32, null_frag, AArch64ucvtf_mt, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0110011, "ucvtf", ZPR16, ZPR16, null_frag, AArch64ucvtf_mt, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111010, "fcvtzs", ZPR16, ZPR16, null_frag, AArch64fcvtzs_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011100, "fcvtzs", ZPR32, ZPR32, null_frag, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd< 0b0111011, "fcvtzu", ZPR16, ZPR16, null_frag, AArch64fcvtzu_mt, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd< 0b1011101, "fcvtzu", ZPR32, ZPR32, null_frag, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zdr<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, AArch64fcvtr_mt, nxv2f16, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zdr<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, AArch64fcvtr_mt, nxv2f32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, AArch64fcvte_mt, nxv2f64, nxv2i1, nxv2f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, AArch64ucvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, AArch64scvtf_mt, nxv2f32, nxv2i1, nxv2i64, 
ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd< 0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, AArch64scvtf_mt, nxv4f16, nxv4i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, AArch64scvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, AArch64ucvtf_mt, nxv2f32, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd< 0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, AArch64ucvtf_mt, nxv2f16, nxv2i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110110, "scvtf", ZPR64, ZPR64, null_frag, AArch64scvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1110111, "ucvtf", ZPR64, ZPR64, null_frag, AArch64ucvtf_mt, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd< 0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, null_frag, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, AArch64fcvtzs_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd< 0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, AArch64fcvtzu_mt, nxv4i32, nxv4i1, nxv4f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd< 0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd< 0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111110, "fcvtzs", ZPR64, ZPR64, null_frag, AArch64fcvtzs_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd< 0b1111111, "fcvtzu", ZPR64, ZPR64, null_frag, AArch64fcvtzu_mt, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + + def : Pat<(nxv2f32 (AArch64fcvte_mt (nxv2i1 PPR:$Pg), (nxv2f16 ZPR:$Zs), (nxv2f32 ZPR:$Zd))), + (FCVT_ZPmZ_HtoS ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; + + // FP_ROUND has an additional 'precise' flag which indicates the type of rounding. + // This is ignored by the pattern below where it is matched by (i64 timm0_1) + def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 PPR:$Pg), (nxv2f32 ZPR:$Zs), (i64 timm0_1), (nxv2f16 ZPR:$Zd))), + (FCVT_ZPmZ_StoH ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>; // Floating-point -> signed integer def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 PPR:$Pg), diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index d0226a73d87d2d..45a712c897a441 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -318,6 +318,13 @@ class SVE_1_Op_Passthru_Pat; +// Used to match FP_ROUND_MERGE_PASSTHRU, which has an additional flag for the +// type of rounding. 
This is matched by timm0_1 in pattern below and ignored. +class SVE_1_Op_Passthru_Round_Pat +: Pat<(vtd (op pg:$Op1, vts:$Op2, (i64 timm0_1), vtd:$Op3)), + (inst $Op3, $Op1, $Op2)>; + class SVE_1_Op_Imm_OptLsl_Reverse_Pat : Pat<(vt (op (vt (AArch64dup (it (cpx i32:$imm, i32:$shift)))), (vt zprty:$Op1))), @@ -2299,6 +2306,25 @@ multiclass sve_fp_2op_p_zd opc, string asm, def : SVE_1_Op_Passthru_Pat(NAME)>; } +multiclass sve_fp_2op_p_zdr opc, string asm, + RegisterOperand i_zprtype, + RegisterOperand o_zprtype, + SDPatternOperator int_op, + SDPatternOperator ir_op, ValueType vt1, + ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { + def NAME : sve_fp_2op_p_zd; + + // convert vt1 to a packed type for the intrinsic patterns + defvar packedvt1 = !cond(!eq(!cast(vt1), "nxv2f16"): nxv8f16, + !eq(!cast(vt1), "nxv4f16"): nxv8f16, + !eq(!cast(vt1), "nxv2f32"): nxv4f32, + 1 : vt1); + + def : SVE_3_Op_Pat(NAME)>; + + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; +} + multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll index 9b980ac25c108c..1b395806755d12 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -5,6 +5,94 @@ ; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. ; WARN-NOT: warning +; +; FP_EXTEND +; + +define @fcvts_nxv2f16( %a) { +; CHECK-LABEL: fcvts_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvts_nxv4f16( %a) { +; CHECK-LABEL: fcvts_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv2f16( %a) { +; CHECK-LABEL: fcvtd_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z0.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv2f32( %a) { +; CHECK-LABEL: fcvtd_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.d, p0/m, z0.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +; +; FP_ROUND +; + +define @fcvth_nxv2f32( %a) { +; CHECK-LABEL: fcvth_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv4f32( %a) { +; CHECK-LABEL: fcvth_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv2f64( %a) { +; CHECK-LABEL: fcvth_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvts_nxv2f64( %a) { +; CHECK-LABEL: fcvts_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + ; ; FP_TO_SINT ; diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll index 41b3e0ee13e16c..6f608c830cfe51 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -5,6 +5,152 @@ ; If this check fails please read test/CodeGen/AArch64/README for 
instructions on how to resolve it. ; WARN-NOT: warning +; FP_EXTEND + +define @fcvts_nxv8f16( %a) { +; CHECK-LABEL: fcvts_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: fcvt z0.s, p0/m, z1.h +; CHECK-NEXT: fcvt z1.s, p0/m, z2.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv4f16( %a) { +; CHECK-LABEL: fcvtd_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: fcvt z0.d, p0/m, z1.h +; CHECK-NEXT: fcvt z1.d, p0/m, z2.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv8f16( %a) { +; CHECK-LABEL: fcvtd_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z2.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z4.d, z0.s +; CHECK-NEXT: fcvt z0.d, p0/m, z2.h +; CHECK-NEXT: fcvt z1.d, p0/m, z1.h +; CHECK-NEXT: fcvt z2.d, p0/m, z3.h +; CHECK-NEXT: fcvt z3.d, p0/m, z4.h +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv4f32( %a) { +; CHECK-LABEL: fcvtd_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z1.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: fcvt z0.d, p0/m, z1.s +; CHECK-NEXT: fcvt z1.d, p0/m, z2.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +define @fcvtd_nxv8f32( %a) { +; CHECK-LABEL: fcvtd_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpklo z2.d, z0.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z3.d, z0.s +; CHECK-NEXT: uunpklo z4.d, z1.s +; CHECK-NEXT: uunpkhi z5.d, z1.s +; CHECK-NEXT: fcvt z0.d, p0/m, z2.s +; CHECK-NEXT: fcvt z1.d, p0/m, z3.s +; CHECK-NEXT: fcvt z2.d, p0/m, z4.s +; CHECK-NEXT: fcvt z3.d, p0/m, z5.s +; CHECK-NEXT: ret + %res = fpext %a to + ret %res +} + +; FP_ROUND + +define @fcvth_nxv8f32( %a) { +; CHECK-LABEL: fcvth_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z1.h, p0/m, z1.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv8f64( %a) { +; CHECK-LABEL: fcvth_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z3.h, p0/m, z3.d +; CHECK-NEXT: fcvt z2.h, p0/m, z2.d +; CHECK-NEXT: fcvt z1.h, p0/m, z1.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvth_nxv4f64( %a) { +; CHECK-LABEL: fcvth_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z1.h, p0/m, z1.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvts_nxv4f64( %a) { +; CHECK-LABEL: fcvts_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fcvts_nxv8f64( %a) { +; CHECK-LABEL: fcvts_nxv8f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d +; CHECK-NEXT: fcvt z3.s, p0/m, z3.d +; CHECK-NEXT: fcvt z2.s, p0/m, z2.d +; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.s, z2.s, z3.s +; 
CHECK-NEXT: ret + %res = fptrunc %a to + ret %res +} + ; FP_TO_SINT ; Split operand From ef4e971e5e18ae796466623df8f26265ba6bdfb5 Mon Sep 17 00:00:00 2001 From: Andrew Paverd Date: Thu, 1 Oct 2020 10:07:40 +0100 Subject: [PATCH 6/7] [CFGuard] Add address-taken IAT tables and delay-load support This patch adds support for creating Guard Address-Taken IAT Entry Tables (.giats$y sections) in object files, matching the behavior of MSVC. These contain lists of address-taken imported functions, which are used by the linker to create the final GIATS table. Additionally, if any DLLs are delay-loaded, the linker must look through the .giats tables and add the respective load thunks of address-taken imports to the GFIDS table, as these are also valid call targets. Reviewed By: rnk Differential Revision: https://reviews.llvm.org/D87544 --- lld/COFF/DLL.cpp | 10 ++ lld/COFF/ICF.cpp | 2 +- lld/COFF/InputFiles.cpp | 2 + lld/COFF/InputFiles.h | 7 +- lld/COFF/Symbols.h | 7 ++ lld/COFF/Writer.cpp | 46 ++++++- lld/test/COFF/giats.s | 117 ++++++++++++++++++ llvm/include/llvm/MC/MCObjectFileInfo.h | 2 + llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp | 47 +++++-- llvm/lib/MC/MCObjectFileInfo.cpp | 5 + llvm/test/CodeGen/WinCFGuard/cfguard-giats.ll | 22 ++++ llvm/tools/llvm-readobj/COFFDumper.cpp | 10 ++ 12 files changed, 259 insertions(+), 18 deletions(-) create mode 100644 lld/test/COFF/giats.s create mode 100644 llvm/test/CodeGen/WinCFGuard/cfguard-giats.ll diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index 50301ad91b1d58..e88a6b1bffb064 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -19,6 +19,7 @@ #include "DLL.h" #include "Chunks.h" +#include "SymbolTable.h" #include "llvm/Object/COFF.h" #include "llvm/Support/Endian.h" #include "llvm/Support/Path.h" @@ -653,9 +654,18 @@ void DelayLoadContents::create(Defined *h) { auto *c = make(extName, 0); names.push_back(make(c)); hintNames.push_back(c); + // Add a syntentic symbol for this load thunk, using the "__imp_load" + // prefix, in case this thunk needs to be added to the list of valid + // call targets for Control Flow Guard. + StringRef symName = saver.save("__imp_load_" + extName); + s->loadThunkSym = + cast(symtab->addSynthetic(symName, t)); } } thunks.push_back(tm); + StringRef tmName = + saver.save("__tailMerge_" + syms[0]->getDLLName().lower()); + symtab->addSynthetic(tmName, tm); // Terminate with null values. 
addresses.push_back(make(8)); names.push_back(make(8)); diff --git a/lld/COFF/ICF.cpp b/lld/COFF/ICF.cpp index 1b33634b63d6af..386f861fb27fb7 100644 --- a/lld/COFF/ICF.cpp +++ b/lld/COFF/ICF.cpp @@ -131,7 +131,7 @@ bool ICF::assocEquals(const SectionChunk *a, const SectionChunk *b) { auto considerForICF = [](const SectionChunk &assoc) { StringRef Name = assoc.getSectionName(); return !(Name.startswith(".debug") || Name == ".gfids$y" || - Name == ".gljmp$y"); + Name == ".giats$y" || Name == ".gljmp$y"); }; auto ra = make_filter_range(a->children(), considerForICF); auto rb = make_filter_range(b->children(), considerForICF); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index aaa00d0f7279a8..37f66131620e69 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -280,6 +280,8 @@ SectionChunk *ObjFile::readSection(uint32_t sectionNumber, debugChunks.push_back(c); else if (name == ".gfids$y") guardFidChunks.push_back(c); + else if (name == ".giats$y") + guardIATChunks.push_back(c); else if (name == ".gljmp$y") guardLJmpChunks.push_back(c); else if (name == ".sxdata") diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 0a5114b165f0cb..26a6e5b7b70d96 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -144,6 +144,7 @@ class ObjFile : public InputFile { ArrayRef getDebugChunks() { return debugChunks; } ArrayRef getSXDataChunks() { return sxDataChunks; } ArrayRef getGuardFidChunks() { return guardFidChunks; } + ArrayRef getGuardIATChunks() { return guardIATChunks; } ArrayRef getGuardLJmpChunks() { return guardLJmpChunks; } ArrayRef getSymbols() { return symbols; } @@ -283,9 +284,11 @@ class ObjFile : public InputFile { // 32-bit x86. std::vector sxDataChunks; - // Chunks containing symbol table indices of address taken symbols and longjmp - // targets. These are not linked into the final binary when /guard:cf is set. + // Chunks containing symbol table indices of address taken symbols, address + // taken IAT entries, and longjmp targets. These are not linked into the + // final binary when /guard:cf is set. std::vector guardFidChunks; + std::vector guardIATChunks; std::vector guardLJmpChunks; // This vector contains a list of all symbols defined or referenced by this diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h index 1da4df3669662e..370f72745900d4 100644 --- a/lld/COFF/Symbols.h +++ b/lld/COFF/Symbols.h @@ -343,6 +343,13 @@ class DefinedImportData : public Defined { uint16_t getOrdinal() { return file->hdr->OrdinalHint; } ImportFile *file; + + // This is a pointer to the synthetic symbol associated with the load thunk + // for this symbol that will be called if the DLL is delay-loaded. This is + // needed for Control Flow Guard because if this DefinedImportData symbol is a + // valid call target, the corresponding load thunk must also be marked as a + // valid call target. 
+ DefinedSynthetic *loadThunkSym; }; // This class represents a symbol for a jump table entry which jumps diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index d1081b008ea40e..b437a681483ff3 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -227,6 +227,9 @@ class Writer { void markSymbolsForRVATable(ObjFile *file, ArrayRef symIdxChunks, SymbolRVASet &tableSymbols); + void getSymbolsFromSections(ObjFile *file, + ArrayRef symIdxChunks, + std::vector &symbols); void maybeAddRVATable(SymbolRVASet tableSymbols, StringRef tableSym, StringRef countSym); void setSectionPermissions(); @@ -605,8 +608,9 @@ void Writer::run() { createImportTables(); createSections(); - createMiscChunks(); appendImportThunks(); + // Import thunks must be added before the Control Flow Guard tables are added. + createMiscChunks(); createExportTable(); mergeSections(); removeUnusedSections(); @@ -1618,6 +1622,8 @@ static void markSymbolsWithRelocations(ObjFile *file, // table. void Writer::createGuardCFTables() { SymbolRVASet addressTakenSyms; + SymbolRVASet giatsRVASet; + std::vector giatsSymbols; SymbolRVASet longJmpTargets; for (ObjFile *file : ObjFile::instances) { // If the object was compiled with /guard:cf, the address taken symbols @@ -1627,6 +1633,8 @@ void Writer::createGuardCFTables() { // possibly address-taken. if (file->hasGuardCF()) { markSymbolsForRVATable(file, file->getGuardFidChunks(), addressTakenSyms); + markSymbolsForRVATable(file, file->getGuardIATChunks(), giatsRVASet); + getSymbolsFromSections(file, file->getGuardIATChunks(), giatsSymbols); markSymbolsForRVATable(file, file->getGuardLJmpChunks(), longJmpTargets); } else { markSymbolsWithRelocations(file, addressTakenSyms); @@ -1641,6 +1649,16 @@ void Writer::createGuardCFTables() { for (Export &e : config->exports) maybeAddAddressTakenFunction(addressTakenSyms, e.sym); + // For each entry in the .giats table, check if it has a corresponding load + // thunk (e.g. because the DLL that defines it will be delay-loaded) and, if + // so, add the load thunk to the address taken (.gfids) table. + for (Symbol *s : giatsSymbols) { + if (auto *di = dyn_cast(s)) { + if (di->loadThunkSym) + addSymbolToRVASet(addressTakenSyms, di->loadThunkSym); + } + } + // Ensure sections referenced in the gfid table are 16-byte aligned. for (const ChunkAndOffset &c : addressTakenSyms) if (c.inputChunk->getAlignment() < 16) @@ -1649,6 +1667,10 @@ void Writer::createGuardCFTables() { maybeAddRVATable(std::move(addressTakenSyms), "__guard_fids_table", "__guard_fids_count"); + // Add the Guard Address Taken IAT Entry Table (.giats). + maybeAddRVATable(std::move(giatsRVASet), "__guard_iat_table", + "__guard_iat_count"); + // Add the longjmp target table unless the user told us not to. if (config->guardCF == GuardCFLevel::Full) maybeAddRVATable(std::move(longJmpTargets), "__guard_longjmp_table", @@ -1665,11 +1687,11 @@ void Writer::createGuardCFTables() { } // Take a list of input sections containing symbol table indices and add those -// symbols to an RVA table. The challenge is that symbol RVAs are not known and +// symbols to a vector. The challenge is that symbol RVAs are not known and // depend on the table size, so we can't directly build a set of integers. -void Writer::markSymbolsForRVATable(ObjFile *file, +void Writer::getSymbolsFromSections(ObjFile *file, ArrayRef symIdxChunks, - SymbolRVASet &tableSymbols) { + std::vector &symbols) { for (SectionChunk *c : symIdxChunks) { // Skip sections discarded by linker GC. 
This comes up when a .gfids section // is associated with something like a vtable and the vtable is discarded. @@ -1687,7 +1709,7 @@ void Writer::markSymbolsForRVATable(ObjFile *file, } // Read each symbol table index and check if that symbol was included in the - // final link. If so, add it to the table symbol set. + // final link. If so, add it to the vector of symbols. ArrayRef symIndices( reinterpret_cast(data.data()), data.size() / 4); ArrayRef objSymbols = file->getSymbols(); @@ -1699,12 +1721,24 @@ void Writer::markSymbolsForRVATable(ObjFile *file, } if (Symbol *s = objSymbols[symIndex]) { if (s->isLive()) - addSymbolToRVASet(tableSymbols, cast(s)); + symbols.push_back(cast(s)); } } } } +// Take a list of input sections containing symbol table indices and add those +// symbols to an RVA table. +void Writer::markSymbolsForRVATable(ObjFile *file, + ArrayRef symIdxChunks, + SymbolRVASet &tableSymbols) { + std::vector syms; + getSymbolsFromSections(file, symIdxChunks, syms); + + for (Symbol *s : syms) + addSymbolToRVASet(tableSymbols, cast(s)); +} + // Replace the absolute table symbol with a synthetic symbol pointing to // tableChunk so that we can emit base relocations for it and resolve section // relative relocations. diff --git a/lld/test/COFF/giats.s b/lld/test/COFF/giats.s new file mode 100644 index 00000000000000..f18720f3692faa --- /dev/null +++ b/lld/test/COFF/giats.s @@ -0,0 +1,117 @@ +# REQUIRES: x86 + +# Make a DLL that exports exportfn1. +# RUN: yaml2obj %p/Inputs/export.yaml -o %basename_t-exp.obj +# RUN: lld-link /out:%basename_t-exp.dll /dll %basename_t-exp.obj /export:exportfn1 /implib:%basename_t-exp.lib + +# Make an object file that imports exportfn1. +# RUN: llvm-mc -triple x86_64-windows-msvc %s -filetype=obj -o %basename_t.obj + +# Check that the Guard address-taken IAT entry tables are propagated to the final executable. +# RUN: lld-link %basename_t.obj -guard:cf -entry:main -out:%basename_t-nodelay.exe %basename_t-exp.lib +# RUN: llvm-readobj --file-headers --coff-load-config %basename_t-nodelay.exe | FileCheck %s --check-prefix CHECK + +# CHECK: ImageBase: 0x140000000 +# CHECK: LoadConfig [ +# CHECK: GuardCFFunctionTable: 0x140002114 +# CHECK: GuardCFFunctionCount: 1 +# CHECK: GuardFlags: 0x10500 +# CHECK: GuardAddressTakenIatEntryTable: 0x140002118 +# CHECK: GuardAddressTakenIatEntryCount: 1 +# CHECK: ] +# CHECK: GuardFidTable [ +# CHECK-NEXT: 0x14000{{.*}} +# CHECK-NEXT: ] +# CHECK: GuardIatTable [ +# CHECK-NEXT: 0x14000{{.*}} +# CHECK-NEXT: ] + + +# Check that the additional load thunk symbol is added to the GFIDs table. 
+# RUN: lld-link %basename_t.obj -guard:cf -entry:main -out:%basename_t-delay.exe %basename_t-exp.lib -alternatename:__delayLoadHelper2=main -delayload:%basename_t-exp.dll +# RUN: llvm-readobj --file-headers --coff-load-config %basename_t-delay.exe | FileCheck %s --check-prefix DELAY-CHECK + +# DELAY-CHECK: ImageBase: 0x140000000 +# DELAY-CHECK: LoadConfig [ +# DELAY-CHECK: GuardCFFunctionTable: 0x140002114 +# DELAY-CHECK: GuardCFFunctionCount: 2 +# DELAY-CHECK: GuardFlags: 0x10500 +# DELAY-CHECK: GuardAddressTakenIatEntryTable: 0x14000211C +# DELAY-CHECK: GuardAddressTakenIatEntryCount: 1 +# DELAY-CHECK: ] +# DELAY-CHECK: GuardFidTable [ +# DELAY-CHECK-NEXT: 0x14000{{.*}} +# DELAY-CHECK-NEXT: 0x14000{{.*}} +# DELAY-CHECK-NEXT: ] +# DELAY-CHECK: GuardIatTable [ +# DELAY-CHECK-NEXT: 0x14000{{.*}} +# DELAY-CHECK-NEXT: ] + + +# This assembly is reduced from C code like: +# __declspec(noinline) +# void IndirectCall(BOOL (func)(HANDLE)) { +# (*func)(NULL); +# } +# int main(int argc, char** argv) { +# IndirectCall(exportfn1); +# } + + .text + .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +.set @feat.00, 2048 + .def IndirectCall; .scl 2; .type 32; .endef + .globl IndirectCall # -- Begin function IndirectCall + .p2align 4, 0x90 +IndirectCall: # @IndirectCall +# %bb.0: + subq $40, %rsp + movq %rcx, 32(%rsp) + movq 32(%rsp), %rax + movq %rax, %rdx # This would otherwise have be: movq __guard_dispatch_icall_fptr(%rip), %rdx + xorl %ecx, %ecx + callq *%rdx + nop + addq $40, %rsp + retq + # -- End function + .def main; .scl 2; .type 32; .endef + .globl main # -- Begin function main + .p2align 4, 0x90 +main: # @main +# %bb.0: + subq $56, %rsp + movq __imp_exportfn1(%rip), %rax + movq %rdx, 48(%rsp) + movl %ecx, 44(%rsp) + movq %rax, %rcx + callq IndirectCall + xorl %eax, %eax + addq $56, %rsp + retq + # -- End function + .section .gfids$y,"dr" + .section .giats$y,"dr" + .symidx __imp_exportfn1 + .section .gljmp$y,"dr" + +# Load configuration directory entry (winnt.h _IMAGE_LOAD_CONFIG_DIRECTORY64). +# The linker will define the __guard_* symbols. 
+ .section .rdata,"dr" +.globl _load_config_used +_load_config_used: + .long 256 + .fill 124, 1, 0 + .quad __guard_fids_table + .quad __guard_fids_count + .long __guard_flags + .fill 12, 1, 0 + .quad __guard_iat_table + .quad __guard_iat_count + .quad __guard_longjmp_table + .quad __guard_fids_count + .fill 84, 1, 0 \ No newline at end of file diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index 8c6bcba2332b1a..316086833d9756 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -215,6 +215,7 @@ class MCObjectFileInfo { MCSection *XDataSection = nullptr; MCSection *SXDataSection = nullptr; MCSection *GFIDsSection = nullptr; + MCSection *GIATsSection = nullptr; MCSection *GLJMPSection = nullptr; // XCOFF specific sections @@ -398,6 +399,7 @@ class MCObjectFileInfo { MCSection *getXDataSection() const { return XDataSection; } MCSection *getSXDataSection() const { return SXDataSection; } MCSection *getGFIDsSection() const { return GFIDsSection; } + MCSection *getGIATsSection() const { return GIATsSection; } MCSection *getGLJMPSection() const { return GLJMPSection; } // XCOFF specific sections diff --git a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp index 914308d9147e2d..09bcf5cb25a215 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // // This file contains support for writing the metadata for Windows Control Flow -// Guard, including address-taken functions, and valid longjmp targets. +// Guard, including address-taken functions and valid longjmp targets. // //===----------------------------------------------------------------------===// @@ -17,8 +17,8 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Metadata.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCStreamer.h" @@ -78,20 +78,49 @@ static bool isPossibleIndirectCallTarget(const Function *F) { return false; } +/// Returns true if this function should be added to the Guard Address Taken IAT +/// Entry Table (GIATs) instead of the Guard Function ID Table (GFIDs). +static bool isIATAddressTaken(const Function *F) { + if (F->hasDLLImportStorageClass()) { + return true; + } + return false; +} + void WinCFGuard::endModule() { const Module *M = Asm->MMI->getModule(); - std::vector Functions; - for (const Function &F : *M) - if (isPossibleIndirectCallTarget(&F)) - Functions.push_back(&F); - if (Functions.empty() && LongjmpTargets.empty()) + std::vector GFIDsEntries; + std::vector GIATsEntries; + for (const Function &F : *M) { + if (isPossibleIndirectCallTarget(&F)) { + if (isIATAddressTaken(&F)) { + // If the possible call target is reached via the IAT, add it to the + // GIATs table instead of the GFIDs table. + GIATsEntries.push_back(&F); + } else { + // Otherwise add it to the GFIDs table. + GFIDsEntries.push_back(&F); + } + } + } + + if (GFIDsEntries.empty() && GIATsEntries.empty() && LongjmpTargets.empty()) return; + + // Emit the symbol index of each GFIDs entry to form the GFIDs table. 
auto &OS = *Asm->OutStreamer; OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGFIDsSection()); - for (const Function *F : Functions) + for (const Function *F : GFIDsEntries) OS.EmitCOFFSymbolIndex(Asm->getSymbol(F)); - // Emit the symbol index of each longjmp target. + // Emit the symbol index of each GIATs entry to form the GIATs table. + OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGIATsSection()); + for (const Function *F : GIATsEntries) { + OS.EmitCOFFSymbolIndex(Asm->OutContext.getOrCreateSymbol( + Twine("__imp_") + Asm->getSymbol(F)->getName())); + } + + // Emit the symbol index of each longjmp target to form the GLJMP table. OS.SwitchSection(Asm->OutContext.getObjectFileInfo()->getGLJMPSection()); for (const MCSymbol *S : LongjmpTargets) { OS.EmitCOFFSymbolIndex(S); diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index ae7345c4e05b91..eec2615974b578 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -752,6 +752,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { COFF::IMAGE_SCN_MEM_READ, SectionKind::getMetadata()); + GIATsSection = Ctx->getCOFFSection(".giats$y", + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ, + SectionKind::getMetadata()); + GLJMPSection = Ctx->getCOFFSection(".gljmp$y", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ, diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-giats.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-giats.ll new file mode 100644 index 00000000000000..0ac436cc6add59 --- /dev/null +++ b/llvm/test/CodeGen/WinCFGuard/cfguard-giats.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s +; Control Flow Guard is currently only available on Windows + +declare dllimport i32 @target_func() + +; Test address-taken functions from imported DLLs are added to the +; Guard Address-Taken IAT Entry table (.giats). 
+define i32 @func_cf_giats() { +entry: + %func_ptr = alloca i32 ()*, align 8 + store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + %1 = call i32 %0() + ret i32 %1 +} + +!llvm.module.flags = !{!0} +!0 = !{i32 2, !"cfguard", i32 2} + +; CHECK-LABEL: .section .giats$y,"dr" +; CHECK-NEXT: .symidx __imp_target_func +; CHECK-NOT: .symidx \ No newline at end of file diff --git a/llvm/tools/llvm-readobj/COFFDumper.cpp b/llvm/tools/llvm-readobj/COFFDumper.cpp index 22e27b3e5a29e1..b4fb2e52cb1996 100644 --- a/llvm/tools/llvm-readobj/COFFDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFDumper.cpp @@ -67,6 +67,8 @@ struct LoadConfigTables { uint32_t GuardFlags = 0; uint64_t GuardFidTableVA = 0; uint64_t GuardFidTableCount = 0; + uint64_t GuardIatTableVA = 0; + uint64_t GuardIatTableCount = 0; uint64_t GuardLJmpTableVA = 0; uint64_t GuardLJmpTableCount = 0; }; @@ -804,6 +806,11 @@ void COFFDumper::printCOFFLoadConfig() { } } + if (Tables.GuardIatTableVA) { + ListScope LS(W, "GuardIatTable"); + printRVATable(Tables.GuardIatTableVA, Tables.GuardIatTableCount, 4); + } + if (Tables.GuardLJmpTableVA) { ListScope LS(W, "GuardLJmpTable"); printRVATable(Tables.GuardLJmpTableVA, Tables.GuardLJmpTableCount, 4); @@ -888,6 +895,9 @@ void COFFDumper::printCOFFLoadConfig(const T *Conf, LoadConfigTables &Tables) { Conf->GuardRFVerifyStackPointerFunctionPointer); W.printHex("HotPatchTableOffset", Conf->HotPatchTableOffset); + Tables.GuardIatTableVA = Conf->GuardAddressTakenIatEntryTable; + Tables.GuardIatTableCount = Conf->GuardAddressTakenIatEntryCount; + Tables.GuardLJmpTableVA = Conf->GuardLongJumpTargetTable; Tables.GuardLJmpTableCount = Conf->GuardLongJumpTargetCount; } From bc730b5e43ad4b7efeca977359271fa0eaa7ed45 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 1 Oct 2020 12:49:59 +0100 Subject: [PATCH 7/7] [InstCombine] collectBitParts - use APInt directly to check for out of range bit shifts. NFCI. --- llvm/lib/Transforms/Utils/Local.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 0dacb266a063d2..550745673bd9f3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2872,10 +2872,10 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, // If this is a logical shift by a constant, recurse then shift the result. if (I->isLogicalShift() && isa(I->getOperand(1))) { - unsigned BitShift = - cast(I->getOperand(1))->getLimitedValue(~0U); + const APInt &BitShift = cast(I->getOperand(1))->getValue(); + // Ensure the shift amount is defined. - if (BitShift > BitWidth) + if (BitShift.uge(BitWidth)) return Result; const auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps, @@ -2887,11 +2887,11 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals, // Perform the "shift" on BitProvenance. auto &P = Result->Provenance; if (I->getOpcode() == Instruction::Shl) { - P.erase(std::prev(P.end(), BitShift), P.end()); - P.insert(P.begin(), BitShift, BitPart::Unset); + P.erase(std::prev(P.end(), BitShift.getZExtValue()), P.end()); + P.insert(P.begin(), BitShift.getZExtValue(), BitPart::Unset); } else { - P.erase(P.begin(), std::next(P.begin(), BitShift)); - P.insert(P.end(), BitShift, BitPart::Unset); + P.erase(P.begin(), std::next(P.begin(), BitShift.getZExtValue())); + P.insert(P.end(), BitShift.getZExtValue(), BitPart::Unset); } return Result;