From 3c3f6d877623d0d821f59f4ec6038b27f27ee01d Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Wed, 15 May 2024 13:53:38 +0300 Subject: [PATCH 01/71] [AMDGPU][AsmParser][NFC] Eliminate Match_PreferE32. (#92159) Was added in 88e0b251815563016ad50241dd592e304bc03ee5 and is unused since fcef407aa21ad5a79d66a088e6f2a66a5745725d. --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d47a5f8ebb8157..c08c35c459843c 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1388,9 +1388,6 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool IsAtomic); public: - enum AMDGPUMatchResultTy { - Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY - }; enum OperandMode { OperandMode_Default, OperandMode_NSA, @@ -5262,15 +5259,11 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, Variant); // We order match statuses from least to most specific. We use most specific // status as resulting - // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature < Match_PreferE32 - if ((R == Match_Success) || - (R == Match_PreferE32) || - (R == Match_MissingFeature && Result != Match_PreferE32) || - (R == Match_InvalidOperand && Result != Match_MissingFeature - && Result != Match_PreferE32) || - (R == Match_MnemonicFail && Result != Match_InvalidOperand - && Result != Match_MissingFeature - && Result != Match_PreferE32)) { + // Match_MnemonicFail < Match_InvalidOperand < Match_MissingFeature + if (R == Match_Success || R == Match_MissingFeature || + (R == Match_InvalidOperand && Result != Match_MissingFeature) || + (R == Match_MnemonicFail && Result != Match_InvalidOperand && + Result != Match_MissingFeature)) { Result = R; ErrorInfo = EI; } @@ -5316,9 +5309,6 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_PreferE32: - return Error(IDLoc, "internal error: instruction without _e64 suffix " - "should be encoded as e32"); case Match_MnemonicFail: llvm_unreachable("Invalid instructions should have been handled already"); } From de18f5ecf80ef7183625c80b04445c614a17c483 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 15 May 2024 06:23:34 -0500 Subject: [PATCH 02/71] [flang][OpenMP] Remove `allocate` from `taskgroup` in test (#92173) Remove the `allocate`, because it needs to be used together with a privatizing clause. The only such clause for `taskgroup` is `task_reduction`, but it's not yet supported. --- flang/test/Lower/OpenMP/taskgroup.f90 | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/flang/test/Lower/OpenMP/taskgroup.f90 b/flang/test/Lower/OpenMP/taskgroup.f90 index 76458f1f1127f3..d9d262bdd2c083 100644 --- a/flang/test/Lower/OpenMP/taskgroup.f90 +++ b/flang/test/Lower/OpenMP/taskgroup.f90 @@ -1,17 +1,14 @@ -! REQUIRES: openmp_runtime - !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s +! The "allocate" clause has been removed, because it needs to be used +! together with a privatizing clause. The only such clause for "taskgroup" +! is "task_reduction", but it's not yet supported. 
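+! Once task_reduction is supported, a combined form along these lines
+! could be checked instead (hypothetical; "res" is a placeholder name and
+! the frontend does not accept this yet):
+!   !$omp taskgroup task_reduction(+:res) allocate(omp_high_bw_mem_alloc: res)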
+
 !CHECK-LABEL: @_QPomp_taskgroup
 subroutine omp_taskgroup
-use omp_lib
-integer :: allocated_x
-!CHECK: %[[ALLOC_X_REF:.*]] = fir.alloca i32 {bindc_name = "allocated_x", uniq_name = "_QFomp_taskgroupEallocated_x"}
-!CHECK-NEXT: %[[ALLOC_X_DECL:.*]]:2 = hlfir.declare %[[ALLOC_X_REF]] {uniq_name = "_QFomp_taskgroupEallocated_x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK: %[[C4:.*]] = arith.constant 4 : i64
-
-!CHECK: omp.taskgroup allocate(%[[C4]] : i64 -> %[[ALLOC_X_DECL]]#1 : !fir.ref<i32>)
-!$omp taskgroup allocate(omp_high_bw_mem_alloc: allocated_x)
+!CHECK: omp.taskgroup
+!$omp taskgroup
+!CHECK: omp.task
 !$omp task
 !CHECK: fir.call @_QPwork() {{.*}}: () -> ()
 call work()

From e6ef836f23aa44520e0823c38e44b2f58eb5a52f Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Wed, 15 May 2024 06:45:57 -0500
Subject: [PATCH 03/71] [flang][OpenMP] Add -fopenmp-version=52 to teams.f90
 (#92180)

One of the functions in the test has `teams if(...)`. The `if` clause
was not allowed on the `teams` directive until OpenMP 5.2.
---
 flang/test/Lower/OpenMP/teams.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Lower/OpenMP/teams.f90 b/flang/test/Lower/OpenMP/teams.f90
index f122a578a6e160..b1b2e7080676e3 100644
--- a/flang/test/Lower/OpenMP/teams.f90
+++ b/flang/test/Lower/OpenMP/teams.f90
@@ -1,6 +1,6 @@
 ! REQUIRES: openmp_runtime

-! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s

 ! CHECK-LABEL: func @_QPteams_simple
 subroutine teams_simple()

From ccbf908b0836d8e3945f9331fd3679cbc6be0be1 Mon Sep 17 00:00:00 2001
From: Jan Patrick Lehr
Date: Wed, 15 May 2024 14:06:40 +0200
Subject: [PATCH 04/71] [libc] Fix GPU test build error (#92235)

This fixes a build error on the AMDGPU buildbot introduced in PR
https://github.com/llvm/llvm-project/pull/92172
---
 libc/src/__support/StringUtil/tables/stdc_errors.h | 3 +--
 libc/test/src/string/strerror_test.cpp | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/StringUtil/tables/stdc_errors.h b/libc/src/__support/StringUtil/tables/stdc_errors.h
index a9c1527834550f..6873d6bd51074b 100644
--- a/libc/src/__support/StringUtil/tables/stdc_errors.h
+++ b/libc/src/__support/StringUtil/tables/stdc_errors.h
@@ -15,11 +15,10 @@

 namespace LIBC_NAMESPACE {

-LIBC_INLINE_VAR constexpr const MsgTable<4> STDC_ERRORS = {
+LIBC_INLINE_VAR constexpr const MsgTable<3> STDC_ERRORS = {
     MsgMapping(0, "Success"),
     MsgMapping(EDOM, "Numerical argument out of domain"),
     MsgMapping(ERANGE, "Numerical result out of range"),
-    MsgMapping(EILSEQ, "Invalid or incomplete multibyte or wide character"),
 };

 } // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/string/strerror_test.cpp b/libc/test/src/string/strerror_test.cpp
index ec9827b75cfc82..2d6c230573a4bf 100644
--- a/libc/test/src/string/strerror_test.cpp
+++ b/libc/test/src/string/strerror_test.cpp
@@ -97,7 +97,7 @@ TEST(LlvmLibcStrErrorTest, KnownErrors) {
       ".lib section in a.out corrupted",
       "Attempting to link in too many shared libraries",
       "Cannot exec a shared library directly",
-      "Invalid or incomplete multibyte or wide character",
+      "Unknown Error 84", // Unknown
       "Interrupted system call should be restarted",
       "Streams pipe error",
       "Too many users",

From 1650f1b3d7f97ca95eb930984e74bdfd91b02b4e Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Wed, 15 May 2024 13:10:16 +0100
Subject: [PATCH 05/71] Fix typo "indicies" (#92232)

---
 clang/include/clang/AST/VTTBuilder.h | 6 +-
clang/lib/AST/VTTBuilder.cpp | 2 +- clang/lib/CodeGen/CGVTT.cpp | 17 ++--- clang/lib/CodeGen/CGVTables.h | 6 +- .../command/commands/DexExpectStepOrder.py | 2 +- flang/docs/HighLevelFIR.md | 2 +- flang/test/Lower/HLFIR/forall.f90 | 2 +- libc/src/stdio/printf_core/parser.h | 2 +- .../views/mdspan/CustomTestLayouts.h | 2 +- llvm/docs/GlobalISel/GenericOpcode.rst | 4 +- llvm/include/llvm/Target/Target.td | 4 +- llvm/lib/Analysis/DependenceAnalysis.cpp | 10 +-- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 12 ++-- llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 2 +- llvm/lib/Bitcode/Writer/ValueEnumerator.h | 2 +- .../LiveDebugValues/VarLocBasedImpl.cpp | 2 +- llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 2 +- llvm/lib/CodeGen/PrologEpilogInserter.cpp | 2 +- llvm/lib/Support/ELFAttributeParser.cpp | 10 +-- .../Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- .../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 10 +-- .../DXILWriter/DXILValueEnumerator.cpp | 2 +- .../DirectX/DXILWriter/DXILValueEnumerator.h | 2 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 2 +- .../Transforms/InstCombine/InstCombinePHI.cpp | 4 +- .../Scalar/SeparateConstOffsetFromGEP.cpp | 2 +- .../Utils/SampleProfileInference.cpp | 2 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 64 +++++++++---------- llvm/test/CodeGen/X86/avx-vperm2x128.ll | 2 +- .../test/DebugInfo/PDB/Inputs/every-type.yaml | 4 +- ...h-directive-personalityindex-diagnostics.s | 6 +- .../InstCombine/phi-extractvalue.ll | 8 +-- .../InstCombine/phi-of-insertvalues.ll | 6 +- .../VectorCombine/X86/scalarize-vector-gep.ll | 12 ++-- .../Linalg/Transforms/Vectorization.cpp | 6 +- 36 files changed, 114 insertions(+), 113 deletions(-) diff --git a/clang/include/clang/AST/VTTBuilder.h b/clang/include/clang/AST/VTTBuilder.h index 4acbc1f9e96b28..3c19e61a8701ca 100644 --- a/clang/include/clang/AST/VTTBuilder.h +++ b/clang/include/clang/AST/VTTBuilder.h @@ -92,7 +92,7 @@ class VTTBuilder { using AddressPointsMapTy = llvm::DenseMap; /// The sub-VTT indices for the bases of the most derived class. - llvm::DenseMap SubVTTIndicies; + llvm::DenseMap SubVTTIndices; /// The secondary virtual pointer indices of all subobjects of /// the most derived class. @@ -148,8 +148,8 @@ class VTTBuilder { } /// Returns a reference to the sub-VTT indices. - const llvm::DenseMap &getSubVTTIndicies() const { - return SubVTTIndicies; + const llvm::DenseMap &getSubVTTIndices() const { + return SubVTTIndices; } /// Returns a reference to the secondary virtual pointer indices. diff --git a/clang/lib/AST/VTTBuilder.cpp b/clang/lib/AST/VTTBuilder.cpp index d58e8751778520..464a2014c430a0 100644 --- a/clang/lib/AST/VTTBuilder.cpp +++ b/clang/lib/AST/VTTBuilder.cpp @@ -189,7 +189,7 @@ void VTTBuilder::LayoutVTT(BaseSubobject Base, bool BaseIsVirtual) { if (!IsPrimaryVTT) { // Remember the sub-VTT index. 
- SubVTTIndicies[Base] = VTTComponents.size(); + SubVTTIndices[Base] = VTTComponents.size(); } uint64_t VTableIndex = VTTVTables.size(); diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp index d2376b14dd5826..4cebb750c89e8f 100644 --- a/clang/lib/CodeGen/CGVTT.cpp +++ b/clang/lib/CodeGen/CGVTT.cpp @@ -138,23 +138,24 @@ uint64_t CodeGenVTables::getSubVTTIndex(const CXXRecordDecl *RD, BaseSubobject Base) { BaseSubobjectPairTy ClassSubobjectPair(RD, Base); - SubVTTIndiciesMapTy::iterator I = SubVTTIndicies.find(ClassSubobjectPair); - if (I != SubVTTIndicies.end()) + SubVTTIndicesMapTy::iterator I = SubVTTIndices.find(ClassSubobjectPair); + if (I != SubVTTIndices.end()) return I->second; VTTBuilder Builder(CGM.getContext(), RD, /*GenerateDefinition=*/false); - for (llvm::DenseMap::const_iterator I = - Builder.getSubVTTIndicies().begin(), - E = Builder.getSubVTTIndicies().end(); I != E; ++I) { + for (llvm::DenseMap::const_iterator + I = Builder.getSubVTTIndices().begin(), + E = Builder.getSubVTTIndices().end(); + I != E; ++I) { // Insert all indices. BaseSubobjectPairTy ClassSubobjectPair(RD, I->first); - SubVTTIndicies.insert(std::make_pair(ClassSubobjectPair, I->second)); + SubVTTIndices.insert(std::make_pair(ClassSubobjectPair, I->second)); } - I = SubVTTIndicies.find(ClassSubobjectPair); - assert(I != SubVTTIndicies.end() && "Did not find index!"); + I = SubVTTIndices.find(ClassSubobjectPair); + assert(I != SubVTTIndices.end() && "Did not find index!"); return I->second; } diff --git a/clang/lib/CodeGen/CGVTables.h b/clang/lib/CodeGen/CGVTables.h index 9d4223547050d4..c06bf7a525d9fd 100644 --- a/clang/lib/CodeGen/CGVTables.h +++ b/clang/lib/CodeGen/CGVTables.h @@ -38,10 +38,10 @@ class CodeGenVTables { typedef VTableLayout::AddressPointsMapTy VTableAddressPointsMapTy; typedef std::pair BaseSubobjectPairTy; - typedef llvm::DenseMap SubVTTIndiciesMapTy; + typedef llvm::DenseMap SubVTTIndicesMapTy; - /// SubVTTIndicies - Contains indices into the various sub-VTTs. - SubVTTIndiciesMapTy SubVTTIndicies; + /// SubVTTIndices - Contains indices into the various sub-VTTs. + SubVTTIndicesMapTy SubVTTIndices; typedef llvm::DenseMap SecondaryVirtualPointerIndicesMapTy; diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py b/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py index cb5579b523dcf8..d6954a440f1ae4 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/command/commands/DexExpectStepOrder.py @@ -12,7 +12,7 @@ class DexExpectStepOrder(CommandBase): """Expect the line every `DexExpectStepOrder` is found on to be stepped on - in `order`. Each instance must have a set of unique ascending indicies. + in `order`. Each instance must have a set of unique ascending indices. DexExpectStepOrder(*order) diff --git a/flang/docs/HighLevelFIR.md b/flang/docs/HighLevelFIR.md index de8dc5a1959bb9..2399efcdeacd3d 100644 --- a/flang/docs/HighLevelFIR.md +++ b/flang/docs/HighLevelFIR.md @@ -590,7 +590,7 @@ Syntax: Note that %indices are not operands, they are the elemental region block arguments, representing the array iteration space in a one based fashion. 
-The choice of using one based indicies is to match Fortran default for +The choice of using one based indices is to match Fortran default for array variables, so that there is no need to generate bound adjustments when working with one based array variables in an expression. diff --git a/flang/test/Lower/HLFIR/forall.f90 b/flang/test/Lower/HLFIR/forall.f90 index 9941ed19401038..c12f0c6a826b50 100644 --- a/flang/test/Lower/HLFIR/forall.f90 +++ b/flang/test/Lower/HLFIR/forall.f90 @@ -144,7 +144,7 @@ subroutine test_nested_foralls() ! ifoo and ibar could depend on x since it is a module ! variable use associated. The calls in the control value ! computation cannot be hoisted from the outer forall - ! even when they do not depend on outer forall indicies. + ! even when they do not depend on outer forall indices. forall (integer(8)::j=jfoo():jbar()) x(i, j) = x(j, i) end forall diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index eda978a83ea8af..b9a8f303dd6738 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -496,7 +496,7 @@ template class Parser { // the type of index, and returns a TypeDesc describing that type. It does not // modify cur_pos. LIBC_INLINE TypeDesc get_type_desc(size_t index) { - // index mode is assumed, and the indicies start at 1, so an index + // index mode is assumed, and the indices start at 1, so an index // of 0 is invalid. size_t local_pos = 0; diff --git a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h index 3ac142cce3a348..588a5e9774a553 100644 --- a/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h +++ b/libcxx/test/std/containers/views/mdspan/CustomTestLayouts.h @@ -29,7 +29,7 @@ #include // Layout that wraps indices to test some idiosyncratic behavior -// - basically it is a layout_left where indicies are first wrapped i.e. i%Wrap +// - basically it is a layout_left where indices are first wrapped i.e. i%Wrap // - only accepts integers as indices // - is_always_strided and is_always_unique are false // - is_strided and is_unique are true if all extents are smaller than Wrap diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 52dc039df7779e..5c28c6fcd30fb6 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -644,7 +644,7 @@ source vector should be inserted into. The index must be a constant multiple of the second source vector's minimum vector length. If the vectors are scalable, then the index is first scaled by the runtime scaling factor. The indices inserted in the source vector must be -valid indicies of that vector. If this condition cannot be determined statically +valid indices of that vector. If this condition cannot be determined statically but is false at runtime, then the result vector is undefined. .. code-block:: none @@ -661,7 +661,7 @@ the source vector. The index must be a constant multiple of the source vector's minimum vector length. If the source vector is a scalable vector, then the index is first scaled by the runtime scaling factor. The indices extracted from the source -vector must be valid indicies of that vector. If this condition cannot be +vector must be valid indices of that vector. If this condition cannot be determined statically but is false at runtime, then the result vector is undefined. 
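As a concrete illustration of the extract-subvector rule described above,
pulling the low half out of a scalable vector could be written as follows (a
sketch in the document's generic-MIR example notation; the virtual register
numbers are illustrative only):

.. code-block:: none

  %1:_(<vscale x 2 x i32>) = G_EXTRACT_SUBVECTOR %0:_(<vscale x 4 x i32>), 0

An index of 0 trivially satisfies the multiple-of-minimum-vector-length
requirement, so the extracted indices are valid for any runtime scaling
factor.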
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 1f7dc6922f13e4..34332386085870 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -765,8 +765,8 @@ class Instruction : InstructionEncoding { /// Should generate helper functions that help you to map a logical operand's /// index to the underlying MIOperand's index. - /// In most architectures logical operand indicies are equal to - /// MIOperand indicies, but for some CISC architectures, a logical operand + /// In most architectures logical operand indices are equal to + /// MIOperand indices, but for some CISC architectures, a logical operand /// might be consist of multiple MIOperand (e.g. a logical operand that /// uses complex address mode). bit UseLogicalOperandMappings = false; diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 1bce9aae09bb26..e0e7dd18cd8d48 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3444,9 +3444,9 @@ bool DependenceInfo::tryDelinearizeFixedSize( // iff the subscripts are positive and are less than the range of the // dimension. if (!DisableDelinearizationChecks) { - auto AllIndiciesInRange = [&](SmallVector &DimensionSizes, - SmallVectorImpl &Subscripts, - Value *Ptr) { + auto AllIndicesInRange = [&](SmallVector &DimensionSizes, + SmallVectorImpl &Subscripts, + Value *Ptr) { size_t SSize = Subscripts.size(); for (size_t I = 1; I < SSize; ++I) { const SCEV *S = Subscripts[I]; @@ -3462,8 +3462,8 @@ bool DependenceInfo::tryDelinearizeFixedSize( return true; }; - if (!AllIndiciesInRange(SrcSizes, SrcSubscripts, SrcPtr) || - !AllIndiciesInRange(DstSizes, DstSubscripts, DstPtr)) { + if (!AllIndicesInRange(SrcSizes, SrcSubscripts, SrcPtr) || + !AllIndicesInRange(DstSizes, DstSubscripts, DstPtr)) { SrcSubscripts.clear(); DstSubscripts.clear(); return false; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 6d01e3b4d82189..c4cea3d6eef2d8 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -986,7 +986,7 @@ void ModuleBitcodeWriter::writeTypeTable() { Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */); SmallVector TypeVals; - uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); + uint64_t NumBits = VE.computeBitsRequiredForTypeIndices(); // Abbrev for TYPE_CODE_OPAQUE_POINTER. 
auto Abbv = std::make_shared<BitCodeAbbrev>();
@@ -3721,7 +3721,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
-                              VE.computeBitsRequiredForTypeIndicies()));
+                              VE.computeBitsRequiredForTypeIndices()));
     if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
         CONSTANTS_SETTYPE_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
@@ -3741,7 +3741,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
     Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // cast opc
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // typeid
-                              VE.computeBitsRequiredForTypeIndicies()));
+                              VE.computeBitsRequiredForTypeIndices()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));    // value id

     if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
@@ -3763,7 +3763,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
-                              VE.computeBitsRequiredForTypeIndicies()));
+                              VE.computeBitsRequiredForTypeIndices()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
@@ -3815,7 +3815,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
-                              VE.computeBitsRequiredForTypeIndicies()));
+                              VE.computeBitsRequiredForTypeIndices()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
         FUNCTION_INST_CAST_ABBREV)
@@ -3826,7 +3826,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
-                              VE.computeBitsRequiredForTypeIndicies()));
+                              VE.computeBitsRequiredForTypeIndices()));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index e6787e245a49b2..631f31cba97675 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -1191,6 +1191,6 @@ unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const {
   return getGlobalBasicBlockID(BB);
 }

-uint64_t ValueEnumerator::computeBitsRequiredForTypeIndicies() const {
+uint64_t ValueEnumerator::computeBitsRequiredForTypeIndices() const {
   return Log2_32_Ceil(getTypes().size() + 1);
 }
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/llvm/lib/Bitcode/Writer/ValueEnumerator.h
index 4b45503595f6f4..8348d6728a5c31 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.h
@@ -234,7 +234,7 @@ class ValueEnumerator {
   void incorporateFunction(const Function &F);
   void purgeFunction();

-  uint64_t computeBitsRequiredForTypeIndicies() const;
+  uint64_t computeBitsRequiredForTypeIndices() const;

 private:
   void OptimizeConstants(unsigned CstStart,
unsigned CstEnd); diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index bf730be00a9a92..e146fb7e576819 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -86,7 +86,7 @@ /// lookup the VarLoc in the VarLocMap. Rather than operate directly on machine /// locations, the dataflow analysis in this pass identifies locations by their /// indices in the VarLocMap, meaning all the variable locations in a block can -/// be described by a sparse vector of VarLocMap indicies. +/// be described by a sparse vector of VarLocMap indices. /// /// All the storage for the dataflow analysis is local to the ExtendRanges /// method and passed down to helper methods. "OutLocs" and "InLocs" record the diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 114e7910dc27bb..f3a961f883517f 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -212,7 +212,7 @@ static const std::vector PerLiveRangeShape{1, NumberOfInterferences}; M(float, mbb_frequencies, MBBFrequencyShape, \ "A vector of machine basic block frequencies") \ M(int64_t, mbb_mapping, InstructionsShape, \ - "A vector of indicies mapping instructions to MBBs") + "A vector of indices mapping instructions to MBBs") #else #define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) #define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index eaf96ec5cbde8c..6a72797de493d4 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1444,7 +1444,7 @@ bool PEI::replaceFrameIndexDebugInstr(MachineFunction &MF, MachineInstr &MI, // pointer as the base register. 
if (MI.getOpcode() == TargetOpcode::STATEPOINT) { assert((!MI.isDebugValue() || OpIdx == 0) && - "Frame indicies can only appear as the first operand of a " + "Frame indices can only appear as the first operand of a " "DBG_VALUE machine instruction"); Register Reg; MachineOperand &Offset = MI.getOperand(OpIdx + 1); diff --git a/llvm/lib/Support/ELFAttributeParser.cpp b/llvm/lib/Support/ELFAttributeParser.cpp index d3100c9ebb211b..26c3d54e17ade8 100644 --- a/llvm/lib/Support/ELFAttributeParser.cpp +++ b/llvm/lib/Support/ELFAttributeParser.cpp @@ -154,7 +154,7 @@ Error ELFAttributeParser::parseSubsection(uint32_t length) { Twine::utohexstr(cursor.tell() - 5)); StringRef scopeName, indexName; - SmallVector indicies; + SmallVector indices; switch (tag) { case ELFAttrs::File: scopeName = "FileAttributes"; @@ -162,12 +162,12 @@ Error ELFAttributeParser::parseSubsection(uint32_t length) { case ELFAttrs::Section: scopeName = "SectionAttributes"; indexName = "Sections"; - parseIndexList(indicies); + parseIndexList(indices); break; case ELFAttrs::Symbol: scopeName = "SymbolAttributes"; indexName = "Symbols"; - parseIndexList(indicies); + parseIndexList(indices); break; default: return createStringError(errc::invalid_argument, @@ -178,8 +178,8 @@ Error ELFAttributeParser::parseSubsection(uint32_t length) { if (sw) { DictScope scope(*sw, scopeName); - if (!indicies.empty()) - sw->printList(indexName, indicies); + if (!indices.empty()) + sw->printList(indexName, indices); if (Error e = parseAttributeList(size - 5)) return e; } else if (Error e = parseAttributeList(size - 5)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index afa023220d357b..6223c211b33b68 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23881,7 +23881,7 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, // For "scalar + vector of indices", just scale the indices. This only // applies to non-temporal scatters because there's no instruction that takes - // indicies. + // indices. if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits()); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8f741ffc58a827..89e83babcfef42 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7053,7 +7053,7 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, DAG.getSplatBuildVector(VecVT, SL, InsVal)); - // 2. Mask off all other indicies except the required index within (1). + // 2. Mask off all other indices except the required index within (1). SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); // 3. Mask off the required index within the target vector. 
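Taken together, the three numbered steps above compute
NewVec = (splat(Val) & BFM) | (Vec & ~BFM). The following standalone C++
sketch models the same trick for a hypothetical 2 x i32 vector packed into a
64-bit integer (an illustration of the technique only, not the actual
lowering code):

#include <cstdint>

// Insert Val into lane Idx (0 or 1) of a 2 x i32 vector packed in a
// uint64_t, using only bitwise operations -- the scalar analogue of the
// three steps commented above.
static uint64_t insertLaneByMasking(uint64_t Vec, uint32_t Val, unsigned Idx) {
  uint64_t BFM = 0xFFFFFFFFull << (32 * Idx);  // bitfield mask for lane Idx
  uint64_t Ext = 0x0000000100000001ull * Val;  // step 1: splat Val to all lanes
  uint64_t LHS = Ext & BFM;                    // step 2: keep only lane Idx
  uint64_t RHS = Vec & ~BFM;                   // step 3: clear lane Idx in Vec
  return LHS | RHS;
}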
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index ebb269c6e6e067..87297ac86b9d25 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -958,7 +958,7 @@ void DXILBitcodeWriter::writeTypeTable() { Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */); SmallVector TypeVals; - uint64_t NumBits = VE.computeBitsRequiredForTypeIndicies(); + uint64_t NumBits = VE.computeBitsRequiredForTypeIndices(); // Abbrev for TYPE_CODE_POINTER. auto Abbv = std::make_shared(); @@ -2747,7 +2747,7 @@ void DXILBitcodeWriter::writeBlockInfo() { auto Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, std::move(Abbv)) != CONSTANTS_SETTYPE_ABBREV) assert(false && "Unexpected abbrev ordering!"); @@ -2767,7 +2767,7 @@ void DXILBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // typeid - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, std::move(Abbv)) != @@ -2789,7 +2789,7 @@ void DXILBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) != @@ -2822,7 +2822,7 @@ void DXILBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, // dest ty - VE.computeBitsRequiredForTypeIndicies())); + VE.computeBitsRequiredForTypeIndices())); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, std::move(Abbv)) != (unsigned)FUNCTION_INST_CAST_ABBREV) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp index d90ab968c5d313..9a8d0afa629261 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp @@ -1140,6 +1140,6 @@ unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const { return getGlobalBasicBlockID(BB); } -uint64_t ValueEnumerator::computeBitsRequiredForTypeIndicies() const { +uint64_t ValueEnumerator::computeBitsRequiredForTypeIndices() const { return Log2_32_Ceil(getTypes().size() + 1); } diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h index 66a5d96080bc37..f0f91c6182e32e 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.h @@ -236,7 +236,7 @@ 
class ValueEnumerator { void incorporateFunction(const Function &F); void purgeFunction(); - uint64_t computeBitsRequiredForTypeIndicies() const; + uint64_t computeBitsRequiredForTypeIndices() const; void EnumerateType(Type *T); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 0a7483fc45b203..ad86c393ba7919 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14945,7 +14945,7 @@ static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { } } - // If the vector extract indicies are not correct, add the appropriate + // If the vector extract indices are not correct, add the appropriate // vector_shuffle. int TgtElemArrayIdx; int InputSize = Input.getValueType().getScalarSizeInBits(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 52803e9bea451e..dd8eb4688d3c9d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -331,7 +331,7 @@ Instruction * InstCombinerImpl::foldPHIArgInsertValueInstructionIntoPHI(PHINode &PN) { auto *FirstIVI = cast(PN.getIncomingValue(0)); - // Scan to see if all operands are `insertvalue`'s with the same indicies, + // Scan to see if all operands are `insertvalue`'s with the same indices, // and all have a single use. for (Value *V : drop_begin(PN.incoming_values())) { auto *I = dyn_cast(V); @@ -371,7 +371,7 @@ Instruction * InstCombinerImpl::foldPHIArgExtractValueInstructionIntoPHI(PHINode &PN) { auto *FirstEVI = cast(PN.getIncomingValue(0)); - // Scan to see if all operands are `extractvalue`'s with the same indicies, + // Scan to see if all operands are `extractvalue`'s with the same indices, // and all have a single use. for (Value *V : drop_begin(PN.incoming_values())) { auto *I = dyn_cast(V); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 9f85396cde2598..1a9eaf28f6e47e 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1008,7 +1008,7 @@ bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP, } IRBuilder<> Builder(GEP); - // For trivial GEP chains, we can swap the indicies. + // For trivial GEP chains, we can swap the indices. Value *NewSrc = Builder.CreateGEP( GEP->getSourceElementType(), PtrGEP->getPointerOperand(), SmallVector(GEP->indices()), "", IsChainInBounds); diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 101b70d8def4a4..54d46117729c98 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -1061,7 +1061,7 @@ void initializeNetwork(const ProfiParams &Params, MinCostMaxFlow &Network, assert(NumJumps > 0 && "Too few jumps in a function"); // Introducing dummy source/sink pairs to allow flow circulation. - // The nodes corresponding to blocks of the function have indicies in + // The nodes corresponding to blocks of the function have indices in // the range [0 .. 2 * NumBlocks); the dummy sources/sinks are indexed by the // next four values. 
uint64_t S = 2 * NumBlocks; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2e0a39c4b4fdc3..d21b5e1cc04178 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6485,7 +6485,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - SmallVector ReuseShuffleIndicies; + SmallVector ReuseShuffleIndices; SmallVector UniqueValues; SmallVector NonUniqueValueVL; auto TryToFindDuplicates = [&](const InstructionsState &S, @@ -6494,19 +6494,19 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DenseMap UniquePositions(VL.size()); for (Value *V : VL) { if (isConstant(V)) { - ReuseShuffleIndicies.emplace_back( + ReuseShuffleIndices.emplace_back( isa(V) ? PoisonMaskElem : UniqueValues.size()); UniqueValues.emplace_back(V); continue; } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(Res.first->second); + ReuseShuffleIndices.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } size_t NumUniqueScalarValues = UniqueValues.size(); if (NumUniqueScalarValues == VL.size()) { - ReuseShuffleIndicies.clear(); + ReuseShuffleIndices.clear(); } else { // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { @@ -6532,7 +6532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, })) { unsigned PWSz = PowerOf2Ceil(UniqueValues.size()); if (PWSz == VL.size()) { - ReuseShuffleIndicies.clear(); + ReuseShuffleIndices.clear(); } else { NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); NonUniqueValueVL.append(PWSz - UniqueValues.size(), @@ -6579,7 +6579,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6590,7 +6590,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6694,7 +6694,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. 
\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6722,7 +6722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } } else { @@ -6745,7 +6745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, << ") is already in tree.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } } @@ -6757,7 +6757,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } } @@ -6810,7 +6810,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); if (State == TreeEntry::NeedToGather) { newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); return; } @@ -6832,7 +6832,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); NonScheduledFirst.insert(VL.front()); return; } @@ -6845,7 +6845,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto *PH = cast(VL0); TreeEntry *TE = - newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); // Keeps the reordered operands to avoid code duplication. @@ -6862,7 +6862,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -6881,7 +6881,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Insert new order with initial value 0, if it does not exist, // otherwise return the iterator to the existing one. newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndices, CurrentOrder); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; @@ -6890,7 +6890,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } case Instruction::InsertElement: { - assert(ReuseShuffleIndicies.empty() && "All inserts should be unique"); + assert(ReuseShuffleIndices.empty() && "All inserts should be unique"); auto OrdCompare = [](const std::pair &P1, const std::pair &P2) { @@ -6941,12 +6941,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. 
TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndices, CurrentOrder); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } TE->setOperandsInOrder(); @@ -6955,10 +6955,10 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Vectorizing non-consecutive loads with `llvm.masked.gather`. if (CurrentOrder.empty()) { TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); + UserTreeIdx, ReuseShuffleIndices); } else { TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); + UserTreeIdx, ReuseShuffleIndices, CurrentOrder); } TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n"); @@ -6966,7 +6966,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case TreeEntry::ScatterVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); + UserTreeIdx, ReuseShuffleIndices); TE->setOperandsInOrder(); buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); @@ -7020,7 +7020,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ExtraBitWidthNodes.insert(VectorizableTree.size() + 1); } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); TE->setOperandsInOrder(); @@ -7039,7 +7039,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast(VL0)->getPredicate(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -7100,7 +7100,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::Or: case Instruction::Xor: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -7128,7 +7128,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } case Instruction::GetElementPtr: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); SmallVector Operands(2); // Prepare the operand vector for pointer operands. @@ -7194,14 +7194,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. 
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { fixupOrderingIndices(CurrentOrder); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); + ReuseShuffleIndices, CurrentOrder); TE->setOperandsInOrder(); buildTree_rec(Operands, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); @@ -7215,7 +7215,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isCommutative(VL0)) { @@ -7261,7 +7261,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } case Instruction::ShuffleVector: { TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); + ReuseShuffleIndices); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -12107,8 +12107,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, unsigned VF = E->getVectorFactor(); bool NeedFreeze = false; - SmallVector ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(), - E->ReuseShuffleIndices.end()); + SmallVector ReuseShuffleIndices(E->ReuseShuffleIndices.begin(), + E->ReuseShuffleIndices.end()); SmallVector GatheredScalars(E->Scalars.begin(), E->Scalars.end()); // Build a mask out of the reorder indices and reorder scalars per this // mask. 
diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll index a11b92a663c45d..60fab8bc673792 100644 --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -234,7 +234,7 @@ entry: ret <16 x i16> %shuffle } -;;;; Cases with undef indicies mixed in the mask +;;;; Cases with undef indices mixed in the mask define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { ; ALL-LABEL: shuffle_v8f32_uu67u9ub: diff --git a/llvm/test/DebugInfo/PDB/Inputs/every-type.yaml b/llvm/test/DebugInfo/PDB/Inputs/every-type.yaml index 191c49042be6b9..4cd3034fe40ab6 100644 --- a/llvm/test/DebugInfo/PDB/Inputs/every-type.yaml +++ b/llvm/test/DebugInfo/PDB/Inputs/every-type.yaml @@ -19,11 +19,11 @@ TpiStream: # (int, char **) [Index: 0x1003] - Kind: LF_ARGLIST ArgList: - ArgIndicies: [ 116, 0x1002 ] + ArgIndices: [ 116, 0x1002 ] # (int, double) [Index: 0x1004] - Kind: LF_ARGLIST ArgList: - ArgIndicies: [ 116, 65 ] # (int, double) + ArgIndices: [ 116, 65 ] # (int, double) # int main(int argc, char **argv) [Index: 0x1005] - Kind: LF_PROCEDURE Procedure: diff --git a/llvm/test/MC/ARM/eh-directive-personalityindex-diagnostics.s b/llvm/test/MC/ARM/eh-directive-personalityindex-diagnostics.s index 2dc2c8045a65ca..0158035c682c70 100644 --- a/llvm/test/MC/ARM/eh-directive-personalityindex-diagnostics.s +++ b/llvm/test/MC/ARM/eh-directive-personalityindex-diagnostics.s @@ -65,10 +65,10 @@ multiple_personality: @ CHECK: .personalityindex 0 @ CHECK: ^ - .global multiple_personality_indicies - .type multiple_personality_indicies,%function + .global multiple_personality_indices + .type multiple_personality_indices,%function .thumb_func -multiple_personality_indicies: +multiple_personality_indices: .fnstart .personalityindex 0 .personalityindex 1 diff --git a/llvm/test/Transforms/InstCombine/phi-extractvalue.ll b/llvm/test/Transforms/InstCombine/phi-extractvalue.ll index 75fd4718721cc4..26893c178f590f 100644 --- a/llvm/test/Transforms/InstCombine/phi-extractvalue.ll +++ b/llvm/test/Transforms/InstCombine/phi-extractvalue.ll @@ -131,7 +131,7 @@ end: ret i32 %r } -; But the indicies must match +; But the indices must match define i32 @test4({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: @@ -162,7 +162,7 @@ end: ret i32 %r } -; More complex aggregates are fine, too, as long as indicies match. +; More complex aggregates are fine, too, as long as indices match. define i32 @test5({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: @@ -192,7 +192,7 @@ end: ret i32 %r } -; The indicies must fully match, on all levels. +; The indices must fully match, on all levels. define i32 @test6({{ i32, i32 }, { i32, i32 }} %agg_left, {{ i32, i32 }, { i32, i32 }} %agg_right, i1 %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: @@ -282,7 +282,7 @@ end: } ; Also, unlike PHI-of-insertvalues, here the base aggregates of extractvalue -; can have different types, and just checking the indicies is not enough. +; can have different types, and just checking the indices is not enough. 
+; can have different types, and just checking the indices is not enough.
define i32 @test9({ i32, i32 } %agg_left, { i32, { i32, i32 } } %agg_right, i1 %c) { ; CHECK-LABEL: @test9( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll b/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll index 548fb3bc9ddb01..3596ef198303e1 100644 --- a/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll +++ b/llvm/test/Transforms/InstCombine/phi-of-insertvalues.ll @@ -192,7 +192,7 @@ end: ret { i32, i32 } %r } -; But the indicies must match +; But the indices must match define { i32, i32 } @test6({ i32, i32 } %agg, i32 %val_left, i32 %val_right, i1 %c) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: @@ -223,7 +223,7 @@ end: ret { i32, i32 } %r } -; More complex aggregates are fine, too, as long as indicies match. +; More complex aggregates are fine, too, as long as indices match. define {{ i32, i32 }, { i32, i32 }} @test7({{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, i32 %val_right, i1 %c) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: entry: @@ -253,7 +253,7 @@ end: ret {{ i32, i32 }, { i32, i32 }} %r } -; The indicies must fully match, on all levels. +; The indices must fully match, on all levels. define {{ i32, i32 }, { i32, i32 }} @test8({{ i32, i32 }, { i32, i32 }} %agg, i32 %val_left, i32 %val_right, i1 %c) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll index e227e9911bc345..ccdc007f674f4a 100644 --- a/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll +++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-vector-gep.ll @@ -85,8 +85,8 @@ define void @both_operands_need_extraction.4elts(<4 x ptr> %baseptrs, <4 x i64> ;------------------------------------------------------------------------------- -define void @indicies_need_extraction.2elts(ptr %baseptr, <2 x i64> %indices) { -; CHECK-LABEL: @indicies_need_extraction.2elts( +define void @indices_need_extraction.2elts(ptr %baseptr, <2 x i64> %indices) { +; CHECK-LABEL: @indices_need_extraction.2elts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, ptr [[BASEPTR:%.*]], <2 x i64> [[INDICES:%.*]] ; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <2 x ptr> [[PTRS]], i64 0 ; CHECK-NEXT: call void @use(ptr [[PTR_0]]) @@ -105,8 +105,8 @@ define void @indicies_need_extraction.2elts(ptr %baseptr, <2 x i64> %indices) { ret void } -define void @indicies_need_extraction.3elts(ptr %baseptr, <3 x i64> %indices) { -; CHECK-LABEL: @indicies_need_extraction.3elts( +define void @indices_need_extraction.3elts(ptr %baseptr, <3 x i64> %indices) { +; CHECK-LABEL: @indices_need_extraction.3elts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, ptr [[BASEPTR:%.*]], <3 x i64> [[INDICES:%.*]] ; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <3 x ptr> [[PTRS]], i64 0 ; CHECK-NEXT: call void @use(ptr [[PTR_0]]) @@ -130,8 +130,8 @@ define void @indicies_need_extraction.3elts(ptr %baseptr, <3 x i64> %indices) { ret void } -define void @indicies_need_extraction.4elts(ptr %baseptr, <4 x i64> %indices) { -; CHECK-LABEL: @indicies_need_extraction.4elts( +define void @indices_need_extraction.4elts(ptr %baseptr, <4 x i64> %indices) { +; CHECK-LABEL: @indices_need_extraction.4elts( ; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i64, ptr [[BASEPTR:%.*]], <4 x i64> [[INDICES:%.*]] ; CHECK-NEXT: [[PTR_0:%.*]] = extractelement <4 x ptr> [[PTRS]], i64 0 ; CHECK-NEXT: call void @use(ptr [[PTR_0]]) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp 
b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 7b4507c52e02c7..511835a226e7a2 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -3385,13 +3385,13 @@ struct Conv1DGenerator auto rhsSize = cast(rhs.getType()).getShape()[0]; auto resSize = cast(res.getType()).getShape()[1]; - SmallVector indicies; + SmallVector indices; for (int i = 0; i < resSize / rhsSize; ++i) { for (int j = 0; j < rhsSize; ++j) - indicies.push_back(j); + indices.push_back(j); } - rhs = rewriter.create(loc, rhs, rhs, indicies); + rhs = rewriter.create(loc, rhs, rhs, indices); } // Broadcast the filter to match the output vector rhs = rewriter.create( From 398162ddbcf741c49e86bef2ef4aaa3fd0213916 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 15 May 2024 07:12:25 -0500 Subject: [PATCH 06/71] [libc] Fix typo in test message --- libc/test/src/string/strerror_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/test/src/string/strerror_test.cpp b/libc/test/src/string/strerror_test.cpp index 2d6c230573a4bf..cfc79481699bc8 100644 --- a/libc/test/src/string/strerror_test.cpp +++ b/libc/test/src/string/strerror_test.cpp @@ -97,7 +97,7 @@ TEST(LlvmLibcStrErrorTest, KnownErrors) { ".lib section in a.out corrupted", "Attempting to link in too many shared libraries", "Cannot exec a shared library directly", - "Unknown Error 84", // Unknown + "Unknown error 84", // Unknown "Interrupted system call should be restarted", "Streams pipe error", "Too many users", From 3b5a121a2478e586f59e3277d04d17fb63be5d76 Mon Sep 17 00:00:00 2001 From: Julian Schmidt Date: Wed, 15 May 2024 14:35:07 +0200 Subject: [PATCH 07/71] [clang-tidy][NFC] replace comparison of begin and end iterators with range empty (#91994) Improves readability by changing comparisons of `*_begin` and `*_end` iterators into `.empty()` on their range. --- .../clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp | 3 +-- .../MisleadingCaptureDefaultByValueCheck.cpp | 3 +-- clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp | 4 +--- .../clang-tidy/modernize/UseConstraintsCheck.cpp | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp index ca1ae551cc632a..2fca7ae2e7eee8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.cpp @@ -171,8 +171,7 @@ void SuspiciousEnumUsageCheck::check(const MatchFinder::MatchResult &Result) { // Skip when one of the parameters is an empty enum. The // hasDisjointValueRange function could not decide the values properly in // case of an empty enum. 
-  if (EnumDec->enumerator_begin() == EnumDec->enumerator_end() ||
-      OtherEnumDec->enumerator_begin() == OtherEnumDec->enumerator_end())
+  if (EnumDec->enumerators().empty() || OtherEnumDec->enumerators().empty())
     return;

   if (!hasDisjointValueRange(EnumDec, OtherEnumDec))
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp
index 00dfa17a1ccf61..5dee7f91a93410 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.cpp
@@ -67,8 +67,7 @@ static std::string createReplacementText(const LambdaExpr *Lambda) {
       AppendName("this");
     }
   }
-  if (!Replacement.empty() &&
-      Lambda->explicit_capture_begin() != Lambda->explicit_capture_end()) {
+  if (!Replacement.empty() && !Lambda->explicit_captures().empty()) {
     // Add back separator if we are adding explicit capture variables.
     Stream << ", ";
   }
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp
index 3f1d2f9f580991..c2d9286312dc4a 100644
--- a/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/UnusedParametersCheck.cpp
@@ -192,9 +192,7 @@ void UnusedParametersCheck::check(const MatchFinder::MatchResult &Result) {

   // In non-strict mode ignore function definitions with empty bodies
   // (constructor initializer counts for non-empty body).
-  if (StrictMode ||
-      (Function->getBody()->child_begin() !=
-       Function->getBody()->child_end()) ||
+  if (StrictMode || !Function->getBody()->children().empty() ||
       (isa<CXXConstructorDecl>(Function) &&
        cast<CXXConstructorDecl>(Function)->getNumCtorInitializers() > 0))
     warnOnUnusedParameter(Result, Function, I);
diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
index 6d7d1d6b87c60a..1585925ee99672 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
@@ -254,7 +254,7 @@ findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) {
       return utils::lexer::findPreviousTokenKind(Init->getSourceLocation(), SM,
                                                  LangOpts, tok::colon);
     }
-    if (Constructor->init_begin() != Constructor->init_end())
+    if (!Constructor->inits().empty())
       return std::nullopt;
   }
   if (Function->isDeleted()) {

From 8a71284cb9463a90fab0d9e8edbeb5d879531e32 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 15 May 2024 13:43:27 +0100
Subject: [PATCH 08/71] [MC][X86] Cleanup check prefixes identified in #92248

Avoid using bare numbers as check prefixes; replace them with actual
triple config names where possible.
---
 llvm/test/MC/X86/abs8.s | 12 +-
 .../test/MC/X86/align-branch-variant-symbol.s | 33 ++--
 llvm/test/MC/X86/data-prefix-fail.s | 34 ++--
 llvm/test/MC/X86/displacement-overflow.s | 12 +-
 llvm/test/MC/X86/dwarf-segment-register.s | 28 +--
 llvm/test/MC/X86/index-operations.s | 162 ++++++++--------
 llvm/test/MC/X86/ret.s | 182 +++++++++---------
 llvm/test/MC/X86/x86_errors.s | 158 +++++++--------
 8 files changed, 310 insertions(+), 311 deletions(-)

diff --git a/llvm/test/MC/X86/abs8.s b/llvm/test/MC/X86/abs8.s
index b933c9ddff9032..71936acb37408a 100644
--- a/llvm/test/MC/X86/abs8.s
+++ b/llvm/test/MC/X86/abs8.s
@@ -1,8 +1,8 @@
-// RUN: llvm-mc -filetype=obj %s -o - -triple i686-pc-linux | llvm-objdump
--no-print-imm-hex -d -r - | FileCheck --check-prefix=32 %s -// RUN: llvm-mc -filetype=obj %s -o - -triple x86_64-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=64 %s +// RUN: llvm-mc -filetype=obj %s -o - -triple i686-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=X86 %s +// RUN: llvm-mc -filetype=obj %s -o - -triple x86_64-pc-linux | llvm-objdump --no-print-imm-hex -d -r - | FileCheck --check-prefix=X64 %s -// 32: 0: 83 ff 00 cmpl $0, %edi -// 32: 00000002: R_386_8 foo -// 64: 0: 83 ff 00 cmpl $0, %edi -// 64: 0000000000000002: R_X86_64_8 foo +// X86: 0: 83 ff 00 cmpl $0, %edi +// X86: 00000002: R_386_8 foo +// X64: 0: 83 ff 00 cmpl $0, %edi +// X64: 0000000000000002: R_X86_64_8 foo cmp $foo@ABS8, %edi diff --git a/llvm/test/MC/X86/align-branch-variant-symbol.s b/llvm/test/MC/X86/align-branch-variant-symbol.s index 53afdf58bff39e..a1b7b895a354c5 100644 --- a/llvm/test/MC/X86/align-branch-variant-symbol.s +++ b/llvm/test/MC/X86/align-branch-variant-symbol.s @@ -1,6 +1,5 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=64BIT,CHECK - -# RUN: llvm-mc -filetype=obj -triple i386 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=32BIT,CHECK +# RUN: llvm-mc -filetype=obj -triple x86_64 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=CHECK,X64 +# RUN: llvm-mc -filetype=obj -triple i386 --x86-align-branch-boundary=32 --x86-align-branch=call+indirect %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefixes=CHECK,X86 # Exercise cases where the instruction to be aligned has a variant symbol # operand, and we can't add before it since linker may rewrite it. 
@@ -14,8 +13,8 @@ foo: int3 .endr # CHECK: 1d: int3 - # 64BIT: 1e: callq - # 32BIT: 1e: calll + # X64: 1e: callq + # X86: 1e: calll # CHECK: 23: int3 call ___tls_get_addr@PLT int3 @@ -25,10 +24,10 @@ foo: int3 .endr # CHECK: 5d: int3 - # 64BIT: 5e: callq *(%ecx) - # 64BIT: 65: int3 - # 32BIT: 5e: calll *(%ecx) - # 32BIT: 64: int3 + # X64: 5e: callq *(%ecx) + # X64: 65: int3 + # X86: 5e: calll *(%ecx) + # X86: 64: int3 call *___tls_get_addr@GOT(%ecx) int3 @@ -37,10 +36,10 @@ foo: int3 .endr # CHECK: 9d: int3 - # 64BIT: 9e: callq *(%eax) - # 64BIT: a1: int3 - # 32BIT: 9e: calll *(%eax) - # 32BIT: a0: int3 + # X64: 9e: callq *(%eax) + # X64: a1: int3 + # X86: 9e: calll *(%eax) + # X86: a0: int3 call *foo@tlscall(%eax) int3 @@ -49,9 +48,9 @@ foo: int3 .endr # CHECK: dd: int3 - # 64BIT: de: jmpq *(%eax) - # 64BIT: e1: int3 - # 32BIT: de: jmpl *(%eax) - # 32BIT: e0: int3 + # X64: de: jmpq *(%eax) + # X64: e1: int3 + # X86: de: jmpl *(%eax) + # X86: e0: int3 jmp *foo@tlscall(%eax) int3 diff --git a/llvm/test/MC/X86/data-prefix-fail.s b/llvm/test/MC/X86/data-prefix-fail.s index bd5b62ddc9bedf..3088864fc909b0 100644 --- a/llvm/test/MC/X86/data-prefix-fail.s +++ b/llvm/test/MC/X86/data-prefix-fail.s @@ -1,30 +1,30 @@ -// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=64 %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-64 %s // RUN: FileCheck --check-prefix=ERR64 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=32 %s +// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-32 %s // RUN: FileCheck --check-prefix=ERR32 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=16 %s +// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-16 %s // RUN: FileCheck --check-prefix=ERR16 < %t.err %s // ERR64: error: 'data32' is not supported in 64-bit mode // ERR32: error: redundant data32 prefix -// 16: lgdtl 0 -// 16-SAME: encoding: [0x66,0x0f,0x01,0x16,0x00,0x00] +// X86-16: lgdtl 0 +// X86-16-SAME: encoding: [0x66,0x0f,0x01,0x16,0x00,0x00] data32 lgdt 0 -// 64: data16 -// 64: encoding: [0x66] -// 64: lgdtq 0 -// 64: encoding: [0x0f,0x01,0x14,0x25,0x00,0x00,0x00,0x00] -// 32: data16 -// 32: encoding: [0x66] -// 32: lgdtl 0 -// 32: encoding: [0x0f,0x01,0x15,0x00,0x00,0x00,0x00] +// X86-64: data16 +// X86-64: encoding: [0x66] +// X86-64: lgdtq 0 +// X86-64: encoding: [0x0f,0x01,0x14,0x25,0x00,0x00,0x00,0x00] +// X86-32: data16 +// X86-32: encoding: [0x66] +// X86-32: lgdtl 0 +// X86-32: encoding: [0x0f,0x01,0x15,0x00,0x00,0x00,0x00] // ERR16: error: redundant data16 prefix data16 lgdt 0 -// 64: data16 # encoding: [0x66] -// 64-NEXT: callq 0 # encoding: [0xe8,A,A,A,A] -// 32: data16 # encoding: [0x66] -// 32-NEXT: calll 0 # encoding: [0xe8,A,A,A,A] +// X86-64: data16 # encoding: [0x66] +// X86-64-NEXT: callq 0 # encoding: [0xe8,A,A,A,A] +// X86-32: data16 # encoding: [0x66] +// X86-32-NEXT: calll 0 # encoding: [0xe8,A,A,A,A] // ERR16: {{.*}}.s:[[#@LINE+1]]:1: error: redundant data16 prefix data16 call 0 diff --git a/llvm/test/MC/X86/displacement-overflow.s b/llvm/test/MC/X86/displacement-overflow.s index 2882147af48280..933f98cb21fcb5 100644 --- a/llvm/test/MC/X86/displacement-overflow.s +++ b/llvm/test/MC/X86/displacement-overflow.s @@ -1,14 
+1,14 @@ -# RUN: not llvm-mc -triple=x86_64 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,64 --implicit-check-not=error: --implicit-check-not=warning: -# RUN: llvm-mc -triple=i686 --defsym A16=1 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,32 --implicit-check-not=error: --implicit-check-not=warning: +# RUN: not llvm-mc -triple=x86_64 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,X64 --implicit-check-not=error: --implicit-check-not=warning: +# RUN: llvm-mc -triple=i686 --defsym A16=1 %s 2>&1 | FileCheck %s --check-prefixes=CHECK,X86 --implicit-check-not=error: --implicit-check-not=warning: .ifndef A16 movq 0x80000000-1(%rip), %rax leaq -0x80000000(%rip), %rax -# 64: [[#@LINE+1]]:17: error: displacement 2147483648 is not within [-2147483648, 2147483647] +# X64: [[#@LINE+1]]:17: error: displacement 2147483648 is not within [-2147483648, 2147483647] movq 0x80000000(%rip), %rax -# 64: [[#@LINE+1]]:18: error: displacement -2147483649 is not within [-2147483648, 2147483647] +# X64: [[#@LINE+1]]:18: error: displacement -2147483649 is not within [-2147483648, 2147483647] leaq -0x80000001(%rip), %rax .endif @@ -31,8 +31,8 @@ leal -0xffffffff-2(%eax), %eax movw $0, 0xffff(%bp) movw $0, -0xffff(%si) -# 32: [[#@LINE+1]]:19: warning: displacement 65536 shortened to 16-bit signed 0 +# X86: [[#@LINE+1]]:19: warning: displacement 65536 shortened to 16-bit signed 0 movw $0, 0xffff+1(%bp) -# 32: [[#@LINE+1]]:20: warning: displacement -65536 shortened to 16-bit signed 0 +# X86: [[#@LINE+1]]:20: warning: displacement -65536 shortened to 16-bit signed 0 movw $0, -0xffff-1(%si) .endif diff --git a/llvm/test/MC/X86/dwarf-segment-register.s b/llvm/test/MC/X86/dwarf-segment-register.s index a6857680747064..5482588df82123 100644 --- a/llvm/test/MC/X86/dwarf-segment-register.s +++ b/llvm/test/MC/X86/dwarf-segment-register.s @@ -1,7 +1,7 @@ // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.64 -// RUN: llvm-objdump --dwarf=frames %t.64 | FileCheck %s --check-prefixes=64,CHECK +// RUN: llvm-objdump --dwarf=frames %t.64 | FileCheck %s --check-prefixes=X64,CHECK // RUN: llvm-mc -filetype=obj -triple i386-pc-linux-gnu %s -o %t.32 -// RUN: llvm-objdump --dwarf=frames %t.32 | FileCheck %s --check-prefixes=32,CHECK +// RUN: llvm-objdump --dwarf=frames %t.32 | FileCheck %s --check-prefixes=X86,CHECK .cfi_startproc .cfi_offset %cs, -40 @@ -12,26 +12,26 @@ .cfi_offset %gs, 0 .cfi_endproc -// 64: reg51 -// 32: reg41 +// X64: reg51 +// X86: reg41 // CHECK-SAME: -40 -// 64: reg53 -// 32: reg43 +// X64: reg53 +// X86: reg43 // CHECK-SAME: -32 -// 64: reg52 -// 32: reg42 +// X64: reg52 +// X86: reg42 // CHECK-SAME: -24 -// 64: reg50 -// 32: reg40 +// X64: reg50 +// X86: reg40 // CHECK-SAME: -16 -// 64: reg54 -// 32: reg44 +// X64: reg54 +// X86: reg44 // CHECK-SAME: -8 -// 64: reg55 -// 32: reg45 +// X64: reg55 +// X86: reg45 // CHECK-SAME: 0 diff --git a/llvm/test/MC/X86/index-operations.s b/llvm/test/MC/X86/index-operations.s index 59498b0ced1249..c425ba8057c5f2 100644 --- a/llvm/test/MC/X86/index-operations.s +++ b/llvm/test/MC/X86/index-operations.s @@ -1,34 +1,34 @@ -// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=64 %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-64 %s // RUN: FileCheck --input-file=%t.err %s --check-prefix=ERR64 --implicit-check-not=error: -// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=32 %s +// RUN: not 
llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-32 %s // RUN: FileCheck --check-prefix=ERR32 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=16 %s +// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-16 %s // RUN: FileCheck --check-prefix=ERR16 < %t.err %s lodsb -// 64: lodsb (%rsi), %al # encoding: [0xac] -// 32: lodsb (%esi), %al # encoding: [0xac] -// 16: lodsb (%si), %al # encoding: [0xac] +// X86-64: lodsb (%rsi), %al # encoding: [0xac] +// X86-32: lodsb (%esi), %al # encoding: [0xac] +// X86-16: lodsb (%si), %al # encoding: [0xac] lodsb (%rsi), %al -// 64: lodsb (%rsi), %al # encoding: [0xac] +// X86-64: lodsb (%rsi), %al # encoding: [0xac] // ERR32: 64-bit // ERR16: 64-bit lodsb (%esi), %al -// 64: lodsb (%esi), %al # encoding: [0x67,0xac] -// 32: lodsb (%esi), %al # encoding: [0xac] -// 16: lodsb (%esi), %al # encoding: [0x67,0xac] +// X86-64: lodsb (%esi), %al # encoding: [0x67,0xac] +// X86-32: lodsb (%esi), %al # encoding: [0xac] +// X86-16: lodsb (%esi), %al # encoding: [0x67,0xac] lodsb (%si), %al // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register -// 32: lodsb (%si), %al # encoding: [0x67,0xac] -// 16: lodsb (%si), %al # encoding: [0xac] +// X86-32: lodsb (%si), %al # encoding: [0x67,0xac] +// X86-16: lodsb (%si), %al # encoding: [0xac] lodsl %gs:(%esi) -// 64: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0xad] -// 32: lodsl %gs:(%esi), %eax # encoding: [0x65,0xad] -// 16: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0x66,0xad] +// X86-64: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0xad] +// X86-32: lodsl %gs:(%esi), %eax # encoding: [0x65,0xad] +// X86-16: lodsl %gs:(%esi), %eax # encoding: [0x67,0x65,0x66,0xad] lodsl (%edi), %eax // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand @@ -41,19 +41,19 @@ lodsl 44(%edi), %eax // ERR16: invalid operand lods (%esi), %ax -// 64: lodsw (%esi), %ax # encoding: [0x67,0x66,0xad] -// 32: lodsw (%esi), %ax # encoding: [0x66,0xad] -// 16: lodsw (%esi), %ax # encoding: [0x67,0xad] +// X86-64: lodsw (%esi), %ax # encoding: [0x67,0x66,0xad] +// X86-32: lodsw (%esi), %ax # encoding: [0x66,0xad] +// X86-16: lodsw (%esi), %ax # encoding: [0x67,0xad] stosw -// 64: stosw %ax, %es:(%rdi) # encoding: [0x66,0xab] -// 32: stosw %ax, %es:(%edi) # encoding: [0x66,0xab] -// 16: stosw %ax, %es:(%di) # encoding: [0xab] +// X86-64: stosw %ax, %es:(%rdi) # encoding: [0x66,0xab] +// X86-32: stosw %ax, %es:(%edi) # encoding: [0x66,0xab] +// X86-16: stosw %ax, %es:(%di) # encoding: [0xab] stos %eax, (%edi) -// 64: stosl %eax, %es:(%edi) # encoding: [0x67,0xab] -// 32: stosl %eax, %es:(%edi) # encoding: [0xab] -// 16: stosl %eax, %es:(%edi) # encoding: [0x67,0x66,0xab] +// X86-64: stosl %eax, %es:(%edi) # encoding: [0x67,0xab] +// X86-32: stosl %eax, %es:(%edi) # encoding: [0xab] +// X86-16: stosl %eax, %es:(%edi) # encoding: [0x67,0x66,0xab] stosb %al, %fs:(%edi) // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand for instruction @@ -61,27 +61,27 @@ stosb %al, %fs:(%edi) // ERR16: invalid operand for instruction stosb %al, %es:(%edi) -// 64: stosb %al, %es:(%edi) # encoding: [0x67,0xaa] -// 32: stosb %al, %es:(%edi) # encoding: [0xaa] -// 16: stosb %al, %es:(%edi) # encoding: [0x67,0xaa] +// X86-64: stosb %al, %es:(%edi) # encoding: [0x67,0xaa] +// X86-32: stosb %al, %es:(%edi) # encoding: [0xaa] +// X86-16: stosb %al, %es:(%edi) # encoding: 
[0x67,0xaa] stosq -// 64: stosq %rax, %es:(%rdi) # encoding: [0x48,0xab] +// X86-64: stosq %rax, %es:(%rdi) # encoding: [0x48,0xab] // ERR32: 64-bit // ERR16: 64-bit stos %rax, (%edi) -// 64: stosq %rax, %es:(%edi) # encoding: [0x67,0x48,0xab] +// X86-64: stosq %rax, %es:(%edi) # encoding: [0x67,0x48,0xab] // ERR32: only available in 64-bit mode // ERR16: only available in 64-bit mode scas %es:(%edi), %al -// 64: scasb %es:(%edi), %al # encoding: [0x67,0xae] -// 32: scasb %es:(%edi), %al # encoding: [0xae] -// 16: scasb %es:(%edi), %al # encoding: [0x67,0xae] +// X86-64: scasb %es:(%edi), %al # encoding: [0x67,0xae] +// X86-32: scasb %es:(%edi), %al # encoding: [0xae] +// X86-16: scasb %es:(%edi), %al # encoding: [0x67,0xae] scasq %es:(%edi) -// 64: scasq %es:(%edi), %rax # encoding: [0x67,0x48,0xaf] +// X86-64: scasq %es:(%edi), %rax # encoding: [0x67,0x48,0xaf] // ERR32: 64-bit // ERR16: 64-bit @@ -92,18 +92,18 @@ scasl %es:(%edi), %al scas %es:(%di), %ax // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register -// 16: scasw %es:(%di), %ax # encoding: [0xaf] -// 32: scasw %es:(%di), %ax # encoding: [0x67,0x66,0xaf] +// X86-16: scasw %es:(%di), %ax # encoding: [0xaf] +// X86-32: scasw %es:(%di), %ax # encoding: [0x67,0x66,0xaf] cmpsb -// 64: cmpsb %es:(%rdi), (%rsi) # encoding: [0xa6] -// 32: cmpsb %es:(%edi), (%esi) # encoding: [0xa6] -// 16: cmpsb %es:(%di), (%si) # encoding: [0xa6] +// X86-64: cmpsb %es:(%rdi), (%rsi) # encoding: [0xa6] +// X86-32: cmpsb %es:(%edi), (%esi) # encoding: [0xa6] +// X86-16: cmpsb %es:(%di), (%si) # encoding: [0xa6] cmpsw (%edi), (%esi) -// 64: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0x66,0xa7] -// 32: cmpsw %es:(%edi), (%esi) # encoding: [0x66,0xa7] -// 16: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0xa7] +// X86-64: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0x66,0xa7] +// X86-32: cmpsw %es:(%edi), (%esi) # encoding: [0x66,0xa7] +// X86-16: cmpsw %es:(%edi), (%esi) # encoding: [0x67,0xa7] cmpsb (%di), (%esi) // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register @@ -111,52 +111,52 @@ cmpsb (%di), (%esi) // ERR16: mismatching source and destination cmpsl %es:(%edi), %ss:(%esi) -// 64: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0xa7] -// 32: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x36,0xa7] -// 16: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0x66,0xa7] +// X86-64: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0xa7] +// X86-32: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x36,0xa7] +// X86-16: cmpsl %es:(%edi), %ss:(%esi) # encoding: [0x67,0x36,0x66,0xa7] cmpsq (%rdi), (%rsi) -// 64: cmpsq %es:(%rdi), (%rsi) # encoding: [0x48,0xa7] +// X86-64: cmpsq %es:(%rdi), (%rsi) # encoding: [0x48,0xa7] // ERR32: 64-bit // ERR16: 64-bit movsb (%esi), (%edi) -// 64: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] -// 32: movsb (%esi), %es:(%edi) # encoding: [0xa4] -// 16: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] +// X86-64: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] +// X86-32: movsb (%esi), %es:(%edi) # encoding: [0xa4] +// X86-16: movsb (%esi), %es:(%edi) # encoding: [0x67,0xa4] movsl %gs:(%esi), (%edi) -// 64: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0xa5] -// 32: movsl %gs:(%esi), %es:(%edi) # encoding: [0x65,0xa5] -// 16: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0x66,0xa5] +// X86-64: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0xa5] +// X86-32: movsl %gs:(%esi), %es:(%edi) # encoding: [0x65,0xa5] +// X86-16: movsl %gs:(%esi), %es:(%edi) # encoding: [0x67,0x65,0x66,0xa5] 
outsb -// 64: outsb (%rsi), %dx # encoding: [0x6e] -// 32: outsb (%esi), %dx # encoding: [0x6e] -// 16: outsb (%si), %dx # encoding: [0x6e] +// X86-64: outsb (%rsi), %dx # encoding: [0x6e] +// X86-32: outsb (%esi), %dx # encoding: [0x6e] +// X86-16: outsb (%si), %dx # encoding: [0x6e] outsw %fs:(%esi), %dx -// 64: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x66,0x6f] -// 32: outsw %fs:(%esi), %dx # encoding: [0x64,0x66,0x6f] -// 16: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x6f] +// X86-64: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x66,0x6f] +// X86-32: outsw %fs:(%esi), %dx # encoding: [0x64,0x66,0x6f] +// X86-16: outsw %fs:(%esi), %dx # encoding: [0x67,0x64,0x6f] insw %dx, (%edi) -// 64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] -// 32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] -// 16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] +// X86-64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] +// X86-32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] +// X86-16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] insw %dx, (%bx) // ERR64: [[#@LINE-1]]:[[#]]: error: invalid 16-bit base register -// 32: insw %dx, %es:(%di) # encoding: [0x67,0x66,0x6d] -// 16: insw %dx, %es:(%di) # encoding: [0x6d] +// X86-32: insw %dx, %es:(%di) # encoding: [0x67,0x66,0x6d] +// X86-16: insw %dx, %es:(%di) # encoding: [0x6d] insw %dx, (%ebx) -// 64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] -// 32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] -// 16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] +// X86-64: insw %dx, %es:(%edi) # encoding: [0x67,0x66,0x6d] +// X86-32: insw %dx, %es:(%edi) # encoding: [0x66,0x6d] +// X86-16: insw %dx, %es:(%edi) # encoding: [0x67,0x6d] insw %dx, (%rbx) -// 64: insw %dx, %es:(%rdi) # encoding: [0x66,0x6d] +// X86-64: insw %dx, %es:(%rdi) # encoding: [0x66,0x6d] // ERR32: 64-bit // ERR16: 64-bit @@ -177,17 +177,17 @@ movdir64b (%edx), %r15 // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand movdir64b (%eip), %ebx -// 64: movdir64b (%eip), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: movdir64b (%eip), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] movdir64b (%rip), %rbx -// 64: movdir64b (%rip), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: movdir64b (%rip), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] movdir64b 291(%esi, %eiz, 4), %ebx -// 64: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] -// 32: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x67,0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-32: movdir64b 291(%esi,%eiz,4), %ebx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] movdir64b 291(%rsi, %riz, 4), %rbx -// 64: movdir64b 291(%rsi,%riz,4), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: movdir64b 291(%rsi,%riz,4), %rbx # encoding: [0x66,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmd 291(%si), %ecx // ERR64: error: invalid 16-bit base register @@ -206,17 +206,17 @@ enqcmd (%edx), %r15 // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand enqcmd (%eip), %ebx -// 64: enqcmd (%eip), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmd (%eip), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmd (%rip), %rbx -// 64: enqcmd (%rip), %rbx # encoding: 
[0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmd (%rip), %rbx # encoding: [0xf2,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmd 291(%esi, %eiz, 4), %ebx -// 64: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] -// 32: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-32: enqcmd 291(%esi,%eiz,4), %ebx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmd 291(%rsi, %riz, 4), %rbx -// 64: enqcmd 291(%rsi,%riz,4), %rbx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmd 291(%rsi,%riz,4), %rbx # encoding: [0xf2,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmds 291(%si), %ecx // ERR64: error: invalid 16-bit base register @@ -235,14 +235,14 @@ enqcmds (%edx), %r15 // ERR64: [[#@LINE-1]]:[[#]]: error: invalid operand enqcmds (%eip), %ebx -// 64: enqcmds (%eip), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmds (%eip), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmds (%rip), %rbx -// 64: enqcmds (%rip), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] +// X86-64: enqcmds (%rip), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x1d,0x00,0x00,0x00,0x00] enqcmds 291(%esi, %eiz, 4), %ebx -// 64: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] -// 32: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0x67,0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-32: enqcmds 291(%esi,%eiz,4), %ebx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] enqcmds 291(%rsi, %riz, 4), %rbx -// 64: enqcmds 291(%rsi,%riz,4), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] +// X86-64: enqcmds 291(%rsi,%riz,4), %rbx # encoding: [0xf3,0x0f,0x38,0xf8,0x9c,0xa6,0x23,0x01,0x00,0x00] diff --git a/llvm/test/MC/X86/ret.s b/llvm/test/MC/X86/ret.s index 142a4614ba4f1f..7b5bcd4ad990f4 100644 --- a/llvm/test/MC/X86/ret.s +++ b/llvm/test/MC/X86/ret.s @@ -1,129 +1,129 @@ -// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=64 %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-64 %s // RUN: FileCheck --check-prefix=ERR64 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=32 %s +// RUN: not llvm-mc -triple i386-unknown-unknown --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-32 %s // RUN: FileCheck --check-prefix=ERR32 < %t.err %s -// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=16 %s +// RUN: not llvm-mc -triple i386-unknown-unknown-code16 --show-encoding %s 2> %t.err | FileCheck --check-prefix=X86-16 %s // RUN: FileCheck --check-prefix=ERR16 < %t.err %s ret -// 64: retq -// 64: encoding: [0xc3] -// 32: retl -// 32: encoding: [0xc3] -// 16: retw -// 16: encoding: [0xc3] +// X86-64: retq +// X86-64: encoding: [0xc3] +// X86-32: retl +// X86-32: encoding: [0xc3] +// X86-16: retw +// X86-16: encoding: [0xc3] retw -// 64: retw -// 64: encoding: [0x66,0xc3] -// 32: retw -// 32: encoding: [0x66,0xc3] -// 16: retw -// 16: encoding: [0xc3] +// X86-64: 
retw +// X86-64: encoding: [0x66,0xc3] +// X86-32: retw +// X86-32: encoding: [0x66,0xc3] +// X86-16: retw +// X86-16: encoding: [0xc3] retl // ERR64: error: instruction requires: Not 64-bit mode -// 32: retl -// 32: encoding: [0xc3] -// 16: retl -// 16: encoding: [0x66,0xc3] +// X86-32: retl +// X86-32: encoding: [0xc3] +// X86-16: retl +// X86-16: encoding: [0x66,0xc3] retq -// 64: retq -// 64: encoding: [0xc3] +// X86-64: retq +// X86-64: encoding: [0xc3] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode ret $0 -// 64: retq $0 -// 64: encoding: [0xc2,0x00,0x00] -// 32: retl $0 -// 32: encoding: [0xc2,0x00,0x00] -// 16: retw $0 -// 16: encoding: [0xc2,0x00,0x00] +// X86-64: retq $0 +// X86-64: encoding: [0xc2,0x00,0x00] +// X86-32: retl $0 +// X86-32: encoding: [0xc2,0x00,0x00] +// X86-16: retw $0 +// X86-16: encoding: [0xc2,0x00,0x00] retw $0 -// 64: retw $0 -// 64: encoding: [0x66,0xc2,0x00,0x00] -// 32: retw $0 -// 32: encoding: [0x66,0xc2,0x00,0x00] -// 16: retw $0 -// 16: encoding: [0xc2,0x00,0x00] +// X86-64: retw $0 +// X86-64: encoding: [0x66,0xc2,0x00,0x00] +// X86-32: retw $0 +// X86-32: encoding: [0x66,0xc2,0x00,0x00] +// X86-16: retw $0 +// X86-16: encoding: [0xc2,0x00,0x00] retl $0 // ERR64: error: instruction requires: Not 64-bit mode -// 32: retl $0 -// 32: encoding: [0xc2,0x00,0x00] -// 16: retl $0 -// 16: encoding: [0x66,0xc2,0x00,0x00] +// X86-32: retl $0 +// X86-32: encoding: [0xc2,0x00,0x00] +// X86-16: retl $0 +// X86-16: encoding: [0x66,0xc2,0x00,0x00] retq $0 -// 64: retq $0 -// 64: encoding: [0xc2,0x00,0x00] +// X86-64: retq $0 +// X86-64: encoding: [0xc2,0x00,0x00] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode retn -// 64: retq -// 64: encoding: [0xc3] -// 32: retl -// 32: encoding: [0xc3] -// 16: retw -// 16: encoding: [0xc3] +// X86-64: retq +// X86-64: encoding: [0xc3] +// X86-32: retl +// X86-32: encoding: [0xc3] +// X86-16: retw +// X86-16: encoding: [0xc3] retn $0 -// 64: retq $0 -// 64: encoding: [0xc2,0x00,0x00] -// 32: retl $0 -// 32: encoding: [0xc2,0x00,0x00] -// 16: retw $0 -// 16: encoding: [0xc2,0x00,0x00] +// X86-64: retq $0 +// X86-64: encoding: [0xc2,0x00,0x00] +// X86-32: retl $0 +// X86-32: encoding: [0xc2,0x00,0x00] +// X86-16: retw $0 +// X86-16: encoding: [0xc2,0x00,0x00] lret -// 64: lretl -// 64: encoding: [0xcb] -// 32: lretl -// 32: encoding: [0xcb] -// 16: lretw -// 16: encoding: [0xcb] +// X86-64: lretl +// X86-64: encoding: [0xcb] +// X86-32: lretl +// X86-32: encoding: [0xcb] +// X86-16: lretw +// X86-16: encoding: [0xcb] lretw -// 64: lretw -// 64: encoding: [0x66,0xcb] -// 32: lretw -// 32: encoding: [0x66,0xcb] -// 16: lretw -// 16: encoding: [0xcb] +// X86-64: lretw +// X86-64: encoding: [0x66,0xcb] +// X86-32: lretw +// X86-32: encoding: [0x66,0xcb] +// X86-16: lretw +// X86-16: encoding: [0xcb] lretl -// 64: lretl -// 64: encoding: [0xcb] -// 32: lretl -// 32: encoding: [0xcb] -// 16: lretl -// 16: encoding: [0x66,0xcb] +// X86-64: lretl +// X86-64: encoding: [0xcb] +// X86-32: lretl +// X86-32: encoding: [0xcb] +// X86-16: lretl +// X86-16: encoding: [0x66,0xcb] lretq -// 64: lretq -// 64: encoding: [0x48,0xcb] +// X86-64: lretq +// X86-64: encoding: [0x48,0xcb] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode lret $0 -// 64: lretl $0 -// 64: encoding: [0xca,0x00,0x00] -// 32: lretl $0 -// 32: encoding: [0xca,0x00,0x00] -// 16: lretw $0 -// 16: encoding: 
[0xca,0x00,0x00] +// X86-64: lretl $0 +// X86-64: encoding: [0xca,0x00,0x00] +// X86-32: lretl $0 +// X86-32: encoding: [0xca,0x00,0x00] +// X86-16: lretw $0 +// X86-16: encoding: [0xca,0x00,0x00] lretw $0 -// 64: lretw $0 -// 64: encoding: [0x66,0xca,0x00,0x00] -// 32: lretw $0 -// 32: encoding: [0x66,0xca,0x00,0x00] -// 16: lretw $0 -// 16: encoding: [0xca,0x00,0x00] +// X86-64: lretw $0 +// X86-64: encoding: [0x66,0xca,0x00,0x00] +// X86-32: lretw $0 +// X86-32: encoding: [0x66,0xca,0x00,0x00] +// X86-16: lretw $0 +// X86-16: encoding: [0xca,0x00,0x00] lretl $0 -// 64: lretl $0 -// 64: encoding: [0xca,0x00,0x00] -// 32: lretl $0 -// 32: encoding: [0xca,0x00,0x00] -// 16: lretl $0 -// 16: encoding: [0x66,0xca,0x00,0x00] +// X86-64: lretl $0 +// X86-64: encoding: [0xca,0x00,0x00] +// X86-32: lretl $0 +// X86-32: encoding: [0xca,0x00,0x00] +// X86-16: lretl $0 +// X86-16: encoding: [0x66,0xca,0x00,0x00] lretq $0 -// 64: lretq $0 -// 64: encoding: [0x48,0xca,0x00,0x00] +// X86-64: lretq $0 +// X86-64: encoding: [0x48,0xca,0x00,0x00] // ERR32: error: instruction requires: 64-bit mode // ERR16: error: instruction requires: 64-bit mode diff --git a/llvm/test/MC/X86/x86_errors.s b/llvm/test/MC/X86/x86_errors.s index f1c7110fde1f1c..da8659f3621e36 100644 --- a/llvm/test/MC/X86/x86_errors.s +++ b/llvm/test/MC/X86/x86_errors.s @@ -1,122 +1,122 @@ // RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2> %t.err -// RUN: FileCheck --check-prefix=64 < %t.err %s +// RUN: FileCheck --check-prefix=X64 < %t.err %s // RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t.err -// RUN: FileCheck --check-prefix=32 < %t.err %s +// RUN: FileCheck --check-prefix=X86 < %t.err %s // rdar://8204588 -// 64: error: ambiguous instructions require an explicit suffix (could be 'cmpb', 'cmpw', 'cmpl', or 'cmpq') +// X64: error: ambiguous instructions require an explicit suffix (could be 'cmpb', 'cmpw', 'cmpl', or 'cmpq') cmp $0, 0(%eax) -// 32: error: register %rax is only available in 64-bit mode +// X86: error: register %rax is only available in 64-bit mode addl $0, 0(%rax) -// 32: test.s:8:2: error: invalid instruction mnemonic 'movi' +// X86: test.s:8:2: error: invalid instruction mnemonic 'movi' # 8 "test.s" movi $8,%eax movl 0(%rax), 0(%edx) // error: invalid operand for instruction -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode sysexitq // rdar://10710167 -// 64: error: expected scale expression +// X64: error: expected scale expression lea (%rsp, %rbp, $4), %rax // rdar://10423777 -// 64: error: base register is 64-bit, but index register is not +// X64: error: base register is 64-bit, but index register is not movq (%rsi,%ecx),%xmm0 -// 64: error: invalid 16-bit base register +// X64: error: invalid 16-bit base register movl %eax,(%bp,%si) -// 32: error: scale factor in 16-bit address must be 1 +// X86: error: scale factor in 16-bit address must be 1 movl %eax,(%bp,%si,2) -// 32: error: invalid 16-bit base register +// X86: error: invalid 16-bit base register movl %eax,(%cx) -// 32: error: invalid 16-bit base/index register combination +// X86: error: invalid 16-bit base/index register combination movl %eax,(%bp,%bx) -// 32: error: 16-bit memory operand may not include only index register +// X86: error: 16-bit memory operand may not include only index register movl %eax,(,%bx) -// 32: error: invalid operand for instruction +// X86: error: invalid operand for instruction outb al, 4 -// 32: error: invalid segment register -// 64: error: invalid segment register 
+// X86: error: invalid segment register +// X64: error: invalid segment register movl %eax:0x00, %ebx -// 32: error: invalid operand for instruction -// 64: error: invalid operand for instruction +// X86: error: invalid operand for instruction +// X64: error: invalid operand for instruction cmpps $-129, %xmm0, %xmm0 -// 32: error: invalid operand for instruction -// 64: error: invalid operand for instruction +// X86: error: invalid operand for instruction +// X64: error: invalid operand for instruction cmppd $256, %xmm0, %xmm0 -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode jrcxz 1 -// 64: error: instruction requires: Not 64-bit mode +// X64: error: instruction requires: Not 64-bit mode jcxz 1 -// 32: error: register %cr8 is only available in 64-bit mode +// X86: error: register %cr8 is only available in 64-bit mode movl %edx, %cr8 -// 32: error: register %dr8 is only available in 64-bit mode +// X86: error: register %dr8 is only available in 64-bit mode movl %edx, %dr8 -// 32: error: register %rip is only available in 64-bit mode -// 64: error: %rip can only be used as a base register +// X86: error: register %rip is only available in 64-bit mode +// X64: error: %rip can only be used as a base register mov %rip, %rax -// 32: error: register %rax is only available in 64-bit mode -// 64: error: %rip is not allowed as an index register +// X86: error: register %rax is only available in 64-bit mode +// X64: error: %rip is not allowed as an index register mov (%rax,%rip), %rbx -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode ljmpq *(%eax) -// 32: error: register %rax is only available in 64-bit mode -// 64: error: invalid base+index expression +// X86: error: register %rax is only available in 64-bit mode +// X64: error: invalid base+index expression leaq (%rax,%rsp), %rax -// 32: error: invalid base+index expression -// 64: error: invalid base+index expression +// X86: error: invalid base+index expression +// X64: error: invalid base+index expression leaq (%eax,%esp), %eax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%si,%bp), %ax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%di,%bp), %ax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%si,%bx), %ax -// 32: error: invalid 16-bit base/index register combination -// 64: error: invalid 16-bit base register +// X86: error: invalid 16-bit base/index register combination +// X64: error: invalid 16-bit base register lea (%di,%bx), %ax -// 32: error: invalid base+index expression -// 64: error: invalid base+index expression +// X86: error: invalid base+index expression +// X64: error: invalid base+index expression mov (,%eip), %rbx -// 32: error: invalid base+index expression -// 64: error: invalid base+index expression +// X86: error: invalid base+index expression +// X64: error: invalid base+index expression mov (%eip,%eax), %rbx -// 32: error: register %rax is only available in 64-bit mode -// 64: error: base register is 64-bit, but index 
register is not +// X86: error: register %rax is only available in 64-bit mode +// X64: error: base register is 64-bit, but index register is not mov (%rax,%eiz), %ebx -// 32: error: register %riz is only available in 64-bit mode -// 64: error: base register is 32-bit, but index register is not +// X86: error: register %riz is only available in 64-bit mode +// X64: error: base register is 32-bit, but index register is not mov (%eax,%riz), %ebx @@ -128,68 +128,68 @@ v_gs = %gs v_imm = 4 $test = %ebx -// 32: 7: error: expected register here -// 64: 7: error: expected register here +// X86: 7: error: expected register here +// X64: 7: error: expected register here mov 4(4), %eax -// 32: 7: error: expected register here -// 64: 7: error: expected register here +// X86: 7: error: expected register here +// X64: 7: error: expected register here mov 5(v_imm), %eax -// 32: 7: error: invalid register name -// 64: 7: error: invalid register name +// X86: 7: error: invalid register name +// X64: 7: error: invalid register name mov 6(%v_imm), %eax -// 32: 8: warning: scale factor without index register is ignored -// 64: 8: warning: scale factor without index register is ignored +// X86: 8: warning: scale factor without index register is ignored +// X64: 8: warning: scale factor without index register is ignored mov 7(,v_imm), %eax -// 64: 6: error: expected immediate expression +// X64: 6: error: expected immediate expression mov $%eax, %ecx -// 32: 6: error: expected immediate expression -// 64: 6: error: expected immediate expression +// X86: 6: error: expected immediate expression +// X64: 6: error: expected immediate expression mov $v_eax, %ecx -// 32: error: unexpected token in argument list -// 64: error: unexpected token in argument list +// X86: error: unexpected token in argument list +// X64: error: unexpected token in argument list mov v_ecx(%eax), %ecx -// 32: 7: error: invalid operand for instruction -// 64: 7: error: invalid operand for instruction +// X86: 7: error: invalid operand for instruction +// X64: 7: error: invalid operand for instruction addb (%dx), %al -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode cqto -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode cltq -// 32: error: instruction requires: 64-bit mode +// X86: error: instruction requires: 64-bit mode cmpxchg16b (%eax) -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {vex} vmovdqu32 %xmm0, %xmm0 -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {vex2} vmovdqu32 %xmm0, %xmm0 -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {vex3} vmovdqu32 %xmm0, %xmm0 -// 32: error: unsupported instruction -// 64: error: unsupported instruction +// X86: error: unsupported instruction +// X64: error: unsupported instruction {evex} vmovdqu %xmm0, %xmm0 -// 32: 12: error: immediate must be an integer in range [0, 15] -// 64: 12: error: immediate must be an integer in range [0, 15] +// X86: 12: error: immediate must be an integer in range [0, 15] +// X64: 12: error: immediate must be an integer in range [0, 15] vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2 -// 32: error: instruction requires: 64-bit mode +// X86: error: 
instruction requires: 64-bit mode pbndkb -// 32: error: register %r16d is only available in 64-bit mode +// X86: error: register %r16d is only available in 64-bit mode movl %eax, %r16d From 89873694654a635cabdd861ddebd61a041d8342f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 13:49:39 +0100 Subject: [PATCH 09/71] [X86] sibcall - cleanup check prefixes identified in #92248 Avoid using numbers as check prefix - replace with actual triple config names --- llvm/test/CodeGen/X86/sibcall-2.ll | 36 +++++++++++++------------- llvm/test/CodeGen/X86/sibcall-byval.ll | 20 +++++++------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/test/CodeGen/X86/sibcall-2.ll b/llvm/test/CodeGen/X86/sibcall-2.ll index a4345cf28335fa..ca42fbafde14f7 100644 --- a/llvm/test/CodeGen/X86/sibcall-2.ll +++ b/llvm/test/CodeGen/X86/sibcall-2.ll @@ -1,48 +1,48 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=32 -; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=64 +; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=X86 +; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -frame-pointer=all | FileCheck %s -check-prefix=X64 ; Tail call should not use ebp / rbp after it's popped. Use esp / rsp. define void @t1(ptr nocapture %value) nounwind { entry: -; 32-LABEL: t1: -; 32: jmpl *4(%esp) +; X86-LABEL: t1: +; X86: jmpl *4(%esp) -; 64-LABEL: t1: -; 64: jmpq *%rdi +; X64-LABEL: t1: +; X64: jmpq *%rdi tail call void %value() nounwind ret void } define void @t2(i32 %a, ptr nocapture %value) nounwind { entry: -; 32-LABEL: t2: -; 32: jmpl *8(%esp) +; X86-LABEL: t2: +; X86: jmpl *8(%esp) -; 64-LABEL: t2: -; 64: jmpq *%rsi +; X64-LABEL: t2: +; X64: jmpq *%rsi tail call void %value() nounwind ret void } define void @t3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, ptr nocapture %value) nounwind { entry: -; 32-LABEL: t3: -; 32: jmpl *28(%esp) +; X86-LABEL: t3: +; X86: jmpl *28(%esp) -; 64-LABEL: t3: -; 64: jmpq *8(%rsp) +; X64-LABEL: t3: +; X64: jmpq *8(%rsp) tail call void %value() nounwind ret void } define void @t4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, ptr nocapture %value) nounwind { entry: -; 32-LABEL: t4: -; 32: jmpl *32(%esp) +; X86-LABEL: t4: +; X86: jmpl *32(%esp) -; 64-LABEL: t4: -; 64: jmpq *16(%rsp) +; X64-LABEL: t4: +; X64: jmpq *16(%rsp) tail call void %value() nounwind ret void } diff --git a/llvm/test/CodeGen/X86/sibcall-byval.ll b/llvm/test/CodeGen/X86/sibcall-byval.ll index 12dbac1389a0e8..0e06833ad70bb8 100644 --- a/llvm/test/CodeGen/X86/sibcall-byval.ll +++ b/llvm/test/CodeGen/X86/sibcall-byval.ll @@ -1,15 +1,15 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=32 -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=64 +; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64 %struct.p = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } define i32 @f(ptr byval(%struct.p) align 4 %q) nounwind ssp { entry: -; 32: _f: -; 32: jmp _g +; X86: _f: +; X86: jmp _g -; 64: _f: -; 64: jmp _g +; X64: _f: +; X64: jmp _g %call = tail call i32 @g(ptr byval(%struct.p) align 4 %q) nounwind ret i32 %call } @@ -18,11 +18,11 @@ declare i32 @g(ptr byval(%struct.p) align 4) define i32 @h(ptr byval(%struct.p) 
align 4 %q, i32 %r) nounwind ssp {
 entry:
-; 32: _h:
-; 32: jmp _i
+; X86: _h:
+; X86: jmp _i
 
-; 64: _h:
-; 64: jmp _i
+; X64: _h:
+; X64: jmp _i
 %call = tail call i32 @i(ptr byval(%struct.p) align 4 %q, i32 %r) nounwind
 ret i32 %call

From 932f0de43a9e334e161a69a50bd6b01cd51e238e Mon Sep 17 00:00:00 2001
From: Julian Schmidt
Date: Wed, 15 May 2024 14:52:32 +0200
Subject: [PATCH 10/71] [clang-tidy] fix crash due to assumed callee in
 min-max-use-initializer-list (#91992)

Previously, `findArgs` was called on a `CallExpr` nested inside a `min`
or `max` call before checking whether that nested call was itself a
call to `min` or `max`, which is what `findArgs` expects. The fix moves
the name check in front of the call to `findArgs`, so that only a `min`
or `max` function call is ever passed to it.

Fixes #91982
Fixes #92249
---
 .../MinMaxUseInitializerListCheck.cpp | 10 ++++-----
 .../min-max-use-initializer-list.cpp  | 21 +++++++++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp
index 45f7700463d570..418699ffbc4d1a 100644
--- a/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MinMaxUseInitializerListCheck.cpp
@@ -129,17 +129,17 @@ generateReplacements(const MatchFinder::MatchResult &Match,
       continue;
     }
 
+    // if the nested call is not the same as the top call
+    if (InnerCall->getDirectCallee()->getQualifiedNameAsString() !=
+        TopCall->getDirectCallee()->getQualifiedNameAsString())
+      continue;
+
     const FindArgsResult InnerResult = findArgs(InnerCall);
 
     // if the nested call doesn't have arguments skip it
     if (!InnerResult.First || !InnerResult.Last)
       continue;
 
-    // if the nested call is not the same as the top call
-    if (InnerCall->getDirectCallee()->getQualifiedNameAsString() !=
-        TopCall->getDirectCallee()->getQualifiedNameAsString())
-      continue;
-
     // if the nested call doesn't have the same compare function
     if ((Result.Compare || InnerResult.Compare) &&
         !utils::areStatementsIdentical(Result.Compare, InnerResult.Compare,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp
index 51ab9bda975f10..1f2dad2b933ca7 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp
@@ -300,6 +300,27 @@ B maxTT2 = std::max(B(), std::max(B(), B()));
 B maxTT3 = std::max(B(), std::max(B(), B()), [](const B &lhs, const B &rhs) { return lhs.a[0] < rhs.a[0]; });
 // CHECK-FIXES: B maxTT3 = std::max(B(), std::max(B(), B()), [](const B &lhs, const B &rhs) { return lhs.a[0] < rhs.a[0]; });
 
+struct GH91982 {
+  int fun0Args();
+  int fun1Arg(int a);
+  int fun2Args(int a, int b);
+  int fun3Args(int a, int b, int c);
+  int fun4Args(int a, int b, int c, int d);
+
+  int foo() {
+    return std::max(
+        fun0Args(),
+        std::max(fun1Arg(0),
+                 std::max(fun2Args(0, 1),
+                          std::max(fun3Args(0, 1, 2), fun4Args(0, 1, 2, 3)))));
+// CHECK-MESSAGES: :[[@LINE-5]]:12: warning: do not use nested 'std::max' calls, use an initializer list instead [modernize-min-max-use-initializer-list]
+// CHECK-FIXES: return std::max(
+// CHECK-FIXES-NEXT: {fun0Args(),
+// CHECK-FIXES-NEXT: fun1Arg(0),
+// CHECK-FIXES-NEXT: fun2Args(0, 1),
+// CHECK-FIXES-NEXT:
fun3Args(0, 1, 2), fun4Args(0, 1, 2, 3)}); + } +}; } // namespace From 83d9aa27680b6a7f3556fcf13ada70b4be95bab2 Mon Sep 17 00:00:00 2001 From: Pietro Ghiglio Date: Wed, 15 May 2024 15:03:21 +0200 Subject: [PATCH 11/71] [VPlan] Add scalar inferencing support for addrspace cast (#92107) Fixes https://github.com/llvm/llvm-project/issues/91434 PR: https://github.com/llvm/llvm-project/pull/92107 --- .../Transforms/Vectorize/VPlanAnalysis.cpp | 1 + llvm/test/Transforms/LoopVectorize/as_cast.ll | 40 +++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/as_cast.ll diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 5f93339083f0c2..efe8c21874a3ac 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -171,6 +171,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) { case Instruction::ICmp: case Instruction::FCmp: return IntegerType::get(Ctx, 1); + case Instruction::AddrSpaceCast: case Instruction::Alloca: case Instruction::BitCast: case Instruction::Trunc: diff --git a/llvm/test/Transforms/LoopVectorize/as_cast.ll b/llvm/test/Transforms/LoopVectorize/as_cast.ll new file mode 100644 index 00000000000000..58a8c3d078f02a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/as_cast.ll @@ -0,0 +1,40 @@ +; RUN: opt -passes=loop-vectorize %s -force-vector-width=1 -force-vector-interleave=2 -S -o - | FileCheck %s + +define void @foo(ptr addrspace(1) %in) { +entry: + br label %loop + +loop: + %iter = phi i64 [ %next, %loop ], [ 0, %entry ] + %ascast = addrspacecast ptr addrspace(1) %in to ptr + %next = add i64 %iter, 1 + %arrayidx = getelementptr inbounds i64, ptr %ascast, i64 %next + store i64 %next, ptr %arrayidx, align 4 + +; check that we find the two interleaved blocks with ascast, gep and store: +; CHECK: pred.store.if: +; CHECK: [[ID1:%.*]] = add i64 %{{.*}}, 1 +; CHECK: [[AS1:%.*]] = addrspacecast ptr addrspace(1) %{{.*}} to ptr +; CHECK: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[AS1]], i64 [[ID1]] +; CHECK: store i64 [[ID1]], ptr [[GEP1]] + +; CHECK: pred.store.if1: +; CHECK: [[ID2:%.*]] = add i64 %{{.*}}, 1 +; CHECK: [[AS2:%.*]] = addrspacecast ptr addrspace(1) %in to ptr +; CHECK: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[AS2]], i64 [[ID2]] +; CHECK: store i64 [[ID2]], ptr %9, align 4 + + %cmp = icmp eq i64 %next, 7 + br i1 %cmp, label %exit, label %loop + +; check that we branch to the exit block +; CHECK: middle.block: +; CHECK: br i1 true, label %exit, label %scalar.ph + +exit: + ret void +; CHECK: exit: +; CHECK: ret void +} + +; CHECK: !{{[0-9]*}} = !{!"llvm.loop.isvectorized", i32 1} From d06270ee00e37b247eb99268fb2f106dbeee08ff Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 15 May 2024 06:08:29 -0700 Subject: [PATCH 12/71] [workflows] Fix libclang-abi-tests to work with new version scheme (#91865) --- .github/workflows/libclang-abi-tests.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index ccfc1e5fb8a742..972d21c3bcedf9 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -33,7 +33,6 @@ jobs: ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }} ABI_LIBS: ${{ steps.vars.outputs.ABI_LIBS }} BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }} - BASELINE_VERSION_MINOR: ${{ 
steps.vars.outputs.BASELINE_VERSION_MINOR }} LLVM_VERSION_MAJOR: ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} LLVM_VERSION_MINOR: ${{ steps.version.outputs.LLVM_VERSION_MINOR }} LLVM_VERSION_PATCH: ${{ steps.version.outputs.LLVM_VERSION_PATCH }} @@ -51,9 +50,9 @@ jobs: id: vars run: | remote_repo='https://github.com/llvm/llvm-project' - if [ ${{ steps.version.outputs.LLVM_VERSION_MINOR }} -ne 0 ] || [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then + if [ ${{ steps.version.outputs.LLVM_VERSION_PATCH }} -eq 0 ]; then major_version=$(( ${{ steps.version.outputs.LLVM_VERSION_MAJOR }} - 1)) - baseline_ref="llvmorg-$major_version.0.0" + baseline_ref="llvmorg-$major_version.1.0" # If there is a minor release, we want to use that as the base line. minor_ref=$(git ls-remote --refs -t "$remote_repo" llvmorg-"$major_version".[1-9].[0-9] | tail -n1 | grep -o 'llvmorg-.\+' || true) @@ -75,7 +74,7 @@ jobs: else { echo "BASELINE_VERSION_MAJOR=${{ steps.version.outputs.LLVM_VERSION_MAJOR }}" - echo "BASELINE_REF=llvmorg-${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.0.0" + echo "BASELINE_REF=llvmorg-${{ steps.version.outputs.LLVM_VERSION_MAJOR }}.1.0" echo "ABI_HEADERS=." echo "ABI_LIBS=libclang.so libclang-cpp.so" } >> "$GITHUB_OUTPUT" From 97418bb519d90542aad3c1f82c80264381a5758e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 14:03:13 +0100 Subject: [PATCH 13/71] [X86] patchable functions - cleanup check prefixes identified in #92248 Avoid using numbers as check prefix - replace with actual triple config names --- .../X86/patchable-function-entry-ibt.ll | 44 +++--- .../CodeGen/X86/patchable-function-entry.ll | 62 ++++---- llvm/test/CodeGen/X86/patchable-prologue.ll | 134 +++++++++--------- 3 files changed, 120 insertions(+), 120 deletions(-) diff --git a/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll b/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll index d0a9bee7878c3f..bcb1106de749e4 100644 --- a/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll +++ b/llvm/test/CodeGen/X86/patchable-function-entry-ibt.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=i686 %s -o - | FileCheck --check-prefixes=CHECK,32 %s -; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,64 %s +; RUN: llc -mtriple=i686 %s -o - | FileCheck --check-prefixes=CHECK,X86 %s +; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,X64 %s ;; -fpatchable-function-entry=0 -fcf-protection=branch define void @f0() "patchable-function-entry"="0" { @@ -7,8 +7,8 @@ define void @f0() "patchable-function-entry"="0" { ; CHECK-NEXT: .Lfunc_begin0: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: ret ; CHECK-NOT: .section __patchable_function_entries ret void @@ -22,16 +22,16 @@ define void @f1() "patchable-function-entry"="1" { ; CHECK-NEXT: .Lfunc_begin1: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: .Lpatch0: ; CHECK-NEXT: nop ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1{{$}} -; 32-NEXT: .p2align 2 -; 32-NEXT: .long .Lpatch0 -; 64-NEXT: .p2align 3 -; 64-NEXT: .quad .Lpatch0 +; X86-NEXT: .p2align 2 +; X86-NEXT: .long .Lpatch0 +; X64-NEXT: .p2align 3 +; X64-NEXT: .quad .Lpatch0 ret void } @@ -44,17 +44,17 @@ define void @f2_1() "patchable-function-entry"="1" "patchable-function-prefix"=" ; CHECK-NEXT: .Lfunc_begin2: ; CHECK-NEXT: 
.cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: nop ; CHECK-NEXT: ret ; CHECK: .Lfunc_end2: ; CHECK-NEXT: .size f2_1, .Lfunc_end2-f2_1 ; CHECK: .section __patchable_function_entries,"awo",@progbits,f2_1{{$}} -; 32-NEXT: .p2align 2 -; 32-NEXT: .long .Ltmp0 -; 64-NEXT: .p2align 3 -; 64-NEXT: .quad .Ltmp0 +; X86-NEXT: .p2align 2 +; X86-NEXT: .long .Ltmp0 +; X64-NEXT: .p2align 3 +; X64-NEXT: .quad .Ltmp0 ret void } @@ -74,10 +74,10 @@ define internal void @f1i() "patchable-function-entry"="1" { ;; Another basic block has ENDBR, but it doesn't affect our decision to not create .Lpatch0 ; CHECK: endbr ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1i{{$}} -; 32-NEXT: .p2align 2 -; 32-NEXT: .long .Lfunc_begin3 -; 64-NEXT: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin3 +; X86-NEXT: .p2align 2 +; X86-NEXT: .long .Lfunc_begin3 +; X64-NEXT: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin3 entry: tail call i32 @llvm.eh.sjlj.setjmp(ptr @buf) ret void @@ -93,8 +93,8 @@ entry: ; CHECK-NEXT: .Lfunc_begin{{.*}}: ; CHECK-NEXT: .cfi_startproc ; CHECK-NEXT: # %bb.0: -; 32-NEXT: endbr32 -; 64-NEXT: endbr64 +; X86-NEXT: endbr32 +; X64-NEXT: endbr64 ; CHECK-NEXT: nop ; CHECK-NEXT: ret define void @sanitize_function(ptr noundef %x) "patchable-function-prefix"="1" "patchable-function-entry"="1" !func_sanitize !1 { diff --git a/llvm/test/CodeGen/X86/patchable-function-entry.ll b/llvm/test/CodeGen/X86/patchable-function-entry.ll index 8c37f545108015..54ecd8b1e5dafb 100644 --- a/llvm/test/CodeGen/X86/patchable-function-entry.ll +++ b/llvm/test/CodeGen/X86/patchable-function-entry.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=i386 %s -o - | FileCheck --check-prefixes=CHECK,32 %s -; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,64 %s -; RUN: llc -mtriple=x86_64 -function-sections %s -o - | FileCheck --check-prefixes=CHECK,64 %s +; RUN: llc -mtriple=i386 %s -o - | FileCheck --check-prefixes=CHECK,X86 %s +; RUN: llc -mtriple=x86_64 %s -o - | FileCheck --check-prefixes=CHECK,X64 %s +; RUN: llc -mtriple=x86_64 -function-sections %s -o - | FileCheck --check-prefixes=CHECK,X64 %s define void @f0() "patchable-function-entry"="0" { ; CHECK-LABEL: f0: @@ -17,10 +17,10 @@ define void @f1() "patchable-function-entry"="1" { ; CHECK: nop ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin1 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin1 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin1 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin1 ret void } @@ -31,14 +31,14 @@ define void @f1() "patchable-function-entry"="1" { define void @f2() "patchable-function-entry"="2" { ; CHECK-LABEL: f2: ; CHECK-NEXT: .Lfunc_begin2: -; 32: xchgw %ax, %ax -; 64: xchgw %ax, %ax +; X86: xchgw %ax, %ax +; X64: xchgw %ax, %ax ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awo",@progbits,f2{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin2 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin2 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin2 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin2 ret void } @@ -46,15 +46,15 @@ $f3 = comdat any define void @f3() "patchable-function-entry"="3" comdat { ; CHECK-LABEL: f3: ; CHECK-NEXT: .Lfunc_begin3: -; 32: xchgw %ax, %ax -; 32-NEXT: nop -; 64: nopl (%rax) +; X86: xchgw %ax, %ax +; X86-NEXT: nop +; X64: nopl (%rax) ; CHECK: ret ; CHECK: .section 
__patchable_function_entries,"awoG",@progbits,f3,f3,comdat{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin3 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin3 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin3 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin3 ret void } @@ -62,15 +62,15 @@ $f5 = comdat any define void @f5() "patchable-function-entry"="5" comdat { ; CHECK-LABEL: f5: ; CHECK-NEXT: .Lfunc_begin4: -; 32-COUNT-2: xchgw %ax, %ax -; 32-NEXT: nop -; 64: nopl 8(%rax,%rax) +; X86-COUNT-2: xchgw %ax, %ax +; X86-NEXT: nop +; X64: nopl 8(%rax,%rax) ; CHECK-NEXT: ret ; CHECK: .section __patchable_function_entries,"awoG",@progbits,f5,f5,comdat{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Lfunc_begin4 -; 64: .p2align 3 -; 64-NEXT: .quad .Lfunc_begin4 +; X86: .p2align 2 +; X86-NEXT: .long .Lfunc_begin4 +; X64: .p2align 3 +; X64-NEXT: .quad .Lfunc_begin4 ret void } @@ -91,10 +91,10 @@ define void @f3_2() "patchable-function-entry"="1" "patchable-function-prefix"=" ; CHECK: .Lfunc_end5: ; CHECK-NEXT: .size f3_2, .Lfunc_end5-f3_2 ; CHECK: .section __patchable_function_entries,"awo",@progbits,f3_2{{$}} -; 32: .p2align 2 -; 32-NEXT: .long .Ltmp0 -; 64: .p2align 3 -; 64-NEXT: .quad .Ltmp0 +; X86: .p2align 2 +; X86-NEXT: .long .Ltmp0 +; X64: .p2align 3 +; X64-NEXT: .quad .Ltmp0 %frame = alloca i8, i32 16 ret void } diff --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll index 43761e3d1e1eb9..aec76a359d267e 100644 --- a/llvm/test/CodeGen/X86/patchable-prologue.ll +++ b/llvm/test/CodeGen/X86/patchable-prologue.ll @@ -1,10 +1,10 @@ ; RUN: llc -verify-machineinstrs -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump --no-print-imm-hex --triple=x86_64-apple-macosx -d - | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386 < %s | FileCheck %s --check-prefixes=32,32CFI,XCHG -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc < %s | FileCheck %s --check-prefixes=32,MOV -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium3 < %s | FileCheck %s --check-prefixes=32,MOV -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium4 < %s | FileCheck %s --check-prefixes=32,XCHG -; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=64 +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386 < %s | FileCheck %s --check-prefixes=X86,X86CFI,XCHG +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc < %s | FileCheck %s --check-prefixes=X86,MOV +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium3 < %s | FileCheck %s --check-prefixes=X86,MOV +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=i386-windows-msvc -mcpu=pentium4 < %s | FileCheck %s --check-prefixes=X86,XCHG +; RUN: llc -verify-machineinstrs -show-mc-encoding -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64 declare void @callee(ptr) @@ -15,18 +15,18 @@ define void @f0() "patchable-function"="prologue-short-redirect" { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f0: -; 32: f0: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f0: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] ; MOV-NEXT: movl %edi, %edi # encoding: [0x8b,0xff] -; 
32-NEXT: retl +; X86-NEXT: retl + +; X64: f0: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] +; X64-NEXT: retq -; 64: f0: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] -; 64-NEXT: retq - ret void } @@ -38,19 +38,19 @@ define void @f1() "patchable-function"="prologue-short-redirect" "frame-pointer" ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f1: -; 32: f1: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f1: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] ; MOV-NEXT: movl %edi, %edi # encoding: [0x8b,0xff] -; 32-NEXT: pushl %ebp - -; 64: f1: -; 64-NEXT: .seh_proc f1 -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax -; 64-NEXT: pushq %rbp - +; X86-NEXT: pushl %ebp + +; X64: f1: +; X64-NEXT: .seh_proc f1 +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax +; X64-NEXT: pushq %rbp + ret void } @@ -61,18 +61,18 @@ define void @f2() "patchable-function"="prologue-short-redirect" { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f2: -; 32: f2: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f2: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] ; MOV-NEXT: movl %edi, %edi # encoding: [0x8b,0xff] -; 32-NEXT: pushl %ebp +; X86-NEXT: pushl %ebp + +; X64: f2: +; X64-NEXT: .seh_proc f2 +; X64-NEXT: # %bb.0: +; X64-NEXT: subq $200, %rsp -; 64: f2: -; 64-NEXT: .seh_proc f2 -; 64-NEXT: # %bb.0: -; 64-NEXT: subq $200, %rsp - %ptr = alloca i64, i32 20 call void @callee(ptr %ptr) ret void @@ -85,17 +85,17 @@ define void @f3() "patchable-function"="prologue-short-redirect" optsize { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f3: -; 32: f3: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f3: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 32-NEXT: retl +; X86-NEXT: retl -; 64: f3: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax -; 64-NEXT: retq +; X64: f3: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax +; X64-NEXT: retq ret void } @@ -105,16 +105,16 @@ define void @f3() "patchable-function"="prologue-short-redirect" optsize { ; patchable one. 
; CHECK-LABEL: f4{{>?}}: ; CHECK-NEXT: 8b 0c 37 movl (%rdi,%rsi), %ecx -; 32: f4: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: f4: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 32-NEXT: pushl %ebx +; X86-NEXT: pushl %ebx -; 64: f4: -; 64-NEXT: # %bb.0: -; 64-NOT: xchgw %ax, %ax +; X64: f4: +; X64-NEXT: # %bb.0: +; X64-NOT: xchgw %ax, %ax define i32 @f4(ptr %arg1, i64 %arg2, i32 %arg3) "patchable-function"="prologue-short-redirect" { bb: @@ -143,15 +143,15 @@ bb21: ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _emptyfunc: -; 32: emptyfunc: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: emptyfunc: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 64: emptyfunc: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax +; X64: emptyfunc: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax ; From code: int emptyfunc() {} define i32 @emptyfunc() "patchable-function"="prologue-short-redirect" { @@ -169,15 +169,15 @@ define i32 @emptyfunc() "patchable-function"="prologue-short-redirect" { ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _jmp_to_start: -; 32: jmp_to_start: -; 32CFI-NEXT: .cfi_startproc -; 32-NEXT: # %bb.0: +; X86: jmp_to_start: +; X86CFI-NEXT: .cfi_startproc +; X86-NEXT: # %bb.0: ; XCHG-NEXT: xchgw %ax, %ax ; MOV-NEXT: movl %edi, %edi -; 64: jmp_to_start: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax +; X64: jmp_to_start: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax define dso_local void @jmp_to_start(ptr inreg nocapture noundef %b) "patchable-function"="prologue-short-redirect" { entry: @@ -198,12 +198,12 @@ do.end: ; preds = %do.body ; Test that inline asm is properly hotpatched. We currently don't examine the ; asm instruction when printing it, thus we always emit patching NOPs. 
-; 64: inline_asm: -; 64-NEXT: # %bb.0: -; 64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] -; 64-NEXT: #APP -; 64-NEXT: int3 # encoding: [0xcc] -; 64-NEXT: #NO_APP +; X64: inline_asm: +; X64-NEXT: # %bb.0: +; X64-NEXT: xchgw %ax, %ax # encoding: [0x66,0x90] +; X64-NEXT: #APP +; X64-NEXT: int3 # encoding: [0xcc] +; X64-NEXT: #NO_APP define dso_local void @inline_asm() "patchable-function"="prologue-short-redirect" { entry: From 96ac2e3af78a45c4fdf4ecc3f9a76cc00663cac7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 14:04:42 +0100 Subject: [PATCH 14/71] [X86] cmpxchg-clobber-flags.ll - cleanup check prefixes identified in #92248 Avoid using numbers as check prefix - replace with actual triple config names --- .../test/CodeGen/X86/cmpxchg-clobber-flags.ll | 394 +++++++++--------- 1 file changed, 197 insertions(+), 197 deletions(-) diff --git a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll index 7738ce49a7636c..29751dcfca5d6b 100644 --- a/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll +++ b/llvm/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=32-ALL,32-GOOD-RA -; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=32-ALL,32-FAST-RA +; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=X86-ALL,X86-GOOD-RA +; RUN: llc -mtriple=i386-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefixes=X86-ALL,X86-FAST-RA -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=64-ALL -; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefix=64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mattr=+sahf -pre-RA-sched=fast %s -o - | FileCheck %s --check-prefix=X64-ALL +; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs -mcpu=corei7 %s -o - | FileCheck %s --check-prefix=X64-ALL declare i32 @foo() declare i32 @bar(i64) @@ -24,86 +24,86 @@ declare i32 @bar(i64) ; repeated saving and restoring logic and can be trivially managed by the ; register allocator. 
define i64 @test_intervening_call(ptr %foo, i64 %bar, i64 %baz) nounwind { -; 32-GOOD-RA-LABEL: test_intervening_call: -; 32-GOOD-RA: # %bb.0: # %entry -; 32-GOOD-RA-NEXT: pushl %ebx -; 32-GOOD-RA-NEXT: pushl %esi -; 32-GOOD-RA-NEXT: pushl %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %edx -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-GOOD-RA-NEXT: lock cmpxchg8b (%esi) -; 32-GOOD-RA-NEXT: setne %bl -; 32-GOOD-RA-NEXT: subl $8, %esp -; 32-GOOD-RA-NEXT: pushl %edx -; 32-GOOD-RA-NEXT: pushl %eax -; 32-GOOD-RA-NEXT: calll bar@PLT -; 32-GOOD-RA-NEXT: addl $16, %esp -; 32-GOOD-RA-NEXT: testb %bl, %bl -; 32-GOOD-RA-NEXT: jne .LBB0_3 -; 32-GOOD-RA-NEXT: # %bb.1: # %t -; 32-GOOD-RA-NEXT: movl $42, %eax -; 32-GOOD-RA-NEXT: jmp .LBB0_2 -; 32-GOOD-RA-NEXT: .LBB0_3: # %f -; 32-GOOD-RA-NEXT: xorl %eax, %eax -; 32-GOOD-RA-NEXT: .LBB0_2: # %t -; 32-GOOD-RA-NEXT: xorl %edx, %edx -; 32-GOOD-RA-NEXT: addl $4, %esp -; 32-GOOD-RA-NEXT: popl %esi -; 32-GOOD-RA-NEXT: popl %ebx -; 32-GOOD-RA-NEXT: retl +; X86-GOOD-RA-LABEL: test_intervening_call: +; X86-GOOD-RA: # %bb.0: # %entry +; X86-GOOD-RA-NEXT: pushl %ebx +; X86-GOOD-RA-NEXT: pushl %esi +; X86-GOOD-RA-NEXT: pushl %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-GOOD-RA-NEXT: lock cmpxchg8b (%esi) +; X86-GOOD-RA-NEXT: setne %bl +; X86-GOOD-RA-NEXT: subl $8, %esp +; X86-GOOD-RA-NEXT: pushl %edx +; X86-GOOD-RA-NEXT: pushl %eax +; X86-GOOD-RA-NEXT: calll bar@PLT +; X86-GOOD-RA-NEXT: addl $16, %esp +; X86-GOOD-RA-NEXT: testb %bl, %bl +; X86-GOOD-RA-NEXT: jne .LBB0_3 +; X86-GOOD-RA-NEXT: # %bb.1: # %t +; X86-GOOD-RA-NEXT: movl $42, %eax +; X86-GOOD-RA-NEXT: jmp .LBB0_2 +; X86-GOOD-RA-NEXT: .LBB0_3: # %f +; X86-GOOD-RA-NEXT: xorl %eax, %eax +; X86-GOOD-RA-NEXT: .LBB0_2: # %t +; X86-GOOD-RA-NEXT: xorl %edx, %edx +; X86-GOOD-RA-NEXT: addl $4, %esp +; X86-GOOD-RA-NEXT: popl %esi +; X86-GOOD-RA-NEXT: popl %ebx +; X86-GOOD-RA-NEXT: retl ; -; 32-FAST-RA-LABEL: test_intervening_call: -; 32-FAST-RA: # %bb.0: # %entry -; 32-FAST-RA-NEXT: pushl %ebx -; 32-FAST-RA-NEXT: pushl %esi -; 32-FAST-RA-NEXT: pushl %eax -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %edx -; 32-FAST-RA-NEXT: lock cmpxchg8b (%esi) -; 32-FAST-RA-NEXT: setne %bl -; 32-FAST-RA-NEXT: subl $8, %esp -; 32-FAST-RA-NEXT: pushl %edx -; 32-FAST-RA-NEXT: pushl %eax -; 32-FAST-RA-NEXT: calll bar@PLT -; 32-FAST-RA-NEXT: addl $16, %esp -; 32-FAST-RA-NEXT: testb %bl, %bl -; 32-FAST-RA-NEXT: jne .LBB0_3 -; 32-FAST-RA-NEXT: # %bb.1: # %t -; 32-FAST-RA-NEXT: movl $42, %eax -; 32-FAST-RA-NEXT: jmp .LBB0_2 -; 32-FAST-RA-NEXT: .LBB0_3: # %f -; 32-FAST-RA-NEXT: xorl %eax, %eax -; 32-FAST-RA-NEXT: .LBB0_2: # %t -; 32-FAST-RA-NEXT: xorl %edx, %edx -; 32-FAST-RA-NEXT: addl $4, %esp -; 32-FAST-RA-NEXT: popl %esi -; 32-FAST-RA-NEXT: popl %ebx -; 32-FAST-RA-NEXT: retl +; X86-FAST-RA-LABEL: test_intervening_call: +; X86-FAST-RA: # %bb.0: # %entry +; X86-FAST-RA-NEXT: pushl %ebx +; X86-FAST-RA-NEXT: pushl %esi +; X86-FAST-RA-NEXT: pushl %eax +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi 
+; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-RA-NEXT: lock cmpxchg8b (%esi) +; X86-FAST-RA-NEXT: setne %bl +; X86-FAST-RA-NEXT: subl $8, %esp +; X86-FAST-RA-NEXT: pushl %edx +; X86-FAST-RA-NEXT: pushl %eax +; X86-FAST-RA-NEXT: calll bar@PLT +; X86-FAST-RA-NEXT: addl $16, %esp +; X86-FAST-RA-NEXT: testb %bl, %bl +; X86-FAST-RA-NEXT: jne .LBB0_3 +; X86-FAST-RA-NEXT: # %bb.1: # %t +; X86-FAST-RA-NEXT: movl $42, %eax +; X86-FAST-RA-NEXT: jmp .LBB0_2 +; X86-FAST-RA-NEXT: .LBB0_3: # %f +; X86-FAST-RA-NEXT: xorl %eax, %eax +; X86-FAST-RA-NEXT: .LBB0_2: # %t +; X86-FAST-RA-NEXT: xorl %edx, %edx +; X86-FAST-RA-NEXT: addl $4, %esp +; X86-FAST-RA-NEXT: popl %esi +; X86-FAST-RA-NEXT: popl %ebx +; X86-FAST-RA-NEXT: retl ; -; 64-ALL-LABEL: test_intervening_call: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: pushq %rbx -; 64-ALL-NEXT: movq %rsi, %rax -; 64-ALL-NEXT: lock cmpxchgq %rdx, (%rdi) -; 64-ALL-NEXT: setne %bl -; 64-ALL-NEXT: movq %rax, %rdi -; 64-ALL-NEXT: callq bar@PLT -; 64-ALL-NEXT: testb %bl, %bl -; 64-ALL-NEXT: jne .LBB0_2 -; 64-ALL-NEXT: # %bb.1: # %t -; 64-ALL-NEXT: movl $42, %eax -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: retq -; 64-ALL-NEXT: .LBB0_2: # %f -; 64-ALL-NEXT: xorl %eax, %eax -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: retq +; X64-ALL-LABEL: test_intervening_call: +; X64-ALL: # %bb.0: # %entry +; X64-ALL-NEXT: pushq %rbx +; X64-ALL-NEXT: movq %rsi, %rax +; X64-ALL-NEXT: lock cmpxchgq %rdx, (%rdi) +; X64-ALL-NEXT: setne %bl +; X64-ALL-NEXT: movq %rax, %rdi +; X64-ALL-NEXT: callq bar@PLT +; X64-ALL-NEXT: testb %bl, %bl +; X64-ALL-NEXT: jne .LBB0_2 +; X64-ALL-NEXT: # %bb.1: # %t +; X64-ALL-NEXT: movl $42, %eax +; X64-ALL-NEXT: popq %rbx +; X64-ALL-NEXT: retq +; X64-ALL-NEXT: .LBB0_2: # %f +; X64-ALL-NEXT: xorl %eax, %eax +; X64-ALL-NEXT: popq %rbx +; X64-ALL-NEXT: retq entry: %cx = cmpxchg ptr %foo, i64 %bar, i64 %baz seq_cst seq_cst %v = extractvalue { i64, i1 } %cx, 0 @@ -120,61 +120,61 @@ f: ; Interesting in producing a clobber without any function calls. 
define i32 @test_control_flow(ptr %p, i32 %i, i32 %j) nounwind { -; 32-ALL-LABEL: test_control_flow: -; 32-ALL: # %bb.0: # %entry -; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-ALL-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; 32-ALL-NEXT: jle .LBB1_6 -; 32-ALL-NEXT: # %bb.1: # %loop_start -; 32-ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-ALL-NEXT: .p2align 4, 0x90 -; 32-ALL-NEXT: .LBB1_2: # %while.condthread-pre-split.i -; 32-ALL-NEXT: # =>This Loop Header: Depth=1 -; 32-ALL-NEXT: # Child Loop BB1_3 Depth 2 -; 32-ALL-NEXT: movl (%ecx), %edx -; 32-ALL-NEXT: .p2align 4, 0x90 -; 32-ALL-NEXT: .LBB1_3: # %while.cond.i -; 32-ALL-NEXT: # Parent Loop BB1_2 Depth=1 -; 32-ALL-NEXT: # => This Inner Loop Header: Depth=2 -; 32-ALL-NEXT: movl %edx, %eax -; 32-ALL-NEXT: xorl %edx, %edx -; 32-ALL-NEXT: testl %eax, %eax -; 32-ALL-NEXT: je .LBB1_3 -; 32-ALL-NEXT: # %bb.4: # %while.body.i -; 32-ALL-NEXT: # in Loop: Header=BB1_2 Depth=1 -; 32-ALL-NEXT: lock cmpxchgl %eax, (%ecx) -; 32-ALL-NEXT: jne .LBB1_2 -; 32-ALL-NEXT: # %bb.5: -; 32-ALL-NEXT: xorl %eax, %eax -; 32-ALL-NEXT: .LBB1_6: # %cond.end -; 32-ALL-NEXT: retl +; X86-ALL-LABEL: test_control_flow: +; X86-ALL: # %bb.0: # %entry +; X86-ALL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-ALL-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-ALL-NEXT: jle .LBB1_6 +; X86-ALL-NEXT: # %bb.1: # %loop_start +; X86-ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-ALL-NEXT: .p2align 4, 0x90 +; X86-ALL-NEXT: .LBB1_2: # %while.condthread-pre-split.i +; X86-ALL-NEXT: # =>This Loop Header: Depth=1 +; X86-ALL-NEXT: # Child Loop BB1_3 Depth 2 +; X86-ALL-NEXT: movl (%ecx), %edx +; X86-ALL-NEXT: .p2align 4, 0x90 +; X86-ALL-NEXT: .LBB1_3: # %while.cond.i +; X86-ALL-NEXT: # Parent Loop BB1_2 Depth=1 +; X86-ALL-NEXT: # => This Inner Loop Header: Depth=2 +; X86-ALL-NEXT: movl %edx, %eax +; X86-ALL-NEXT: xorl %edx, %edx +; X86-ALL-NEXT: testl %eax, %eax +; X86-ALL-NEXT: je .LBB1_3 +; X86-ALL-NEXT: # %bb.4: # %while.body.i +; X86-ALL-NEXT: # in Loop: Header=BB1_2 Depth=1 +; X86-ALL-NEXT: lock cmpxchgl %eax, (%ecx) +; X86-ALL-NEXT: jne .LBB1_2 +; X86-ALL-NEXT: # %bb.5: +; X86-ALL-NEXT: xorl %eax, %eax +; X86-ALL-NEXT: .LBB1_6: # %cond.end +; X86-ALL-NEXT: retl ; -; 64-ALL-LABEL: test_control_flow: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: movl %esi, %eax -; 64-ALL-NEXT: cmpl %edx, %esi -; 64-ALL-NEXT: jle .LBB1_5 -; 64-ALL-NEXT: .p2align 4, 0x90 -; 64-ALL-NEXT: .LBB1_1: # %while.condthread-pre-split.i -; 64-ALL-NEXT: # =>This Loop Header: Depth=1 -; 64-ALL-NEXT: # Child Loop BB1_2 Depth 2 -; 64-ALL-NEXT: movl (%rdi), %ecx -; 64-ALL-NEXT: .p2align 4, 0x90 -; 64-ALL-NEXT: .LBB1_2: # %while.cond.i -; 64-ALL-NEXT: # Parent Loop BB1_1 Depth=1 -; 64-ALL-NEXT: # => This Inner Loop Header: Depth=2 -; 64-ALL-NEXT: movl %ecx, %eax -; 64-ALL-NEXT: xorl %ecx, %ecx -; 64-ALL-NEXT: testl %eax, %eax -; 64-ALL-NEXT: je .LBB1_2 -; 64-ALL-NEXT: # %bb.3: # %while.body.i -; 64-ALL-NEXT: # in Loop: Header=BB1_1 Depth=1 -; 64-ALL-NEXT: lock cmpxchgl %eax, (%rdi) -; 64-ALL-NEXT: jne .LBB1_1 -; 64-ALL-NEXT: # %bb.4: -; 64-ALL-NEXT: xorl %eax, %eax -; 64-ALL-NEXT: .LBB1_5: # %cond.end -; 64-ALL-NEXT: retq +; X64-ALL-LABEL: test_control_flow: +; X64-ALL: # %bb.0: # %entry +; X64-ALL-NEXT: movl %esi, %eax +; X64-ALL-NEXT: cmpl %edx, %esi +; X64-ALL-NEXT: jle .LBB1_5 +; X64-ALL-NEXT: .p2align 4, 0x90 +; X64-ALL-NEXT: .LBB1_1: # %while.condthread-pre-split.i +; X64-ALL-NEXT: # =>This Loop Header: Depth=1 +; X64-ALL-NEXT: # Child Loop BB1_2 Depth 2 +; X64-ALL-NEXT: movl (%rdi), %ecx +; X64-ALL-NEXT: .p2align 4, 0x90 +; 
X64-ALL-NEXT: .LBB1_2: # %while.cond.i +; X64-ALL-NEXT: # Parent Loop BB1_1 Depth=1 +; X64-ALL-NEXT: # => This Inner Loop Header: Depth=2 +; X64-ALL-NEXT: movl %ecx, %eax +; X64-ALL-NEXT: xorl %ecx, %ecx +; X64-ALL-NEXT: testl %eax, %eax +; X64-ALL-NEXT: je .LBB1_2 +; X64-ALL-NEXT: # %bb.3: # %while.body.i +; X64-ALL-NEXT: # in Loop: Header=BB1_1 Depth=1 +; X64-ALL-NEXT: lock cmpxchgl %eax, (%rdi) +; X64-ALL-NEXT: jne .LBB1_1 +; X64-ALL-NEXT: # %bb.4: +; X64-ALL-NEXT: xorl %eax, %eax +; X64-ALL-NEXT: .LBB1_5: # %cond.end +; X64-ALL-NEXT: retq entry: %cmp = icmp sgt i32 %i, %j br i1 %cmp, label %loop_start, label %cond.end @@ -208,66 +208,66 @@ cond.end: ; This one is an interesting case because CMOV doesn't have a chain ; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here. define i32 @test_feed_cmov(ptr %addr, i32 %desired, i32 %new) nounwind { -; 32-GOOD-RA-LABEL: test_feed_cmov: -; 32-GOOD-RA: # %bb.0: # %entry -; 32-GOOD-RA-NEXT: pushl %ebx -; 32-GOOD-RA-NEXT: pushl %esi -; 32-GOOD-RA-NEXT: pushl %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-GOOD-RA-NEXT: lock cmpxchgl %esi, (%ecx) -; 32-GOOD-RA-NEXT: sete %bl -; 32-GOOD-RA-NEXT: calll foo@PLT -; 32-GOOD-RA-NEXT: testb %bl, %bl -; 32-GOOD-RA-NEXT: jne .LBB2_2 -; 32-GOOD-RA-NEXT: # %bb.1: # %entry -; 32-GOOD-RA-NEXT: movl %eax, %esi -; 32-GOOD-RA-NEXT: .LBB2_2: # %entry -; 32-GOOD-RA-NEXT: movl %esi, %eax -; 32-GOOD-RA-NEXT: addl $4, %esp -; 32-GOOD-RA-NEXT: popl %esi -; 32-GOOD-RA-NEXT: popl %ebx -; 32-GOOD-RA-NEXT: retl +; X86-GOOD-RA-LABEL: test_feed_cmov: +; X86-GOOD-RA: # %bb.0: # %entry +; X86-GOOD-RA-NEXT: pushl %ebx +; X86-GOOD-RA-NEXT: pushl %esi +; X86-GOOD-RA-NEXT: pushl %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-GOOD-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-GOOD-RA-NEXT: lock cmpxchgl %esi, (%ecx) +; X86-GOOD-RA-NEXT: sete %bl +; X86-GOOD-RA-NEXT: calll foo@PLT +; X86-GOOD-RA-NEXT: testb %bl, %bl +; X86-GOOD-RA-NEXT: jne .LBB2_2 +; X86-GOOD-RA-NEXT: # %bb.1: # %entry +; X86-GOOD-RA-NEXT: movl %eax, %esi +; X86-GOOD-RA-NEXT: .LBB2_2: # %entry +; X86-GOOD-RA-NEXT: movl %esi, %eax +; X86-GOOD-RA-NEXT: addl $4, %esp +; X86-GOOD-RA-NEXT: popl %esi +; X86-GOOD-RA-NEXT: popl %ebx +; X86-GOOD-RA-NEXT: retl ; -; 32-FAST-RA-LABEL: test_feed_cmov: -; 32-FAST-RA: # %bb.0: # %entry -; 32-FAST-RA-NEXT: pushl %ebx -; 32-FAST-RA-NEXT: pushl %esi -; 32-FAST-RA-NEXT: pushl %eax -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi -; 32-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-FAST-RA-NEXT: lock cmpxchgl %esi, (%ecx) -; 32-FAST-RA-NEXT: sete %bl -; 32-FAST-RA-NEXT: calll foo@PLT -; 32-FAST-RA-NEXT: testb %bl, %bl -; 32-FAST-RA-NEXT: jne .LBB2_2 -; 32-FAST-RA-NEXT: # %bb.1: # %entry -; 32-FAST-RA-NEXT: movl %eax, %esi -; 32-FAST-RA-NEXT: .LBB2_2: # %entry -; 32-FAST-RA-NEXT: movl %esi, %eax -; 32-FAST-RA-NEXT: addl $4, %esp -; 32-FAST-RA-NEXT: popl %esi -; 32-FAST-RA-NEXT: popl %ebx -; 32-FAST-RA-NEXT: retl +; X86-FAST-RA-LABEL: test_feed_cmov: +; X86-FAST-RA: # %bb.0: # %entry +; X86-FAST-RA-NEXT: pushl %ebx +; X86-FAST-RA-NEXT: pushl %esi +; X86-FAST-RA-NEXT: pushl %eax +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-FAST-RA-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-RA-NEXT: lock cmpxchgl %esi, (%ecx) +; X86-FAST-RA-NEXT: sete %bl +; 
X86-FAST-RA-NEXT: calll foo@PLT +; X86-FAST-RA-NEXT: testb %bl, %bl +; X86-FAST-RA-NEXT: jne .LBB2_2 +; X86-FAST-RA-NEXT: # %bb.1: # %entry +; X86-FAST-RA-NEXT: movl %eax, %esi +; X86-FAST-RA-NEXT: .LBB2_2: # %entry +; X86-FAST-RA-NEXT: movl %esi, %eax +; X86-FAST-RA-NEXT: addl $4, %esp +; X86-FAST-RA-NEXT: popl %esi +; X86-FAST-RA-NEXT: popl %ebx +; X86-FAST-RA-NEXT: retl ; -; 64-ALL-LABEL: test_feed_cmov: -; 64-ALL: # %bb.0: # %entry -; 64-ALL-NEXT: pushq %rbp -; 64-ALL-NEXT: pushq %rbx -; 64-ALL-NEXT: pushq %rax -; 64-ALL-NEXT: movl %edx, %ebx -; 64-ALL-NEXT: movl %esi, %eax -; 64-ALL-NEXT: lock cmpxchgl %edx, (%rdi) -; 64-ALL-NEXT: sete %bpl -; 64-ALL-NEXT: callq foo@PLT -; 64-ALL-NEXT: testb %bpl, %bpl -; 64-ALL-NEXT: cmovnel %ebx, %eax -; 64-ALL-NEXT: addq $8, %rsp -; 64-ALL-NEXT: popq %rbx -; 64-ALL-NEXT: popq %rbp -; 64-ALL-NEXT: retq +; X64-ALL-LABEL: test_feed_cmov: +; X64-ALL: # %bb.0: # %entry +; X64-ALL-NEXT: pushq %rbp +; X64-ALL-NEXT: pushq %rbx +; X64-ALL-NEXT: pushq %rax +; X64-ALL-NEXT: movl %edx, %ebx +; X64-ALL-NEXT: movl %esi, %eax +; X64-ALL-NEXT: lock cmpxchgl %edx, (%rdi) +; X64-ALL-NEXT: sete %bpl +; X64-ALL-NEXT: callq foo@PLT +; X64-ALL-NEXT: testb %bpl, %bpl +; X64-ALL-NEXT: cmovnel %ebx, %eax +; X64-ALL-NEXT: addq $8, %rsp +; X64-ALL-NEXT: popq %rbx +; X64-ALL-NEXT: popq %rbp +; X64-ALL-NEXT: retq entry: %res = cmpxchg ptr %addr, i32 %desired, i32 %new seq_cst seq_cst %success = extractvalue { i32, i1 } %res, 1 From e26eacf771fed3226058a84d5d83f94994f583b2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 14:10:56 +0100 Subject: [PATCH 15/71] [X86] prefetch.ll - cleanup check prefixes identified in #92248 Avoid using leading numbers in check prefixes - replace with actual triple config names (and makes it easier to add X64 test coverage in a future commit). 
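
For illustration, the new naming leaves room to add 64-bit coverage
later with a RUN line along these lines (hypothetical, not part of this
commit):

  ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse | FileCheck %s --check-prefix=X64-SSE
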
--- llvm/test/CodeGen/X86/prefetch.ll | 138 +++++++++++++++--------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll index 3cfa0e3efcb1e1..404d49b63f25c9 100644 --- a/llvm/test/CodeGen/X86/prefetch.ll +++ b/llvm/test/CodeGen/X86/prefetch.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHWSSE -; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=SSE -; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 -; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 -; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 -; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=3DNOW -; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=3DNOW +; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW +; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW ; Rules: ; 3dnow by itself get you just the single prefetch instruction with no hints @@ -22,67 +22,67 @@ ; rdar://10538297 define void @t(ptr %ptr) nounwind { -; SSE-LABEL: t: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-NEXT: prefetcht2 (%eax) -; SSE-NEXT: prefetcht1 (%eax) -; SSE-NEXT: prefetcht0 (%eax) -; SSE-NEXT: prefetchnta (%eax) -; SSE-NEXT: prefetcht2 (%eax) -; SSE-NEXT: prefetcht1 (%eax) -; SSE-NEXT: prefetcht0 (%eax) -; SSE-NEXT: prefetchnta (%eax) -; SSE-NEXT: retl +; X86-SSE-LABEL: t: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: prefetcht2 (%eax) +; X86-SSE-NEXT: prefetcht1 (%eax) +; X86-SSE-NEXT: prefetcht0 (%eax) +; X86-SSE-NEXT: prefetchnta (%eax) +; X86-SSE-NEXT: prefetcht2 (%eax) +; X86-SSE-NEXT: prefetcht1 (%eax) +; X86-SSE-NEXT: prefetcht0 (%eax) +; X86-SSE-NEXT: prefetchnta (%eax) +; X86-SSE-NEXT: retl ; -; 
PRFCHWSSE-LABEL: t: -; PRFCHWSSE: # %bb.0: # %entry -; PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; PRFCHWSSE-NEXT: prefetcht2 (%eax) -; PRFCHWSSE-NEXT: prefetcht1 (%eax) -; PRFCHWSSE-NEXT: prefetcht0 (%eax) -; PRFCHWSSE-NEXT: prefetchnta (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: prefetchw (%eax) -; PRFCHWSSE-NEXT: retl +; X86-PRFCHWSSE-LABEL: t: +; X86-PRFCHWSSE: # %bb.0: # %entry +; X86-PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-PRFCHWSSE-NEXT: prefetcht2 (%eax) +; X86-PRFCHWSSE-NEXT: prefetcht1 (%eax) +; X86-PRFCHWSSE-NEXT: prefetcht0 (%eax) +; X86-PRFCHWSSE-NEXT: prefetchnta (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: prefetchw (%eax) +; X86-PRFCHWSSE-NEXT: retl ; -; PREFETCHWT1-LABEL: t: -; PREFETCHWT1: # %bb.0: # %entry -; PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax -; PREFETCHWT1-NEXT: prefetcht2 (%eax) -; PREFETCHWT1-NEXT: prefetcht1 (%eax) -; PREFETCHWT1-NEXT: prefetcht0 (%eax) -; PREFETCHWT1-NEXT: prefetchnta (%eax) -; PREFETCHWT1-NEXT: prefetchwt1 (%eax) -; PREFETCHWT1-NEXT: prefetchwt1 (%eax) -; PREFETCHWT1-NEXT: prefetchw (%eax) -; PREFETCHWT1-NEXT: prefetchwt1 (%eax) -; PREFETCHWT1-NEXT: retl +; X86-PREFETCHWT1-LABEL: t: +; X86-PREFETCHWT1: # %bb.0: # %entry +; X86-PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-PREFETCHWT1-NEXT: prefetcht2 (%eax) +; X86-PREFETCHWT1-NEXT: prefetcht1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetcht0 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchnta (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: prefetchw (%eax) +; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; X86-PREFETCHWT1-NEXT: retl ; -; 3DNOW-LABEL: t: -; 3DNOW: # %bb.0: # %entry -; 3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetch (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: prefetchw (%eax) -; 3DNOW-NEXT: retl +; X86-3DNOW-LABEL: t: +; X86-3DNOW: # %bb.0: # %entry +; X86-3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetch (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: prefetchw (%eax) +; X86-3DNOW-NEXT: retl entry: - tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 1, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 2, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 3, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 0, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 1, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 2, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 3, i32 1 ) - tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 0, i32 1 ) - ret void + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 1, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 2, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 3, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 0, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 1, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 2, i32 1 ) + tail call void 
@llvm.prefetch( ptr %ptr, i32 1, i32 3, i32 1 ) + tail call void @llvm.prefetch( ptr %ptr, i32 1, i32 0, i32 1 ) + ret void } declare void @llvm.prefetch(ptr, i32, i32, i32) nounwind From 3f07430c383dffad77a120c91df79cbc7d99313c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 14:23:46 +0100 Subject: [PATCH 16/71] [X86] avoid-sfb-g-no-change.mir - cleanup check prefixes identified in #92248 Don't include "-LABEL" (or any other FileCheck modifier) in the core check prefix name --- llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir index 679a908893f6ea..93292990dee02a 100644 --- a/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir +++ b/llvm/test/CodeGen/X86/avoid-sfb-g-no-change.mir @@ -1,5 +1,5 @@ -# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes DEBUG-LABEL,CHECK -# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes NODEBUG-LABEL,CHECK +# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes=CHECK,DEBUG +# RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s -check-prefixes=CHECK,NODEBUG # # This was generated from: # @@ -202,8 +202,8 @@ body: | MOVAPSmr %1, 1, $noreg, 0, $noreg, killed %2 :: (store (s128) into %ir.p2) RET 0 - ; DEBUG-LABEL: name: debug - ; NODEBUG-LABEL: name: nodebug + ; DEBUG: name: debug + ; NODEBUG: name: nodebug ; CHECK: %1:gr64 = COPY ; CHECK: %0:gr64 = COPY ; CHECK: MOV8mi @@ -218,5 +218,5 @@ body: | ; CHECK: %7:gr8 = MOV8rm ; CHECK: MOV8mr ; CHECK: RET 0 - ; DEBUG-LABEL: name: nodebug + ; DEBUG: name: nodebug ... From f8395f8420cee8fc0854f43c9e88819c0ed54696 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 14:25:29 +0100 Subject: [PATCH 17/71] [X86] Cleanup check prefixes identified in #92248 Avoid using leading numbers in check prefixes - replace with actual triple config names. --- .../align-branch-boundary-suppressions-tls.ll | 22 +-- llvm/test/CodeGen/X86/asm-modifier.ll | 30 +-- llvm/test/CodeGen/X86/pr32345.ll | 180 +++++++++--------- llvm/test/CodeGen/X86/x32-va_start.ll | 46 ++--- 4 files changed, 139 insertions(+), 139 deletions(-) diff --git a/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll b/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll index fd58cfeb65fd74..0e2e6f3ef81df5 100644 --- a/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll +++ b/llvm/test/CodeGen/X86/align-branch-boundary-suppressions-tls.ll @@ -2,8 +2,8 @@ ;; sequence. It uses prefixes to allow linker relaxation. We need to disable ;; prefix or nop padding for it. For simplicity and consistency, disable for ;; Local Dynamic and 32-bit as well. 
-; RUN: llc -mtriple=i386 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,32 %s -; RUN: llc -mtriple=x86_64 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,64 %s +; RUN: llc -mtriple=i386 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,X86 %s +; RUN: llc -mtriple=x86_64 -relocation-model=pic -x86-branches-within-32B-boundaries < %s | FileCheck --check-prefixes=CHECK,X64 %s @gd = external thread_local global i32 @ld = internal thread_local global i32 0 @@ -11,17 +11,17 @@ define i32 @tls_get_addr() { ; CHECK-LABEL: tls_get_addr: ; CHECK: #noautopadding -; 32: leal gd@TLSGD(,%ebx), %eax -; 32: calll ___tls_get_addr@PLT -; 64: data16 -; 64: leaq gd@TLSGD(%rip), %rdi -; 64: callq __tls_get_addr@PLT +; X86: leal gd@TLSGD(,%ebx), %eax +; X86: calll ___tls_get_addr@PLT +; X64: data16 +; X64: leaq gd@TLSGD(%rip), %rdi +; X64: callq __tls_get_addr@PLT ; CHECK: #autopadding ; CHECK: #noautopadding -; 32: leal ld@TLSLDM(%ebx), %eax -; 32: calll ___tls_get_addr@PLT -; 64: leaq ld@TLSLD(%rip), %rdi -; 64: callq __tls_get_addr@PLT +; X86: leal ld@TLSLDM(%ebx), %eax +; X86: calll ___tls_get_addr@PLT +; X64: leaq ld@TLSLD(%rip), %rdi +; X64: callq __tls_get_addr@PLT ; CHECK: #autopadding %1 = load i32, ptr @gd %2 = load i32, ptr @ld diff --git a/llvm/test/CodeGen/X86/asm-modifier.ll b/llvm/test/CodeGen/X86/asm-modifier.ll index c121b46f845065..9a69402d221689 100644 --- a/llvm/test/CodeGen/X86/asm-modifier.ll +++ b/llvm/test/CodeGen/X86/asm-modifier.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=i686 < %s | FileCheck %s --check-prefixes=CHECK,32 -; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,64 +; RUN: llc -mtriple=i686 < %s | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,X64 @var = internal global i32 0, align 4 @@ -43,20 +43,20 @@ entry: } define void @test_V(ptr %p) { -; 32-LABEL: test_V: -; 32: # %bb.0: # %entry -; 32-NEXT: movl {{[0-9]+}}(%esp), %eax -; 32-NEXT: #APP -; 32-NEXT: calll __x86_indirect_thunk_eax -; 32-NEXT: #NO_APP -; 32-NEXT: retl +; X86-LABEL: test_V: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: #APP +; X86-NEXT: calll __x86_indirect_thunk_eax +; X86-NEXT: #NO_APP +; X86-NEXT: retl ; -; 64-LABEL: test_V: -; 64: # %bb.0: # %entry -; 64-NEXT: #APP -; 64-NEXT: callq __x86_indirect_thunk_rdi -; 64-NEXT: #NO_APP -; 64-NEXT: retq +; X64-LABEL: test_V: +; X64: # %bb.0: # %entry +; X64-NEXT: #APP +; X64-NEXT: callq __x86_indirect_thunk_rdi +; X64-NEXT: #NO_APP +; X64-NEXT: retq entry: tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(ptr %p) ret void diff --git a/llvm/test/CodeGen/X86/pr32345.ll b/llvm/test/CodeGen/X86/pr32345.ll index 2745cb8bb908bb..c7405e982660c0 100644 --- a/llvm/test/CodeGen/X86/pr32345.ll +++ b/llvm/test/CodeGen/X86/pr32345.ll @@ -1,74 +1,74 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X640 -; RUN: llc -O0 -mtriple=i686-unknown -o - %s | FileCheck %s -check-prefix=6860 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X64 -; RUN: llc -mtriple=i686-unknown -o - %s | FileCheck %s -check-prefix=686 +; RUN: llc -O0 
-mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=X64-O0 +; RUN: llc -O0 -mtriple=i686-unknown < %s | FileCheck %s -check-prefix=X86-O0 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=X64 +; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s -check-prefix=X86 @var_22 = external dso_local global i16, align 2 @var_27 = external dso_local global i16, align 2 define void @foo() { -; X640-LABEL: foo: -; X640: # %bb.0: # %bb -; X640-NEXT: movzwl var_22, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X640-NEXT: movzwl var_22, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: xorl %ecx, %eax -; X640-NEXT: cltq -; X640-NEXT: movzwl var_27, %ecx -; X640-NEXT: subl $16610, %ecx # imm = 0x40E2 -; X640-NEXT: movl %ecx, %ecx -; X640-NEXT: # kill: def $rcx killed $ecx -; X640-NEXT: # kill: def $cl killed $rcx -; X640-NEXT: sarq %cl, %rax -; X640-NEXT: movb %al, %cl -; X640-NEXT: # implicit-def: $rax -; X640-NEXT: movb %cl, (%rax) -; X640-NEXT: retq +; X64-O0-LABEL: foo: +; X64-O0: # %bb.0: # %bb +; X64-O0-NEXT: movzwl var_22, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: cltq +; X64-O0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-O0-NEXT: movzwl var_22, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: xorl %ecx, %eax +; X64-O0-NEXT: cltq +; X64-O0-NEXT: movzwl var_27, %ecx +; X64-O0-NEXT: subl $16610, %ecx # imm = 0x40E2 +; X64-O0-NEXT: movl %ecx, %ecx +; X64-O0-NEXT: # kill: def $rcx killed $ecx +; X64-O0-NEXT: # kill: def $cl killed $rcx +; X64-O0-NEXT: sarq %cl, %rax +; X64-O0-NEXT: movb %al, %cl +; X64-O0-NEXT: # implicit-def: $rax +; X64-O0-NEXT: movb %cl, (%rax) +; X64-O0-NEXT: retq ; -; 6860-LABEL: foo: -; 6860: # %bb.0: # %bb -; 6860-NEXT: pushl %ebp -; 6860-NEXT: .cfi_def_cfa_offset 8 -; 6860-NEXT: .cfi_offset %ebp, -8 -; 6860-NEXT: movl %esp, %ebp -; 6860-NEXT: .cfi_def_cfa_register %ebp -; 6860-NEXT: andl $-8, %esp -; 6860-NEXT: subl $24, %esp -; 6860-NEXT: movzwl var_22, %eax -; 6860-NEXT: movl %eax, {{[0-9]+}}(%esp) -; 6860-NEXT: movl $0, {{[0-9]+}}(%esp) -; 6860-NEXT: movzwl var_22, %edx -; 6860-NEXT: movb var_27, %cl -; 6860-NEXT: addb $30, %cl -; 6860-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; 6860-NEXT: xorl %eax, %eax -; 6860-NEXT: shrdl %cl, %eax, %edx -; 6860-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; 6860-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: testb $32, %cl -; 6860-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: jne .LBB0_2 -; 6860-NEXT: # %bb.1: # %bb -; 6860-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 6860-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 6860-NEXT: .LBB0_2: # %bb -; 6860-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 6860-NEXT: movb %al, %cl -; 6860-NEXT: # implicit-def: $eax -; 6860-NEXT: movb %cl, (%eax) -; 6860-NEXT: movl %ebp, %esp -; 6860-NEXT: popl %ebp -; 6860-NEXT: .cfi_def_cfa %esp, 4 -; 6860-NEXT: retl +; X86-O0-LABEL: foo: +; X86-O0: # %bb.0: # %bb +; X86-O0-NEXT: pushl %ebp +; X86-O0-NEXT: .cfi_def_cfa_offset 8 +; X86-O0-NEXT: .cfi_offset %ebp, -8 +; X86-O0-NEXT: movl %esp, %ebp +; 
X86-O0-NEXT: .cfi_def_cfa_register %ebp +; X86-O0-NEXT: andl $-8, %esp +; X86-O0-NEXT: subl $24, %esp +; X86-O0-NEXT: movzwl var_22, %eax +; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-O0-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-O0-NEXT: movzwl var_22, %edx +; X86-O0-NEXT: movb var_27, %cl +; X86-O0-NEXT: addb $30, %cl +; X86-O0-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-O0-NEXT: xorl %eax, %eax +; X86-O0-NEXT: shrdl %cl, %eax, %edx +; X86-O0-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload +; X86-O0-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-O0-NEXT: testb $32, %cl +; X86-O0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-O0-NEXT: jne .LBB0_2 +; X86-O0-NEXT: # %bb.1: # %bb +; X86-O0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-O0-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-O0-NEXT: .LBB0_2: # %bb +; X86-O0-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-O0-NEXT: movb %al, %cl +; X86-O0-NEXT: # implicit-def: $eax +; X86-O0-NEXT: movb %cl, (%eax) +; X86-O0-NEXT: movl %ebp, %esp +; X86-O0-NEXT: popl %ebp +; X86-O0-NEXT: .cfi_def_cfa %esp, 4 +; X86-O0-NEXT: retl ; ; X64-LABEL: foo: ; X64: # %bb.0: # %bb @@ -80,32 +80,32 @@ define void @foo() { ; X64-NEXT: movb %al, (%rax) ; X64-NEXT: retq ; -; 686-LABEL: foo: -; 686: # %bb.0: # %bb -; 686-NEXT: pushl %ebp -; 686-NEXT: .cfi_def_cfa_offset 8 -; 686-NEXT: .cfi_offset %ebp, -8 -; 686-NEXT: movl %esp, %ebp -; 686-NEXT: .cfi_def_cfa_register %ebp -; 686-NEXT: andl $-8, %esp -; 686-NEXT: subl $8, %esp -; 686-NEXT: movzbl var_27, %ecx -; 686-NEXT: movzwl var_22, %eax -; 686-NEXT: movl %eax, (%esp) -; 686-NEXT: movl $0, {{[0-9]+}}(%esp) -; 686-NEXT: addb $30, %cl -; 686-NEXT: xorl %edx, %edx -; 686-NEXT: shrdl %cl, %edx, %eax -; 686-NEXT: testb $32, %cl -; 686-NEXT: jne .LBB0_2 -; 686-NEXT: # %bb.1: # %bb -; 686-NEXT: movl %eax, %edx -; 686-NEXT: .LBB0_2: # %bb -; 686-NEXT: movb %dl, (%eax) -; 686-NEXT: movl %ebp, %esp -; 686-NEXT: popl %ebp -; 686-NEXT: .cfi_def_cfa %esp, 4 -; 686-NEXT: retl +; X86-LABEL: foo: +; X86: # %bb.0: # %bb +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $8, %esp +; X86-NEXT: movzbl var_27, %ecx +; X86-NEXT: movzwl var_22, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: addb $30, %cl +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: testb $32, %cl +; X86-NEXT: jne .LBB0_2 +; X86-NEXT: # %bb.1: # %bb +; X86-NEXT: movl %eax, %edx +; X86-NEXT: .LBB0_2: # %bb +; X86-NEXT: movb %dl, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl bb: %tmp = alloca i64, align 8 %tmp1 = load i16, ptr @var_22, align 2 diff --git a/llvm/test/CodeGen/X86/x32-va_start.ll b/llvm/test/CodeGen/X86/x32-va_start.ll index e61e5765f124aa..31c8aee3fddec5 100644 --- a/llvm/test/CodeGen/X86/x32-va_start.ll +++ b/llvm/test/CodeGen/X86/x32-va_start.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s -check-prefix=SSE ; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mattr=-sse | FileCheck %s -check-prefix=NOSSE -; RUN: llc < %s -mtriple=i386-linux-gnux32 | FileCheck %s -check-prefix=32BITABI -; RUN: llc < %s -mtriple=i686-linux-gnux32 | FileCheck %s 
-check-prefix=32BITABI +; RUN: llc < %s -mtriple=i386-linux-gnux32 | FileCheck %s -check-prefix=X32BITABI +; RUN: llc < %s -mtriple=i686-linux-gnux32 | FileCheck %s -check-prefix=X32BITABI ; ; Verifies that x32 va_start lowering is sane. To regenerate this test, use ; cat < Date: Wed, 15 May 2024 18:57:49 +0530 Subject: [PATCH 18/71] Revert "[ExceptionDemo] Correct and update example ExceptionDemo" (#92257) Reverts llvm/llvm-project#69485 --- llvm/examples/ExceptionDemo/CMakeLists.txt | 4 +- llvm/examples/ExceptionDemo/ExceptionDemo.cpp | 181 ++++++++++-------- 2 files changed, 108 insertions(+), 77 deletions(-) diff --git a/llvm/examples/ExceptionDemo/CMakeLists.txt b/llvm/examples/ExceptionDemo/CMakeLists.txt index 0a60ad848dd406..793cf291ca6f11 100644 --- a/llvm/examples/ExceptionDemo/CMakeLists.txt +++ b/llvm/examples/ExceptionDemo/CMakeLists.txt @@ -1,7 +1,9 @@ set(LLVM_LINK_COMPONENTS Core ExecutionEngine - ORCJIT + MC + MCJIT + RuntimeDyld Support Target nativecodegen diff --git a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp index 41fa0cf626bf35..0afc6b30d140e9 100644 --- a/llvm/examples/ExceptionDemo/ExceptionDemo.cpp +++ b/llvm/examples/ExceptionDemo/ExceptionDemo.cpp @@ -49,9 +49,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" -#include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/MCJIT.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" @@ -85,8 +84,6 @@ #define USE_GLOBAL_STR_CONSTS true #endif -llvm::ExitOnError ExitOnErr; - // // Example types // @@ -145,7 +142,6 @@ static llvm::ConstantInt *ourExceptionCaughtState; typedef std::vector ArgNames; typedef std::vector ArgTypes; -typedef llvm::ArrayRef TypeArray; // // Code Generation Utilities @@ -896,10 +892,13 @@ void generateStringPrint(llvm::LLVMContext &context, /// generated, and is used to hold the constant string. A value of /// false indicates that the constant string will be stored on the /// stack. -void generateIntegerPrint(llvm::LLVMContext &context, llvm::Module &module, +void generateIntegerPrint(llvm::LLVMContext &context, + llvm::Module &module, llvm::IRBuilder<> &builder, - llvm::Function &printFunct, llvm::Value *toPrint, - std::string format, bool useGlobal = true) { + llvm::Function &printFunct, + llvm::Value &toPrint, + std::string format, + bool useGlobal = true) { llvm::Constant *stringConstant = llvm::ConstantDataArray::getString(context, format); llvm::Value *stringVar; @@ -921,9 +920,10 @@ void generateIntegerPrint(llvm::LLVMContext &context, llvm::Module &module, llvm::Value *cast = builder.CreateBitCast(stringVar, builder.getPtrTy()); - builder.CreateCall(&printFunct, {toPrint, cast}); + builder.CreateCall(&printFunct, {&toPrint, cast}); } + /// Generates code to handle finally block type semantics: always runs /// regardless of whether a thrown exception is passing through or the /// parent function is simply exiting. 
In addition to printing some state @@ -997,10 +997,10 @@ static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context, bufferToPrint.str(), USE_GLOBAL_STR_CONSTS); - llvm::SwitchInst *theSwitch = builder.CreateSwitch( - builder.CreateLoad(ourExceptionNotThrownState->getType(), - *exceptionCaughtFlag), - &terminatorBlock, 2); + llvm::SwitchInst *theSwitch = builder.CreateSwitch(builder.CreateLoad( + *exceptionCaughtFlag), + &terminatorBlock, + 2); theSwitch->addCase(ourExceptionCaughtState, &terminatorBlock); theSwitch->addCase(ourExceptionThrownState, &unwindResumeBlock); @@ -1186,7 +1186,7 @@ static llvm::Function *createCatchWrappedInvokeFunction( // Note: function handles NULL exceptions builder.CreateCall(deleteOurException, - builder.CreateLoad(builder.getPtrTy(), exceptionStorage)); + builder.CreateLoad(exceptionStorage)); builder.CreateRetVoid(); // Normal Block @@ -1206,8 +1206,7 @@ static llvm::Function *createCatchWrappedInvokeFunction( builder.SetInsertPoint(unwindResumeBlock); - builder.CreateResume( - builder.CreateLoad(ourCaughtResultType, caughtResultStorage)); + builder.CreateResume(builder.CreateLoad(caughtResultStorage)); // Exception Block @@ -1242,9 +1241,8 @@ static llvm::Function *createCatchWrappedInvokeFunction( // Retrieve exception_class member from thrown exception // (_Unwind_Exception instance). This member tells us whether or not // the exception is foreign. - llvm::Value *unwindExceptionClass = builder.CreateLoad( - builder.getInt64Ty(), - builder.CreateStructGEP( + llvm::Value *unwindExceptionClass = + builder.CreateLoad(builder.CreateStructGEP( ourUnwindExceptionType, builder.CreatePointerCast(unwindException, ourUnwindExceptionType->getPointerTo()), @@ -1280,9 +1278,9 @@ static llvm::Function *createCatchWrappedInvokeFunction( // // Note: ourBaseFromUnwindOffset is usually negative llvm::Value *typeInfoThrown = builder.CreatePointerCast( - builder.CreateConstGEP1_64(builder.getPtrTy(), unwindException, - ourBaseFromUnwindOffset), - ourExceptionType->getPointerTo()); + builder.CreateConstGEP1_64(unwindException, + ourBaseFromUnwindOffset), + ourExceptionType->getPointerTo()); // Retrieve thrown exception type info type // @@ -1291,15 +1289,17 @@ static llvm::Function *createCatchWrappedInvokeFunction( typeInfoThrown = builder.CreateStructGEP(ourExceptionType, typeInfoThrown, 0); llvm::Value *typeInfoThrownType = - builder.CreateStructGEP(ourTypeInfoType, typeInfoThrown, 0); + builder.CreateStructGEP(builder.getPtrTy(), typeInfoThrown, 0); - llvm::Value *ti8 = - builder.CreateLoad(builder.getInt8Ty(), typeInfoThrownType); - generateIntegerPrint(context, module, builder, *toPrint32Int, - builder.CreateZExt(ti8, builder.getInt32Ty()), + generateIntegerPrint(context, + module, + builder, + *toPrint32Int, + *(builder.CreateLoad(typeInfoThrownType)), "Gen: Exception type <%d> received (stack unwound) " " in " + - ourId + ".\n", + ourId + + ".\n", USE_GLOBAL_STR_CONSTS); // Route to matched type info catch block or run cleanup finally block @@ -1311,7 +1311,8 @@ static llvm::Function *createCatchWrappedInvokeFunction( for (unsigned i = 1; i <= numExceptionsToCatch; ++i) { nextTypeToCatch = i - 1; - switchToCatchBlock->addCase(llvm::ConstantInt::get(builder.getInt32Ty(), i), + switchToCatchBlock->addCase(llvm::ConstantInt::get( + llvm::Type::getInt32Ty(context), i), catchBlocks[nextTypeToCatch]); } @@ -1386,10 +1387,14 @@ createThrowExceptionFunction(llvm::Module &module, llvm::IRBuilder<> &builder, builder.SetInsertPoint(entryBlock); llvm::Function 
*toPrint32Int = module.getFunction("print32Int"); - generateIntegerPrint(context, module, builder, *toPrint32Int, - builder.CreateZExt(exceptionType, builder.getInt32Ty()), - "\nGen: About to throw exception type <%d> in " + ourId + - ".\n", + generateIntegerPrint(context, + module, + builder, + *toPrint32Int, + *exceptionType, + "\nGen: About to throw exception type <%d> in " + + ourId + + ".\n", USE_GLOBAL_STR_CONSTS); // Switches on runtime type info type value to determine whether or not @@ -1541,13 +1546,15 @@ typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow); /// @param function generated test function to run /// @param typeToThrow type info type of generated exception to throw, or /// indicator to cause foreign exception to be thrown. -static void runExceptionThrow(llvm::orc::LLJIT *JIT, std::string function, - int32_t typeToThrow) { +static +void runExceptionThrow(llvm::ExecutionEngine *engine, + llvm::Function *function, + int32_t typeToThrow) { // Find test's function pointer OurExceptionThrowFunctType functPtr = - reinterpret_cast(reinterpret_cast( - ExitOnErr(JIT->lookup(function)).getValue())); + reinterpret_cast( + reinterpret_cast(engine->getPointerToFunction(function))); try { // Run test @@ -1576,6 +1583,8 @@ static void runExceptionThrow(llvm::orc::LLJIT *JIT, std::string function, // End test functions // +typedef llvm::ArrayRef TypeArray; + /// This initialization routine creates type info globals and /// adds external function declarations to module. /// @param numTypeInfos number of linear type info associated type info types @@ -1885,73 +1894,93 @@ int main(int argc, char *argv[]) { return(0); } + // If not set, exception handling will not be turned on + llvm::TargetOptions Opts; + llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - auto Context = std::make_unique(); - llvm::IRBuilder<> theBuilder(*Context); + llvm::LLVMContext Context; + llvm::IRBuilder<> theBuilder(Context); // Make the module, which holds all the code. std::unique_ptr Owner = - std::make_unique("my cool jit", *Context); + std::make_unique("my cool jit", Context); llvm::Module *module = Owner.get(); - // Build LLJIT - std::unique_ptr JIT = - ExitOnErr(llvm::orc::LLJITBuilder().create()); + std::unique_ptr MemMgr(new llvm::SectionMemoryManager()); - // Set up the optimizer pipeline. - llvm::legacy::FunctionPassManager fpm(module); + // Build engine with JIT + llvm::EngineBuilder factory(std::move(Owner)); + factory.setEngineKind(llvm::EngineKind::JIT); + factory.setTargetOptions(Opts); + factory.setMCJITMemoryManager(std::move(MemMgr)); + llvm::ExecutionEngine *executionEngine = factory.create(); - // Optimizations turned on + { + llvm::legacy::FunctionPassManager fpm(module); + + // Set up the optimizer pipeline. + // Start with registering info about how the + // target lays out data structures. + module->setDataLayout(executionEngine->getDataLayout()); + + // Optimizations turned on #ifdef ADD_OPT_PASSES - // Basic AliasAnslysis support for GVN. - fpm.add(llvm::createBasicAliasAnalysisPass()); + // Basic AliasAnslysis support for GVN. + fpm.add(llvm::createBasicAliasAnalysisPass()); - // Promote allocas to registers. - fpm.add(llvm::createPromoteMemoryToRegisterPass()); + // Promote allocas to registers. + fpm.add(llvm::createPromoteMemoryToRegisterPass()); - // Do simple "peephole" optimizations and bit-twiddling optzns. - fpm.add(llvm::createInstructionCombiningPass()); + // Do simple "peephole" optimizations and bit-twiddling optzns. 
+    fpm.add(llvm::createInstructionCombiningPass());

-  // Reassociate expressions.
-  fpm.add(llvm::createReassociatePass());
+    // Reassociate expressions.
+    fpm.add(llvm::createReassociatePass());

-  // Eliminate Common SubExpressions.
-  fpm.add(llvm::createGVNPass());
+    // Eliminate Common SubExpressions.
+    fpm.add(llvm::createGVNPass());

-  // Simplify the control flow graph (deleting unreachable
-  // blocks, etc).
-  fpm.add(llvm::createCFGSimplificationPass());
+    // Simplify the control flow graph (deleting unreachable
+    // blocks, etc).
+    fpm.add(llvm::createCFGSimplificationPass());
 #endif  // ADD_OPT_PASSES

-  fpm.doInitialization();
+    fpm.doInitialization();

-  // Generate test code using function throwCppException(...) as
-  // the function which throws foreign exceptions.
-  createUnwindExceptionTest(*module, theBuilder, fpm, "throwCppException");
+    // Generate test code using function throwCppException(...) as
+    // the function which throws foreign exceptions.
+    llvm::Function *toRun =
+      createUnwindExceptionTest(*module,
+                                theBuilder,
+                                fpm,
+                                "throwCppException");

-  ExitOnErr(JIT->addIRModule(
-      llvm::orc::ThreadSafeModule(std::move(Owner), std::move(Context))));
+    executionEngine->finalizeObject();

 #ifndef NDEBUG
-  fprintf(stderr, "\nBegin module dump:\n\n");
+    fprintf(stderr, "\nBegin module dump:\n\n");

-  module->print(llvm::errs(), nullptr);
+    module->dump();

-  fprintf(stderr, "\nEnd module dump:\n");
+    fprintf(stderr, "\nEnd module dump:\n");
 #endif

-  fprintf(stderr, "\n\nBegin Test:\n");
-  std::string toRun = "outerCatchFunct";
+    fprintf(stderr, "\n\nBegin Test:\n");
+
+    for (int i = 1; i < argc; ++i) {
+      // Run test for each argument whose value is the exception
+      // type to throw.
+      runExceptionThrow(executionEngine,
+                        toRun,
+                        (unsigned) strtoul(argv[i], NULL, 10));
+    }

-  for (int i = 1; i < argc; ++i) {
-    // Run test for each argument whose value is the exception
-    // type to throw.
-    runExceptionThrow(JIT.get(), toRun, (unsigned)strtoul(argv[i], NULL, 10));
+    fprintf(stderr, "\nEnd Test:\n\n");
   }

-  fprintf(stderr, "\nEnd Test:\n\n");
+  delete executionEngine;

   return 0;
 }

From 3bb39690d729d85cd93c9dd6e750d82d6f367541 Mon Sep 17 00:00:00 2001
From: Hans
Date: Wed, 15 May 2024 15:29:08 +0200
Subject: [PATCH 19/71] [coro] Lower `llvm.coro.await.suspend.handle` to
 resume with tail call (#89751)

The C++ standard requires that symmetric transfer from one coroutine to
another is performed via a tail call. Failure to do so is a miscompile
and often breaks programs by quickly overflowing the stack.

Until now, the coro split pass tried to ensure this in the
`addMustTailToCoroResumes()` function by searching for
`llvm.coro.resume` calls to lower as tail calls if the conditions were
right: the right function arguments, attributes, calling convention
etc., and if a `ret void` was sure to be reached after traversal with
some ad-hoc constant folding following the call.

This was brittle, as the kind of implicit invariants required for a
tail call to happen could easily be broken by other passes (e.g. if
some instruction got in between the `resume` and `ret`), see for
example 9d1cb18d19862fc0627e4a56e1e491a498e84c71 and
284da049f5feb62b40f5abc41dda7895e3d81d72.

Also the logic seemed backwards: instead of searching for possible tail
call candidates and doing them if the circumstances are right, it seems
better to start with the intention of making the tail calls we need,
and forcing the circumstances to be right.
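
As an illustration (a hypothetical, hand-written sketch, not taken from
the patch or from a test; the value names %next and %p are invented),
the old pass could only mark a resume as a tail call when it was
directly followed, modulo that folding, by a plain return:

  call void @llvm.coro.resume(ptr %next)
  ret void

and silently gave up as soon as anything else ended up in between:

  call void @llvm.coro.resume(ptr %next)
  %x = load i32, ptr %p   ; e.g. left behind by some other pass
  ret void
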
Now that we have the `llvm.coro.await.suspend.handle` intrinsic (since f78688134026686288a8d310b493d9327753a022) which corresponds exactly to symmetric transfer, change the lowering of that to also include the `resume` part, always lowered as a tail call. --- clang/lib/CodeGen/CGCoroutine.cpp | 11 +- clang/test/CodeGenCoroutines/coro-await.cpp | 4 +- .../coro-symmetric-transfer-01.cpp | 54 ---- .../coro-symmetric-transfer-02.cpp | 6 +- llvm/docs/Coroutines.rst | 19 +- llvm/include/llvm/IR/Intrinsics.td | 2 +- llvm/lib/Transforms/Coroutines/CoroInternal.h | 3 +- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 240 +++++------------- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 4 +- .../coro-await-suspend-lower-invoke.ll | 5 +- .../Coroutines/coro-await-suspend-lower.ll | 5 +- .../Coroutines/coro-preserve-final.ll | 131 ---------- ...-split-musttail-chain-pgo-counter-promo.ll | 9 +- .../Coroutines/coro-split-musttail.ll | 17 +- .../Coroutines/coro-split-musttail1.ll | 35 ++- .../Coroutines/coro-split-musttail10.ll | 6 +- .../Coroutines/coro-split-musttail2.ll | 15 +- .../Coroutines/coro-split-musttail3.ll | 36 ++- .../Coroutines/coro-split-musttail4.ll | 8 +- .../Coroutines/coro-split-musttail5.ll | 8 +- .../Coroutines/coro-split-musttail6.ll | 11 +- .../Coroutines/coro-split-musttail7.ll | 17 +- 22 files changed, 168 insertions(+), 478 deletions(-) delete mode 100644 clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp delete mode 100644 llvm/test/Transforms/Coroutines/coro-preserve-final.ll diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 567e85a02dc612..b4c724422c14aa 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -278,7 +278,11 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co llvm::Function *AwaitSuspendIntrinsic = CGF.CGM.getIntrinsic(AwaitSuspendIID); - const auto AwaitSuspendCanThrow = StmtCanThrow(S.getSuspendExpr()); + // SuspendHandle might throw since it also resumes the returned handle. + const bool AwaitSuspendCanThrow = + SuspendReturnType == + CoroutineSuspendExpr::SuspendReturnType::SuspendHandle || + StmtCanThrow(S.getSuspendExpr()); llvm::CallBase *SuspendRet = nullptr; // FIXME: add call attributes? @@ -307,10 +311,7 @@ static LValueOrRValue emitSuspendExpression(CodeGenFunction &CGF, CGCoroData &Co break; } case CoroutineSuspendExpr::SuspendReturnType::SuspendHandle: { - assert(SuspendRet->getType()->isPointerTy()); - - auto ResumeIntrinsic = CGF.CGM.getIntrinsic(llvm::Intrinsic::coro_resume); - Builder.CreateCall(ResumeIntrinsic, SuspendRet); + assert(SuspendRet->getType()->isVoidTy()); break; } } diff --git a/clang/test/CodeGenCoroutines/coro-await.cpp b/clang/test/CodeGenCoroutines/coro-await.cpp index 65bfb099468817..c7a09e8b8bc7c4 100644 --- a/clang/test/CodeGenCoroutines/coro-await.cpp +++ b/clang/test/CodeGenCoroutines/coro-await.cpp @@ -370,8 +370,8 @@ extern "C" void TestTailcall() { // --------------------------- // Call coro.await.suspend // --------------------------- - // CHECK-NEXT: %[[RESUMED:.+]] = call ptr @llvm.coro.await.suspend.handle(ptr %[[AWAITABLE]], ptr %[[FRAME]], ptr @TestTailcall.__await_suspend_wrapper__await) - // CHECK-NEXT: call void @llvm.coro.resume(ptr %[[RESUMED]]) + // Note: The call must not be nounwind since the resumed function could throw. 
+ // CHECK-NEXT: call void @llvm.coro.await.suspend.handle(ptr %[[AWAITABLE]], ptr %[[FRAME]], ptr @TestTailcall.__await_suspend_wrapper__await){{$}} // CHECK-NEXT: %[[OUTCOME:.+]] = call i8 @llvm.coro.suspend(token %[[SUSPEND_ID]], i1 false) // CHECK-NEXT: switch i8 %[[OUTCOME]], label %[[RET_BB:.+]] [ // CHECK-NEXT: i8 0, label %[[READY_BB]] diff --git a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp deleted file mode 100644 index da30e12c63cffb..00000000000000 --- a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-01.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O0 -emit-llvm %s -o - -disable-llvm-passes | FileCheck %s -// RUN: %clang -std=c++20 -O0 -emit-llvm -c %s -o %t -Xclang -disable-llvm-passes && %clang -c %t - -#include "Inputs/coroutine.h" - -struct detached_task { - struct promise_type { - detached_task get_return_object() noexcept { - return detached_task{std::coroutine_handle::from_promise(*this)}; - } - - void return_void() noexcept {} - - struct final_awaiter { - bool await_ready() noexcept { return false; } - std::coroutine_handle<> await_suspend(std::coroutine_handle h) noexcept { - h.destroy(); - return {}; - } - void await_resume() noexcept {} - }; - - void unhandled_exception() noexcept {} - - final_awaiter final_suspend() noexcept { return {}; } - - std::suspend_always initial_suspend() noexcept { return {}; } - }; - - ~detached_task() { - if (coro_) { - coro_.destroy(); - coro_ = {}; - } - } - - void start() && { - auto tmp = coro_; - coro_ = {}; - tmp.resume(); - } - - std::coroutine_handle coro_; -}; - -detached_task foo() { - co_return; -} - -// check that the lifetime of the coroutine handle used to obtain the address is contained within single basic block, and hence does not live across suspension points. 
-// CHECK-LABEL: final.suspend: -// CHECK: %{{.+}} = call token @llvm.coro.save(ptr null) -// CHECK: %[[HDL_TRANSFER:.+]] = call ptr @llvm.coro.await.suspend.handle -// CHECK: call void @llvm.coro.resume(ptr %[[HDL_TRANSFER]]) diff --git a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp index ca6cf74115a3b1..f36f89926505f3 100644 --- a/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp +++ b/clang/test/CodeGenCoroutines/coro-symmetric-transfer-02.cpp @@ -89,8 +89,7 @@ Task bar() { // CHECK: br i1 %{{.+}}, label %[[CASE1_AWAIT_READY:.+]], label %[[CASE1_AWAIT_SUSPEND:.+]] // CHECK: [[CASE1_AWAIT_SUSPEND]]: // CHECK-NEXT: %{{.+}} = call token @llvm.coro.save(ptr null) -// CHECK-NEXT: %[[HANDLE1_PTR:.+]] = call ptr @llvm.coro.await.suspend.handle -// CHECK-NEXT: call void @llvm.coro.resume(ptr %[[HANDLE1_PTR]]) +// CHECK-NEXT: call void @llvm.coro.await.suspend.handle // CHECK-NEXT: %{{.+}} = call i8 @llvm.coro.suspend // CHECK-NEXT: switch i8 %{{.+}}, label %coro.ret [ // CHECK-NEXT: i8 0, label %[[CASE1_AWAIT_READY]] @@ -104,8 +103,7 @@ Task bar() { // CHECK: br i1 %{{.+}}, label %[[CASE2_AWAIT_READY:.+]], label %[[CASE2_AWAIT_SUSPEND:.+]] // CHECK: [[CASE2_AWAIT_SUSPEND]]: // CHECK-NEXT: %{{.+}} = call token @llvm.coro.save(ptr null) -// CHECK-NEXT: %[[HANDLE2_PTR:.+]] = call ptr @llvm.coro.await.suspend.handle -// CHECK-NEXT: call void @llvm.coro.resume(ptr %[[HANDLE2_PTR]]) +// CHECK-NEXT: call void @llvm.coro.await.suspend.handle // CHECK-NEXT: %{{.+}} = call i8 @llvm.coro.suspend // CHECK-NEXT: switch i8 %{{.+}}, label %coro.ret [ // CHECK-NEXT: i8 0, label %[[CASE2_AWAIT_READY]] diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 83369d93c309a7..36092325e536fb 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -1922,7 +1922,7 @@ Example: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare ptr @llvm.coro.await.suspend.handle( + declare void @llvm.coro.await.suspend.handle( ptr , ptr , ptr ) @@ -1967,7 +1967,9 @@ The intrinsic must be used between corresponding `coro.save`_ and `await_suspend_function` call during `CoroSplit`_ pass. `await_suspend_function` must return a pointer to a valid -coroutine frame, which is immediately resumed +coroutine frame. The intrinsic will be lowered to a tail call resuming the +returned coroutine frame. It will be marked `musttail` on targets that support +that. Instructions following the intrinsic will become unreachable. Example: """""""" @@ -1977,11 +1979,10 @@ Example: ; before lowering await.suspend: %save = call token @llvm.coro.save(ptr %hdl) - %next = call ptr @llvm.coro.await.suspend.handle( - ptr %awaiter, - ptr %hdl, - ptr @await_suspend_function) - call void @llvm.coro.resume(%next) + call void @llvm.coro.await.suspend.handle( + ptr %awaiter, + ptr %hdl, + ptr @await_suspend_function) %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) ... @@ -1992,8 +1993,8 @@ Example: %next = call ptr @await_suspend_function( ptr %awaiter, ptr %hdl) - call void @llvm.coro.resume(%next) - %suspend = call i8 @llvm.coro.suspend(token %save, i1 false) + musttail call void @llvm.coro.resume(%next) + ret void ... 
; wrapper function example diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index f1c7d950f92755..78f0dbec863e96 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1717,7 +1717,7 @@ def int_coro_await_suspend_bool : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], [Throws]>; -def int_coro_await_suspend_handle : Intrinsic<[llvm_ptr_ty], +def int_coro_await_suspend_handle : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty, llvm_ptr_ty], [Throws]>; diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 84fd88806154e3..5716fd0ea4ab96 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -47,7 +47,7 @@ struct LowererBase { ConstantPointerNull *const NullPtr; LowererBase(Module &M); - Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); + CallInst *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; enum class ABI { @@ -85,6 +85,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape { SmallVector CoroSuspends; SmallVector SwiftErrorOps; SmallVector CoroAwaitSuspends; + SmallVector SymmetricTransfers; // Field indexes for special fields in the switch lowering. struct SwitchFieldIndex { diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 4eb6e75d09fa53..450ea823437157 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -113,21 +113,24 @@ class CoroCloner { /// ABIs. AnyCoroSuspendInst *ActiveSuspend = nullptr; + TargetTransformInfo &TTI; + public: /// Create a cloner for a switch lowering. CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, - Kind FKind) + Kind FKind, TargetTransformInfo &TTI) : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), FKind(FKind), - Builder(OrigF.getContext()) { + Builder(OrigF.getContext()), TTI(TTI) { assert(Shape.ABI == coro::ABI::Switch); } /// Create a cloner for a continuation lowering. CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, - Function *NewF, AnyCoroSuspendInst *ActiveSuspend) + Function *NewF, AnyCoroSuspendInst *ActiveSuspend, + TargetTransformInfo &TTI) : OrigF(OrigF), NewF(NewF), Suffix(Suffix), Shape(Shape), FKind(Shape.ABI == coro::ABI::Async ? Kind::Async : Kind::Continuation), - Builder(OrigF.getContext()), ActiveSuspend(ActiveSuspend) { + Builder(OrigF.getContext()), ActiveSuspend(ActiveSuspend), TTI(TTI) { assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce || Shape.ABI == coro::ABI::Async); assert(NewF && "need existing function for continuation"); @@ -171,7 +174,8 @@ class CoroCloner { // Lower the intrinisc in CoroEarly phase if coroutine frame doesn't escape // and it is known that other transformations, for example, sanitizers // won't lead to incorrect code. 
-static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB) { +static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB, + coro::Shape &Shape) { auto Wrapper = CB->getWrapperFunction(); auto Awaiter = CB->getAwaiter(); auto FramePtr = CB->getFrame(); @@ -206,6 +210,31 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB) { llvm_unreachable("Unexpected coro_await_suspend invocation method"); } + if (CB->getCalledFunction()->getIntrinsicID() == + Intrinsic::coro_await_suspend_handle) { + // Follow the lowered await_suspend call above with a lowered resume call + // to the returned coroutine. + if (auto *Invoke = dyn_cast(CB)) { + // If the await_suspend call is an invoke, we continue in the next block. + Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstInsertionPt()); + } + + coro::LowererBase LB(*Wrapper->getParent()); + auto *ResumeAddr = LB.makeSubFnCall(NewCall, CoroSubFnInst::ResumeIndex, + &*Builder.GetInsertPoint()); + + LLVMContext &Ctx = Builder.getContext(); + FunctionType *ResumeTy = FunctionType::get( + Type::getVoidTy(Ctx), PointerType::getUnqual(Ctx), false); + auto *ResumeCall = Builder.CreateCall(ResumeTy, ResumeAddr, {NewCall}); + + // We can't insert the 'ret' instruction and adjust the cc until the + // function has been split, so remember this for later. + Shape.SymmetricTransfers.push_back(ResumeCall); + + NewCall = ResumeCall; + } + CB->replaceAllUsesWith(NewCall); CB->eraseFromParent(); } @@ -213,7 +242,7 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB) { static void lowerAwaitSuspends(Function &F, coro::Shape &Shape) { IRBuilder<> Builder(F.getContext()); for (auto *AWS : Shape.CoroAwaitSuspends) - lowerAwaitSuspend(Builder, AWS); + lowerAwaitSuspend(Builder, AWS, Shape); } static void maybeFreeRetconStorage(IRBuilder<> &Builder, @@ -1056,6 +1085,25 @@ void CoroCloner::create() { // Set up the new entry block. replaceEntryBlock(); + // Turn symmetric transfers into musttail calls. + for (CallInst *ResumeCall : Shape.SymmetricTransfers) { + ResumeCall = cast(VMap[ResumeCall]); + ResumeCall->setCallingConv(NewF->getCallingConv()); + if (TTI.supportsTailCallFor(ResumeCall)) { + // FIXME: Could we support symmetric transfer effectively without + // musttail? + ResumeCall->setTailCallKind(CallInst::TCK_MustTail); + } + + // Put a 'ret void' after the call, and split any remaining instructions to + // an unreachable block. + BasicBlock *BB = ResumeCall->getParent(); + BB->splitBasicBlock(ResumeCall->getNextNode()); + Builder.SetInsertPoint(BB->getTerminator()); + Builder.CreateRetVoid(); + BB->getTerminator()->eraseFromParent(); + } + Builder.SetInsertPoint(&NewF->getEntryBlock().front()); NewFramePtr = deriveNewFramePointer(); @@ -1186,130 +1234,6 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, } } -// Replace a sequence of branches leading to a ret, with a clone of a ret -// instruction. Suspend instruction represented by a switch, track the PHI -// values and select the correct case successor when possible. -static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { - // There is nothing to simplify. 
- if (isa(InitialInst)) - return false; - - DenseMap ResolvedValues; - assert(InitialInst->getModule()); - const DataLayout &DL = InitialInst->getModule()->getDataLayout(); - - auto TryResolveConstant = [&ResolvedValues](Value *V) { - auto It = ResolvedValues.find(V); - if (It != ResolvedValues.end()) - V = It->second; - return dyn_cast(V); - }; - - Instruction *I = InitialInst; - while (true) { - if (isa(I)) { - assert(!cast(I)->getReturnValue()); - ReplaceInstWithInst(InitialInst, I->clone()); - return true; - } - - if (auto *BR = dyn_cast(I)) { - unsigned SuccIndex = 0; - if (BR->isConditional()) { - // Handle the case the condition of the conditional branch is constant. - // e.g., - // - // br i1 false, label %cleanup, label %CoroEnd - // - // It is possible during the transformation. We could continue the - // simplifying in this case. - ConstantInt *Cond = TryResolveConstant(BR->getCondition()); - if (!Cond) - return false; - - SuccIndex = Cond->isOne() ? 0 : 1; - } - - BasicBlock *Succ = BR->getSuccessor(SuccIndex); - scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues); - I = Succ->getFirstNonPHIOrDbgOrLifetime(); - continue; - } - - if (auto *Cmp = dyn_cast(I)) { - // If the case number of suspended switch instruction is reduced to - // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator. - // Try to constant fold it. - ConstantInt *Cond0 = TryResolveConstant(Cmp->getOperand(0)); - ConstantInt *Cond1 = TryResolveConstant(Cmp->getOperand(1)); - if (Cond0 && Cond1) { - ConstantInt *Result = - dyn_cast_or_null(ConstantFoldCompareInstOperands( - Cmp->getPredicate(), Cond0, Cond1, DL)); - if (Result) { - ResolvedValues[Cmp] = Result; - I = I->getNextNode(); - continue; - } - } - } - - if (auto *SI = dyn_cast(I)) { - ConstantInt *Cond = TryResolveConstant(SI->getCondition()); - if (!Cond) - return false; - - BasicBlock *Succ = SI->findCaseValue(Cond)->getCaseSuccessor(); - scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues); - I = Succ->getFirstNonPHIOrDbgOrLifetime(); - continue; - } - - if (I->isDebugOrPseudoInst() || I->isLifetimeStartOrEnd() || - wouldInstructionBeTriviallyDead(I)) { - // We can skip instructions without side effects. If their values are - // needed, we'll notice later, e.g. when hitting a conditional branch. - I = I->getNextNode(); - continue; - } - - break; - } - - return false; -} - -// Check whether CI obeys the rules of musttail attribute. -static bool shouldBeMustTail(const CallInst &CI, const Function &F) { - if (CI.isInlineAsm()) - return false; - - // Match prototypes and calling conventions of resume function. - FunctionType *CalleeTy = CI.getFunctionType(); - if (!CalleeTy->getReturnType()->isVoidTy() || (CalleeTy->getNumParams() != 1)) - return false; - - Type *CalleeParmTy = CalleeTy->getParamType(0); - if (!CalleeParmTy->isPointerTy() || - (CalleeParmTy->getPointerAddressSpace() != 0)) - return false; - - if (CI.getCallingConv() != F.getCallingConv()) - return false; - - // CI should not has any ABI-impacting function attributes. - static const Attribute::AttrKind ABIAttrs[] = { - Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, - Attribute::Preallocated, Attribute::InReg, Attribute::Returned, - Attribute::SwiftSelf, Attribute::SwiftError}; - AttributeList Attrs = CI.getAttributes(); - for (auto AK : ABIAttrs) - if (Attrs.hasParamAttr(0, AK)) - return false; - - return true; -} - // Coroutine has no suspend points. Remove heap allocation for the coroutine // frame if possible. 
static void handleNoSuspendCoroutine(coro::Shape &Shape) { @@ -1523,24 +1447,16 @@ struct SwitchCoroutineSplitter { createResumeEntryBlock(F, Shape); auto *ResumeClone = - createClone(F, ".resume", Shape, CoroCloner::Kind::SwitchResume); + createClone(F, ".resume", Shape, CoroCloner::Kind::SwitchResume, TTI); auto *DestroyClone = - createClone(F, ".destroy", Shape, CoroCloner::Kind::SwitchUnwind); + createClone(F, ".destroy", Shape, CoroCloner::Kind::SwitchUnwind, TTI); auto *CleanupClone = - createClone(F, ".cleanup", Shape, CoroCloner::Kind::SwitchCleanup); + createClone(F, ".cleanup", Shape, CoroCloner::Kind::SwitchCleanup, TTI); postSplitCleanup(*ResumeClone); postSplitCleanup(*DestroyClone); postSplitCleanup(*CleanupClone); - // Adding musttail call to support symmetric transfer. - // Skip targets which don't support tail call. - // - // FIXME: Could we support symmetric transfer effectively without musttail - // call? - if (TTI.supportsTailCalls()) - addMustTailToCoroResumes(*ResumeClone, TTI); - // Store addresses resume/destroy/cleanup functions in the coroutine frame. updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); @@ -1560,8 +1476,9 @@ struct SwitchCoroutineSplitter { // new entry block and replacing coro.suspend an appropriate value to force // resume or cleanup pass for every suspend point. static Function *createClone(Function &F, const Twine &Suffix, - coro::Shape &Shape, CoroCloner::Kind FKind) { - CoroCloner Cloner(F, Suffix, Shape, FKind); + coro::Shape &Shape, CoroCloner::Kind FKind, + TargetTransformInfo &TTI) { + CoroCloner Cloner(F, Suffix, Shape, FKind, TTI); Cloner.create(); return Cloner.getFunction(); } @@ -1662,34 +1579,6 @@ struct SwitchCoroutineSplitter { Shape.SwitchLowering.ResumeEntryBlock = NewEntry; } - // Add musttail to any resume instructions that is immediately followed by a - // suspend (i.e. ret). We do this even in -O0 to support guaranteed tail call - // for symmetrical coroutine control transfer (C++ Coroutines TS extension). - // This transformation is done only in the resume part of the coroutine that - // has identical signature and calling convention as the coro.resume call. - static void addMustTailToCoroResumes(Function &F, TargetTransformInfo &TTI) { - bool Changed = false; - - // Collect potential resume instructions. - SmallVector Resumes; - for (auto &I : instructions(F)) - if (auto *Call = dyn_cast(&I)) - if (shouldBeMustTail(*Call, F)) - Resumes.push_back(Call); - - // Set musttail on those that are followed by a ret instruction. - for (CallInst *Call : Resumes) - // Skip targets which don't support tail call on the specific case. - if (TTI.supportsTailCallFor(Call) && - simplifyTerminatorLeadingToRet(Call->getNextNode())) { - Call->setTailCallKind(CallInst::TCK_MustTail); - Changed = true; - } - - if (Changed) - removeUnreachableBlocks(F); - } - // Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame. static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn, Function *DestroyFn, Function *CleanupFn) { @@ -1893,12 +1782,13 @@ static void splitAsyncCoroutine(Function &F, coro::Shape &Shape, auto *Suspend = Shape.CoroSuspends[Idx]; auto *Clone = Clones[Idx]; - CoroCloner(F, "resume." + Twine(Idx), Shape, Clone, Suspend).create(); + CoroCloner(F, "resume." 
+ Twine(Idx), Shape, Clone, Suspend, TTI).create(); } } static void splitRetconCoroutine(Function &F, coro::Shape &Shape, - SmallVectorImpl &Clones) { + SmallVectorImpl &Clones, + TargetTransformInfo &TTI) { assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce); assert(Clones.empty()); @@ -2021,7 +1911,7 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape, auto Suspend = Shape.CoroSuspends[i]; auto Clone = Clones[i]; - CoroCloner(F, "resume." + Twine(i), Shape, Clone, Suspend).create(); + CoroCloner(F, "resume." + Twine(i), Shape, Clone, Suspend, TTI).create(); } } @@ -2073,7 +1963,7 @@ splitCoroutine(Function &F, SmallVectorImpl &Clones, break; case coro::ABI::Retcon: case coro::ABI::RetconOnce: - splitRetconCoroutine(F, Shape, Clones); + splitRetconCoroutine(F, Shape, Clones, TTI); break; } } diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index a1c78d6a44ef46..1a92bc1636257b 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -47,8 +47,8 @@ coro::LowererBase::LowererBase(Module &M) // // call ptr @llvm.coro.subfn.addr(ptr %Arg, i8 %index) -Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, - Instruction *InsertPt) { +CallInst *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, + Instruction *InsertPt) { auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index); auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr); diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll index fbc4a2c006f84e..fd3b7bd815300c 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll @@ -58,14 +58,13 @@ suspend.cond: ; CHECK-NEXT: to label %[[STEP2_CONT:[^ ]+]] unwind label %[[PAD]] step2: %save2 = call token @llvm.coro.save(ptr null) - %resume.handle = invoke ptr @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) + invoke void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) to label %step2.continue unwind label %pad ; CHECK: [[STEP2_CONT]]: ; CHECK-NEXT: %[[NEXT_RESUME:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[NEXT_HDL]], i8 0) ; CHECK-NEXT: musttail call {{.*}} void %[[NEXT_RESUME]](ptr %[[NEXT_HDL]]) step2.continue: - call void @llvm.coro.resume(ptr %resume.handle) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %ret [ i8 0, label %step3 @@ -112,7 +111,7 @@ declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.void(ptr, ptr, ptr) declare i1 @llvm.coro.await.suspend.bool(ptr, ptr, ptr) -declare ptr @llvm.coro.await.suspend.handle(ptr, ptr, ptr) +declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) declare i1 @llvm.coro.end(ptr, i1, token) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll index 0f574c4acc26e7..8d019e6954628b 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll @@ -49,8 +49,7 @@ suspend.cond: ; CHECK-NEXT: musttail call {{.*}} void %[[CONT]](ptr %[[NEXT_HDL]]) step2: %save2 = call token @llvm.coro.save(ptr 
null) - %resume.handle = call ptr @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) - call void @llvm.coro.resume(ptr %resume.handle) + call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %ret [ i8 0, label %step3 @@ -89,7 +88,7 @@ declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.void(ptr, ptr, ptr) declare i1 @llvm.coro.await.suspend.bool(ptr, ptr, ptr) -declare ptr @llvm.coro.await.suspend.handle(ptr, ptr, ptr) +declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) declare i1 @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-preserve-final.ll b/llvm/test/Transforms/Coroutines/coro-preserve-final.ll deleted file mode 100644 index 16eeb84e7915ae..00000000000000 --- a/llvm/test/Transforms/Coroutines/coro-preserve-final.ll +++ /dev/null @@ -1,131 +0,0 @@ -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s - -%"struct.std::__n4861::noop_coroutine_promise" = type { i8 } -%struct.Promise = type { %"struct.std::__n4861::coroutine_handle" } -%"struct.std::__n4861::coroutine_handle" = type { ptr } - -define dso_local ptr @_Z5Outerv() #1 { -entry: - %__promise = alloca %struct.Promise, align 8 - %0 = call token @llvm.coro.id(i32 16, ptr nonnull %__promise, ptr nonnull @_Z5Outerv, ptr null) - %1 = call i1 @llvm.coro.alloc(token %0) - br i1 %1, label %coro.alloc, label %init.suspend - -coro.alloc: ; preds = %entry - %2 = tail call i64 @llvm.coro.size.i64() - %call = call noalias noundef nonnull ptr @_Znwm(i64 noundef %2) #12 - br label %init.suspend - -init.suspend: ; preds = %entry, %coro.alloc - %3 = phi ptr [ null, %entry ], [ %call, %coro.alloc ] - %4 = call ptr @llvm.coro.begin(token %0, ptr %3) #13 - call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %__promise) #3 - store ptr null, ptr %__promise, align 8 - %5 = call token @llvm.coro.save(ptr null) - %6 = call i8 @llvm.coro.suspend(token %5, i1 false) - switch i8 %6, label %coro.ret [ - i8 0, label %await.suspend - i8 1, label %cleanup62 - ] - -await.suspend: ; preds = %init.suspend - %7 = call token @llvm.coro.save(ptr null) - %8 = call ptr @llvm.coro.subfn.addr(ptr %4, i8 0) - call fastcc void %8(ptr %4) #3 - %9 = call i8 @llvm.coro.suspend(token %7, i1 false) - switch i8 %9, label %coro.ret [ - i8 0, label %await2.suspend - i8 1, label %cleanup62 - ] - -await2.suspend: ; preds = %await.suspend - %call27 = call ptr @_Z5Innerv() #3 - %10 = call token @llvm.coro.save(ptr null) - %11 = getelementptr inbounds i8, ptr %__promise, i64 -16 - store ptr %11, ptr %call27, align 8 - %12 = getelementptr inbounds i8, ptr %call27, i64 -16 - %13 = call ptr @llvm.coro.subfn.addr(ptr nonnull %12, i8 0) - call fastcc void %13(ptr nonnull %12) #3 - %14 = call i8 @llvm.coro.suspend(token %10, i1 false) - switch i8 %14, label %coro.ret [ - i8 0, label %final.suspend - i8 1, label %cleanup62 - ] - -final.suspend: ; preds = %await2.suspend - %15 = call ptr @llvm.coro.subfn.addr(ptr nonnull %12, i8 1) - call fastcc void %15(ptr nonnull %12) #3 - %16 = call token @llvm.coro.save(ptr null) - %retval.sroa.0.0.copyload.i = load ptr, ptr %__promise, align 8 - %17 = call ptr @llvm.coro.subfn.addr(ptr %retval.sroa.0.0.copyload.i, i8 0) - call fastcc void %17(ptr %retval.sroa.0.0.copyload.i) #3 - %18 = call i8 
@llvm.coro.suspend(token %16, i1 true) #13 - switch i8 %18, label %coro.ret [ - i8 0, label %final.ready - i8 1, label %cleanup62 - ] - -final.ready: ; preds = %final.suspend - call void @_Z5_exiti(i32 noundef 1) #14 - unreachable - -cleanup62: ; preds = %await2.suspend, %await.suspend, %init.suspend, %final.suspend - call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %__promise) #3 - %19 = call ptr @llvm.coro.free(token %0, ptr %4) - %.not = icmp eq ptr %19, null - br i1 %.not, label %coro.ret, label %coro.free - -coro.free: ; preds = %cleanup62 - call void @_ZdlPv(ptr noundef nonnull %19) #3 - br label %coro.ret - -coro.ret: ; preds = %coro.free, %cleanup62, %final.suspend, %await2.suspend, %await.suspend, %init.suspend - %20 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #13 - ret ptr %__promise -} - -declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #2 -declare i1 @llvm.coro.alloc(token) #3 -declare dso_local noundef nonnull ptr @_Znwm(i64 noundef) local_unnamed_addr #4 -declare i64 @llvm.coro.size.i64() #5 -declare ptr @llvm.coro.begin(token, ptr writeonly) #3 -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #6 -declare token @llvm.coro.save(ptr) #7 -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #6 -declare i8 @llvm.coro.suspend(token, i1) #3 -declare dso_local ptr @_Z5Innerv() local_unnamed_addr #8 -declare dso_local void @_ZdlPv(ptr noundef) local_unnamed_addr #9 -declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 -declare dso_local void @_Z5_exiti(i32 noundef) local_unnamed_addr #10 -declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #11 - -attributes #0 = { mustprogress nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #1 = { nounwind presplitcoroutine uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #2 = { argmemonly nofree nounwind readonly } -attributes #3 = { nounwind } -attributes #4 = { nobuiltin allocsize(0) "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #5 = { nofree nosync nounwind readnone } -attributes #6 = { argmemonly mustprogress nocallback nofree nosync nounwind willreturn } -attributes #7 = { nomerge nounwind } -attributes #8 = { "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #9 = { nobuiltin nounwind "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #10 = { noreturn "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } -attributes #11 = { argmemonly nounwind readonly } -attributes #12 = { nounwind allocsize(0) } -attributes #13 = { noduplicate } -attributes #14 = { noreturn nounwind } - -; CHECK: 
define{{.*}}@_Z5Outerv.resume( -; CHECK: entry.resume: -; CHECK: switch i2 %index -; CHECK-NEXT: i2 0, label %await2.suspend -; CHECK-NEXT: i2 1, label %final.suspend -; -; CHECK: await2.suspend: -; CHECK: musttail call -; CHECK-NEXT: ret void -; -; CHECK: final.suspend: -; CHECK: musttail call -; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll index ddd293eed2409e..e2ed205f2c2f4f 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll @@ -90,10 +90,7 @@ define ptr @f(i32 %0) presplitcoroutine align 32 { %25 = getelementptr inbounds { ptr, ptr }, ptr %5, i64 0, i32 1 store ptr %24, ptr %25, align 8 %26 = call token @llvm.coro.save(ptr null) - %27 = call ptr @await_transform_await_suspend(ptr noundef nonnull align 8 dereferenceable(16) %5, ptr %14) - %28 = call ptr @llvm.coro.subfn.addr(ptr %27, i8 0) - %29 = ptrtoint ptr %28 to i64 - call fastcc void %28(ptr %27) #9 + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_transform_await_suspend) %30 = call i8 @llvm.coro.suspend(token %26, i1 false) switch i8 %30, label %60 [ i8 0, label %31 @@ -123,9 +120,7 @@ define ptr @f(i32 %0) presplitcoroutine align 32 { br i1 %42, label %43, label %46 43: ; preds = %36 - %44 = call ptr @llvm.coro.subfn.addr(ptr nonnull %14, i8 1) - %45 = ptrtoint ptr %44 to i64 - call fastcc void %44(ptr nonnull %14) #9 + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_transform_await_suspend) br label %47 46: ; preds = %36 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll index 825e44471db27a..70f29f4a9a4dc4 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll @@ -1,7 +1,6 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s -; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. +; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s +; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s define void @f() #0 { entry: @@ -20,8 +19,7 @@ entry: ] await.ready: %save2 = call token @llvm.coro.save(ptr null) - %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr2(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ @@ -40,10 +38,8 @@ exit: ; Verify that in the resume part resume call is marked with musttail. 
; CHECK-LABEL: @f.resume( -; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr null) -; PGO: call void @llvm.instrprof -; PGO-NEXT: musttail call fastcc void %[[addr2]](ptr null) +; CHECK: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr2]] ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -57,6 +53,7 @@ declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll index d0d11fc4495e48..3edb8728d8550b 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll @@ -1,7 +1,6 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s -; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. +; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s +; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s define void @f() #0 { entry: @@ -28,17 +27,14 @@ await.suspend: ] await.resume1: %hdl = call ptr @g() - %addr2 = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 0) - call fastcc void %addr2(ptr %hdl) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl, ptr @await_suspend_function) br label %final.suspend await.resume2: %hdl2 = call ptr @h() - %addr3 = call ptr @llvm.coro.subfn.addr(ptr %hdl2, i8 0) - call fastcc void %addr3(ptr %hdl2) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl2, ptr @await_suspend_function) br label %final.suspend await.resume3: - %addr4 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr4(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) br label %final.suspend final.suspend: %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) @@ -63,18 +59,18 @@ unreach: ; Verify that in the resume part resume call is marked with musttail. 
; CHECK-LABEL: @f.resume( ; CHECK: %[[hdl:.+]] = call ptr @g() -; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) -; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr2]] ; CHECK-NEXT: ret void ; CHECK: %[[hdl2:.+]] = call ptr @h() -; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) -; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr3]] ; CHECK-NEXT: ret void -; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null) -; PGO: musttail call fastcc void %[[addr4]](ptr null) +; CHECK: call ptr @await_suspend_function +; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr4]] ; CHECK-NEXT: ret void @@ -93,6 +89,7 @@ declare ptr @malloc(i64) declare i8 @switch_result() declare ptr @g() declare ptr @h() +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll index 3e91b79c10f736..a55b3d16e2ded3 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll @@ -1,4 +1,4 @@ -; Tests that we would convert coro.resume to a musttail call if the target is +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call if the target is ; Wasm64 or Wasm32 with tail-call support. ; REQUIRES: webassembly-registered-target @@ -25,8 +25,7 @@ entry: ] await.ready: %save2 = call token @llvm.coro.save(ptr null) - %addr2 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr2(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ @@ -51,6 +50,7 @@ declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine "target-features"="+tail-call" } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll index 2f27f79480ab1b..ca1611e19b9f94 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll @@ -1,5 +1,4 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. 
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s @@ -8,11 +7,6 @@ entry: ret void; } -define void @fakeresume2(ptr align 8) { -entry: - ret void; -} - define void @g() #0 { entry: %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) @@ -29,7 +23,7 @@ entry: ] await.ready: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume2(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ @@ -47,7 +41,9 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume2(ptr align 8 null) +; CHECK: call ptr @await_suspend_function +; CHECK-NEXT: call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -61,6 +57,7 @@ declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll index 4778e3dcaf9957..84cdac17beebb1 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll @@ -1,7 +1,6 @@ -; Tests that coro-split will convert coro.resume followed by a suspend to a -; musttail call. -; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,NOPGO %s -; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK,PGO %s +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. +; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s +; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck --check-prefixes=CHECK %s define void @f() #0 { entry: @@ -26,17 +25,14 @@ await.suspend: ] await.resume1: %hdl = call ptr @g() - %addr2 = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 0) - call fastcc void %addr2(ptr %hdl) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl, ptr @await_suspend_function) br label %final.suspend await.resume2: %hdl2 = call ptr @h() - %addr3 = call ptr @llvm.coro.subfn.addr(ptr %hdl2, i8 0) - call fastcc void %addr3(ptr %hdl2) + call void @llvm.coro.await.suspend.handle(ptr null, ptr %hdl2, ptr @await_suspend_function) br label %final.suspend await.resume3: - %addr4 = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) - call fastcc void %addr4(ptr null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) br label %final.suspend final.suspend: %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) @@ -59,22 +55,21 @@ unreach: ; Verify that in the resume part resume call is marked with musttail. 
; CHECK-LABEL: @f.resume( ; CHECK: %[[hdl:.+]] = call ptr @g() -; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) -; PGO: musttail call fastcc void %[[addr2]](ptr %[[hdl]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr2:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr2]] ; CHECK-NEXT: ret void ; CHECK: %[[hdl2:.+]] = call ptr @h() -; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[hdl2]], i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) -; PGO: musttail call fastcc void %[[addr3]](ptr %[[hdl2]]) +; CHECK-NEXT: call ptr @await_suspend_function +; CHECK-NEXT: %[[addr3:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr3]] ; CHECK-NEXT: ret void -; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr(ptr null, i8 0) -; NOPGO-NEXT: musttail call fastcc void %[[addr4]](ptr null) -; PGO: musttail call fastcc void %[[addr4]](ptr null) +; CHECK: call ptr @await_suspend_function +; CHECK: %[[addr4:.+]] = call ptr @llvm.coro.subfn.addr +; CHECK-NEXT: musttail call fastcc void %[[addr4]] ; CHECK-NEXT: ret void - declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 declare i1 @llvm.coro.alloc(token) #2 declare i64 @llvm.coro.size.i64() #3 @@ -89,6 +84,7 @@ declare ptr @malloc(i64) declare i8 @switch_result() declare ptr @g() declare ptr @h() +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll index 00ee422ce5863d..b647bd2e4a207f 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll @@ -1,5 +1,4 @@ -; Tests that coro-split will convert a call before coro.suspend to a musttail call -; while the user of the coro.suspend is a icmpinst. +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. 
; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s @@ -24,7 +23,7 @@ entry: await.ready: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend = call i8 @llvm.coro.suspend(token %save2, i1 true) %switch = icmp ult i8 %suspend, 2 br i1 %switch, label %cleanup, label %coro.end @@ -44,7 +43,7 @@ coro.end: } ; CHECK-LABEL: @f.resume( -; CHECK: musttail call fastcc void @fakeresume1( +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -59,6 +58,7 @@ declare i1 @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll index 9afc79abbe88cd..7c1a13fd83cec5 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll @@ -1,5 +1,4 @@ -; Tests that sinked lifetime markers wouldn't provent optimization -; to convert a resuming call to a musttail call. +; Tests that coro-split will convert coro.await.suspend.handle to a musttail call. ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s @@ -22,7 +21,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -39,7 +38,7 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -56,6 +55,7 @@ declare ptr @malloc(i64) declare void @consume(ptr) declare void @llvm.lifetime.start.p0(i64, ptr nocapture) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll index d9dba92ec4eb7e..e05169a7291680 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll @@ -1,5 +1,5 @@ ; Tests that sinked lifetime markers wouldn't provent optimization -; to convert a resuming call to a musttail call. +; to convert a coro.await.suspend.handle call to a musttail call. ; The difference between this and coro-split-musttail5.ll is that there is ; an extra bitcast instruction in the path, which makes it harder to ; optimize. 
@@ -25,7 +25,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -42,7 +42,7 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void ; It has a cleanup bb. @@ -63,7 +63,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) switch i8 %suspend2, label %exit [ i8 0, label %await.ready @@ -90,7 +90,7 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @f.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: musttail call fastcc void ; CHECK-NEXT: ret void declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1 @@ -108,6 +108,7 @@ declare void @delete(ptr nonnull) #2 declare void @consume(ptr) declare void @llvm.lifetime.start.p0(i64, ptr nocapture) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) +declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) attributes #0 = { presplitcoroutine } attributes #1 = { argmemonly nounwind readonly } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll index d0d5005587bda6..8ceb0dda94f6ac 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll @@ -1,13 +1,11 @@ ; Tests that sinked lifetime markers wouldn't provent optimization -; to convert a resuming call to a musttail call. +; to convert a coro.await.suspend.handle call to a musttail call. ; The difference between this and coro-split-musttail5.ll and coro-split-musttail6.ll ; is that this contains dead instruction generated during the transformation, ; which makes the optimization harder. ; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s ; RUN: opt < %s -passes='pgo-instr-gen,cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s -declare void @fakeresume1(ptr align 8) - define i64 @g() #0 { entry: %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null) @@ -25,7 +23,7 @@ entry: ] await.suspend: %save2 = call token @llvm.coro.save(ptr null) - call fastcc void @fakeresume1(ptr align 8 null) + call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function) %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false) ; These (non-trivially) dead instructions are in the way. @@ -48,7 +46,9 @@ exit: ; Verify that in the resume part resume call is marked with musttail. ; CHECK-LABEL: @g.resume( -; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null) +; CHECK: %[[FRAME:[0-9]+]] = call ptr @await_suspend_function(ptr null, ptr null) +; CHECK: %[[RESUMEADDR:[0-9]+]] = call ptr @llvm.coro.subfn.addr(ptr %[[FRAME]], i8 0) +; CHECK: musttail call fastcc void %[[RESUMEADDR]](ptr %[[FRAME]]) ; CHECK-NEXT: ret void ; It has a cleanup bb. 
@@ -69,7 +69,7 @@ entry:
   ]
 await.suspend:
   %save2 = call token @llvm.coro.save(ptr null)
-  call fastcc void @fakeresume1(ptr align 8 null)
+  call void @llvm.coro.await.suspend.handle(ptr null, ptr null, ptr @await_suspend_function)
   %suspend2 = call i8 @llvm.coro.suspend(token %save2, i1 false)
   switch i8 %suspend2, label %exit [
     i8 0, label %await.ready
@@ -96,7 +96,9 @@ exit:

 ; Verify that in the resume part resume call is marked with musttail.
 ; CHECK-LABEL: @f.resume(
-; CHECK: musttail call fastcc void @fakeresume1(ptr align 8 null)
+; CHECK: %[[FRAME:[0-9]+]] = call ptr @await_suspend_function(ptr null, ptr null)
+; CHECK: %[[RESUMEADDR:[0-9]+]] = call ptr @llvm.coro.subfn.addr(ptr %[[FRAME]], i8 0)
+; CHECK: musttail call fastcc void %[[RESUMEADDR]](ptr %[[FRAME]])
 ; CHECK-NEXT: ret void

 declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) #1
@@ -114,6 +116,7 @@ declare void @delete(ptr nonnull) #2
 declare void @consume(ptr)
 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
+declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl)

 attributes #0 = { presplitcoroutine }
 attributes #1 = { argmemonly nounwind readonly }

From 95e307caeb17c080724921564d96e1b8457264bc Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Wed, 15 May 2024 08:07:17 -0500
Subject: [PATCH 20/71] Fix bolt build with -DBUILD_SHARED_LIBS=ON after 71fbbb69d

Commit 71fbbb69d63c461f391cbabf1e32cd9977c4ce68 moved getGUID out of
line in llvm/IR/GlobalValue; users now have to link LLVMCore to get its
definition.

/usr/bin/ld: CMakeFiles/LLVMBOLTRewrite.dir/PseudoProbeRewriter.cpp.o: in function `(anonymous namespace)::PseudoProbeRewriter::parsePseudoProbe()':
PseudoProbeRewriter.cpp:(.text._ZN12_GLOBAL__N_119PseudoProbeRewriter16parsePseudoProbeEv+0x3d0): undefined reference to `llvm::GlobalValue::getGUID(llvm::StringRef)'
/usr/bin/ld: CMakeFiles/LLVMBOLTRewrite.dir/PseudoProbeRewriter.cpp.o: in function `(anonymous namespace)::PseudoProbeRewriter::encodePseudoProbes()':
PseudoProbeRewriter.cpp:(.text._ZN12_GLOBAL__N_119PseudoProbeRewriter18encodePseudoProbesEv+0x11a1): undefined reference to `llvm::GlobalValue::getGUID(llvm::StringRef)'
collect2: error: ld returned 1 exit status
make[2]: *** [tools/bolt/lib/Rewrite/CMakeFiles/LLVMBOLTRewrite.dir/build.make:275: lib/libLLVMBOLTRewrite.so.19.0git] Error 1
---
 bolt/lib/Rewrite/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index 6890f52e2b28bb..578f1763bfe4ec 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  Core
   DebugInfoDWARF
   DWP
   JITLink

From c2fba6df944e11e2c9a7073405c6a817fdba14e3 Mon Sep 17 00:00:00 2001
From: Koakuma
Date: Wed, 15 May 2024 20:49:28 +0700
Subject: [PATCH 21/71] [clang][SPARC] Treat empty structs as if it's a one-bit type in the CC (#90338)

Make sure that empty structs are treated as if they had a size of one
bit in function parameters and return types, so that they occupy a full
argument and/or return register slot.

This fixes crashes and miscompilations when passing and/or returning
empty structs.
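
To illustrate the kind of code affected, here is a hypothetical example
in the spirit of the new tests below (the function name is made up; the
comment states the intended behavior rather than a claim about any one
specific failure mode):

    class Empty {};

    // Per the SPARC V9 calling convention, `e` must occupy a full
    // argument register slot even though it carries no data, so that
    // `x` is passed in the following slot. Before this change, an
    // empty struct contributed zero bits to the coerced type.
    long take(Empty e, long x) { return x; }
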
Reviewed by: @s-barannikov --- clang/lib/CodeGen/Targets/Sparc.cpp | 5 ++- clang/test/CodeGen/sparcv9-abi.c | 41 +++++++++++++++++++++ clang/test/CodeGen/sparcv9-class-return.cpp | 24 ++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/sparcv9-class-return.cpp diff --git a/clang/lib/CodeGen/Targets/Sparc.cpp b/clang/lib/CodeGen/Targets/Sparc.cpp index 9025a633f328e2..b82e9a69e19671 100644 --- a/clang/lib/CodeGen/Targets/Sparc.cpp +++ b/clang/lib/CodeGen/Targets/Sparc.cpp @@ -263,7 +263,10 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const { CoerceBuilder CB(getVMContext(), getDataLayout()); CB.addStruct(0, StrTy); - CB.pad(llvm::alignTo(CB.DL.getTypeSizeInBits(StrTy), 64)); + // All structs, even empty ones, should take up a register argument slot, + // so pin the minimum struct size to one bit. + CB.pad(llvm::alignTo( + std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), 1UL), 64)); // Try to use the original type for coercion. llvm::Type *CoerceTy = CB.isUsableType(StrTy) ? StrTy : CB.getType(); diff --git a/clang/test/CodeGen/sparcv9-abi.c b/clang/test/CodeGen/sparcv9-abi.c index 5e74a9a883cefa..616e24e7c519d8 100644 --- a/clang/test/CodeGen/sparcv9-abi.c +++ b/clang/test/CodeGen/sparcv9-abi.c @@ -21,6 +21,47 @@ char f_int_4(char x) { return x; } // CHECK-LABEL: define{{.*}} fp128 @f_ld(fp128 noundef %x) long double f_ld(long double x) { return x; } +// Zero-sized structs reserves an argument register slot if passed directly. +struct empty {}; +struct emptyarr { struct empty a[10]; }; + +// CHECK-LABEL: define{{.*}} i64 @f_empty(i64 %x.coerce) +struct empty f_empty(struct empty x) { return x; } + +// CHECK-LABEL: define{{.*}} i64 @f_emptyarr(i64 %x.coerce) +struct empty f_emptyarr(struct emptyarr x) { return x.a[0]; } + +// CHECK-LABEL: define{{.*}} i64 @f_emptyvar(i32 noundef zeroext %count, ...) +long f_emptyvar(unsigned count, ...) { + long ret; + va_list args; + va_start(args, count); + +// CHECK: %[[CUR:[^ ]+]] = load ptr, ptr %args +// CHECK-DAG: %[[NXT:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i64 8 +// CHECK-DAG: store ptr %[[NXT]], ptr %args + va_arg(args, struct empty); + +// CHECK: %[[CUR:[^ ]+]] = load ptr, ptr %args +// CHECK-DAG: %[[NXT:[^ ]+]] = getelementptr inbounds i8, ptr %[[CUR]], i64 8 +// CHECK-DAG: store ptr %[[NXT]], ptr %args +// CHECK-DAG: load i64, ptr %[[CUR]] + ret = va_arg(args, long); + va_end(args); + return ret; +} + +// If the zero-sized struct is contained in a non-zero-sized struct, +// though, it doesn't reserve any registers. +struct emptymixed { struct empty a; long b; }; +struct emptyflex { unsigned count; struct empty data[10]; }; + +// CHECK-LABEL: define{{.*}} i64 @f_emptymixed(i64 %x.coerce) +long f_emptymixed(struct emptymixed x) { return x.b; } + +// CHECK-LABEL: define{{.*}} i64 @f_emptyflex(i64 %x.coerce, i64 noundef %y) +long f_emptyflex(struct emptyflex x, long y) { return y; } + // Small structs are passed in registers. 
struct small { int *a, *b; diff --git a/clang/test/CodeGen/sparcv9-class-return.cpp b/clang/test/CodeGen/sparcv9-class-return.cpp new file mode 100644 index 00000000000000..2428219422d8a0 --- /dev/null +++ b/clang/test/CodeGen/sparcv9-class-return.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple sparcv9-unknown-unknown -emit-llvm %s -o - | FileCheck %s + +class Empty { +}; + +class Long : public Empty { +public: + long l; +}; + +// CHECK: define{{.*}} i64 @_Z4foo15Empty(i64 %e.coerce) +Empty foo1(Empty e) { + return e; +} + +// CHECK: define{{.*}} %class.Long @_Z4foo24Long(i64 %l.coerce) +Long foo2(Long l) { + return l; +} + +// CHECK: define{{.*}} i64 @_Z4foo34Long(i64 %l.coerce) +long foo3(Long l) { + return l.l; +} From 97a30448f9477e0196f9340303aa20d544f1629a Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 15 May 2024 08:57:45 -0500 Subject: [PATCH 22/71] [flang][OpenMP] Add `private` to `allocate` in parallel-sections.f90 (#92185) Add a privatizing clause to the construct that uses `allocate` clause. Amend the CHECK lines to reflect the expected output. --- flang/test/Lower/OpenMP/parallel-sections.f90 | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/flang/test/Lower/OpenMP/parallel-sections.f90 b/flang/test/Lower/OpenMP/parallel-sections.f90 index 2f78dd4562b0ae..285102e06cad1d 100644 --- a/flang/test/Lower/OpenMP/parallel-sections.f90 +++ b/flang/test/Lower/OpenMP/parallel-sections.f90 @@ -39,13 +39,10 @@ end subroutine omp_parallel_sections subroutine omp_parallel_sections_allocate(x, y) use omp_lib integer, intent(inout) :: x, y + !CHECK: omp.parallel !CHECK: %[[allocator_1:.*]] = arith.constant 4 : i64 - !CHECK: %[[allocator_2:.*]] = arith.constant 4 : i64 - !CHECK: omp.parallel allocate( - !CHECK: %[[allocator_2]] : i64 -> %{{.*}} : !fir.ref) { - !CHECK: omp.sections allocate( - !CHECK: %[[allocator_1]] : i64 -> %{{.*}} : !fir.ref) { - !$omp parallel sections allocate(omp_high_bw_mem_alloc: x) + !CHECK: omp.sections allocate(%[[allocator_1]] : i64 -> %{{.*}} : !fir.ref) { + !$omp parallel sections allocate(omp_high_bw_mem_alloc: x) private(x, y) !CHECK: omp.section { !$omp section x = x + 12 From 4ec4a8e7fe463852e197d4ff396f4911ccce7449 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 15 May 2024 09:04:07 -0500 Subject: [PATCH 23/71] [Frontend][OpenMP] Privatizing clauses in construct decomposition (#92176) Add remaining clauses with the "privatizing" property to construct decomposition, specifically to the part handling the `allocate` clause. --------- Co-authored-by: Tom Eccles --- .../Frontend/OpenMP/ConstructCompositionT.h | 20 ++- .../Frontend/OpenMP/ConstructDecompositionT.h | 5 + .../Frontend/OpenMPDecompositionTest.cpp | 123 ++++++++++++++++-- 3 files changed, 139 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h index 9dcb115a0c514b..f6ee963bd88559 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructCompositionT.h @@ -343,13 +343,31 @@ template void ConstructCompositionT::mergeDSA() { } } - // Check reductions as well, clear "shared" if set. + // Check other privatizing clauses as well, clear "shared" if set. 
+ for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_in_reduction]) { + using InReductionTy = tomp::clause::InReductionT; + using ListTy = typename InReductionTy::List; + for (auto &object : std::get(std::get(clause.u).t)) + getDsa(object).second &= ~DSA::Shared; + } + for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_linear]) { + using LinearTy = tomp::clause::LinearT; + using ListTy = typename LinearTy::List; + for (auto &object : std::get(std::get(clause.u).t)) + getDsa(object).second &= ~DSA::Shared; + } for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_reduction]) { using ReductionTy = tomp::clause::ReductionT; using ListTy = typename ReductionTy::List; for (auto &object : std::get(std::get(clause.u).t)) getDsa(object).second &= ~DSA::Shared; } + for (auto &clause : clauseSets[llvm::omp::Clause::OMPC_task_reduction]) { + using TaskReductionTy = tomp::clause::TaskReductionT; + using ListTy = typename TaskReductionTy::List; + for (auto &object : std::get(std::get(clause.u).t)) + getDsa(object).second &= ~DSA::Shared; + } tomp::ListT privateObj, sharedObj, firstpObj, lastpObj, lastpcObj; for (auto &[object, dsa] : objectDsa) { diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 5f12c62b832fc4..02c88a58e09933 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -793,9 +793,14 @@ bool ConstructDecompositionT::applyClause( // [5.2:340:33] auto canMakePrivateCopy = [](llvm::omp::Clause id) { switch (id) { + // Clauses with "privatization" property: case llvm::omp::Clause::OMPC_firstprivate: + case llvm::omp::Clause::OMPC_in_reduction: case llvm::omp::Clause::OMPC_lastprivate: + case llvm::omp::Clause::OMPC_linear: case llvm::omp::Clause::OMPC_private: + case llvm::omp::Clause::OMPC_reduction: + case llvm::omp::Clause::OMPC_task_reduction: return true; default: return false; diff --git a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp index df48e9cc0ff4a4..8157e41e833a9c 100644 --- a/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp +++ b/llvm/unittests/Frontend/OpenMPDecompositionTest.cpp @@ -288,6 +288,14 @@ std::string stringify(const omp::DirectiveWithClauses &DWC) { // --- Tests ---------------------------------------------------------- +namespace red { +// Make it easier to construct reduction operators from built-in intrinsics. 
+omp::clause::ReductionOperator +makeOp(omp::clause::DefinedOperator::IntrinsicOperator Op) { + return omp::clause::ReductionOperator{omp::clause::DefinedOperator{Op}}; +} +} // namespace red + namespace { using namespace llvm::omp; @@ -699,6 +707,92 @@ TEST_F(OpenMPDecompositionTest, Order1) { TEST_F(OpenMPDecompositionTest, Allocate1) { omp::Object x{"x"}; + // Allocate + firstprivate + omp::List Clauses{ + {OMPC_allocate, + omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_firstprivate, omp::clause::Firstprivate{{x}}}, + }; + + omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections, + Clauses); + ASSERT_EQ(Dec.output.size(), 2u); + + std::string Dir0 = stringify(Dec.output[0]); + std::string Dir1 = stringify(Dec.output[1]); + ASSERT_EQ(Dir0, "parallel shared(x)"); // (33) + ASSERT_EQ(Dir1, "sections firstprivate(x) allocate(, , , (x))"); // (33) +} + +TEST_F(OpenMPDecompositionTest, Allocate2) { + omp::Object x{"x"}; + auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add); + + // Allocate + in_reduction + omp::List Clauses{ + {OMPC_allocate, + omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_in_reduction, omp::clause::InReduction{{{Add}, {x}}}}, + }; + + omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_target_parallel, + Clauses); + ASSERT_EQ(Dec.output.size(), 2u); + + std::string Dir0 = stringify(Dec.output[0]); + std::string Dir1 = stringify(Dec.output[1]); + ASSERT_EQ(Dir0, "target in_reduction((3), (x)) allocate(, , , (x))"); // (33) + ASSERT_EQ(Dir1, "parallel"); // (33) +} + +TEST_F(OpenMPDecompositionTest, Allocate3) { + omp::Object x{"x"}; + + // Allocate + linear + omp::List Clauses{ + {OMPC_allocate, + omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_linear, + omp::clause::Linear{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + }; + + omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_for, + Clauses); + ASSERT_EQ(Dec.output.size(), 2u); + + std::string Dir0 = stringify(Dec.output[0]); + std::string Dir1 = stringify(Dec.output[1]); + // The "shared" clause is duplicated---this isn't harmful, but it + // should be fixed eventually. 
+ ASSERT_EQ(Dir0, "parallel shared(x) shared(x)"); // (33) + ASSERT_EQ(Dir1, "for linear(, , , (x)) firstprivate(x) lastprivate(, (x)) " + "allocate(, , , (x))"); // (33) +} + +TEST_F(OpenMPDecompositionTest, Allocate4) { + omp::Object x{"x"}; + + // Allocate + lastprivate + omp::List Clauses{ + {OMPC_allocate, + omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_lastprivate, omp::clause::Lastprivate{{std::nullopt, {x}}}}, + }; + + omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections, + Clauses); + ASSERT_EQ(Dec.output.size(), 2u); + + std::string Dir0 = stringify(Dec.output[0]); + std::string Dir1 = stringify(Dec.output[1]); + ASSERT_EQ(Dir0, "parallel shared(x)"); // (33) + ASSERT_EQ(Dir1, "sections lastprivate(, (x)) allocate(, , , (x))"); // (33) +} + +TEST_F(OpenMPDecompositionTest, Allocate5) { + omp::Object x{"x"}; + + // Allocate + private omp::List Clauses{ {OMPC_allocate, omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, @@ -715,6 +809,27 @@ TEST_F(OpenMPDecompositionTest, Allocate1) { ASSERT_EQ(Dir1, "sections private(x) allocate(, , , (x))"); // (33) } +TEST_F(OpenMPDecompositionTest, Allocate6) { + omp::Object x{"x"}; + auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add); + + // Allocate + reduction + omp::List Clauses{ + {OMPC_allocate, + omp::clause::Allocate{{std::nullopt, std::nullopt, std::nullopt, {x}}}}, + {OMPC_reduction, omp::clause::Reduction{{std::nullopt, {Add}, {x}}}}, + }; + + omp::ConstructDecomposition Dec(AnyVersion, Helper, OMPD_parallel_sections, + Clauses); + ASSERT_EQ(Dec.output.size(), 2u); + + std::string Dir0 = stringify(Dec.output[0]); + std::string Dir1 = stringify(Dec.output[1]); + ASSERT_EQ(Dir0, "parallel shared(x)"); // (33) + ASSERT_EQ(Dir1, "sections reduction(, (3), (x)) allocate(, , , (x))"); // (33) +} + // REDUCTION // [5.2:134:17-18] // Directives: do, for, loop, parallel, scope, sections, simd, taskloop, teams @@ -741,14 +856,6 @@ TEST_F(OpenMPDecompositionTest, Allocate1) { // clause on the construct, then the effect is as if the list item in the // reduction clause appears as a list item in a map clause with a map-type of // tofrom. -namespace red { -// Make is easier to construct reduction operators from built-in intrinsics. -omp::clause::ReductionOperator -makeOp(omp::clause::DefinedOperator::IntrinsicOperator Op) { - return omp::clause::ReductionOperator{omp::clause::DefinedOperator{Op}}; -} -} // namespace red - TEST_F(OpenMPDecompositionTest, Reduction1) { omp::Object x{"x"}; auto Add = red::makeOp(omp::clause::DefinedOperator::IntrinsicOperator::Add); From 8d386c63a8d38bc50acba8dba2cd5f0daca57012 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 15 May 2024 09:05:02 -0500 Subject: [PATCH 24/71] [Frontend][OpenMP] Reduction modifier must be applied somewhere (#92160) Detect the case when a reduction modifier ends up not being applied after construct decomposition, treat it as an error. This fixes a regression in the gfortran test suite after PR90098. 
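For illustration, a minimal stand-alone model of the new check (the
names here are invented for illustration and do not match the real
`ConstructDecompositionT` internals):

    // C++ sketch: a modified reduction clause distributed over the
    // leaves of a compound construct must land on at least one leaf.
    #include <cassert>
    #include <vector>

    struct Leaf {
      bool acceptsModifier; // e.g. false for "simd" with "task"
    };

    static bool applyModifiedReduction(const std::vector<Leaf> &leaves) {
      bool effectiveApplied = false;
      for (const Leaf &leaf : leaves)
        effectiveApplied |= leaf.acceptsModifier;
      // Previously the result was unconditionally "applied", so a
      // dropped modifier went unnoticed; now decomposition fails.
      return effectiveApplied;
    }

    int main() {
      assert(!applyModifiedReduction({{false}}));        // bare simd
      assert(applyModifiedReduction({{false}, {true}})); // accepted
    }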
--- .../Lower/OpenMP/invalid-reduction-modifier.f90 | 14 ++++++++++++++ .../llvm/Frontend/OpenMP/ConstructDecompositionT.h | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 diff --git a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 new file mode 100644 index 00000000000000..817c5b731c62f8 --- /dev/null +++ b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 @@ -0,0 +1,14 @@ +!Remove the --crash below once we can diagnose the issue more gracefully. +!RUN: not --crash %flang_fc1 -fopenmp -emit-hlfir -o - %s + +! Check that we reject the "task" reduction modifier on the "simd" directive. + +subroutine fred(x) + integer, intent(inout) :: x + + !$omp simd reduction(task, +:x) + do i = 1, 100 + x = foo(i) + enddo + !$omp end simd +end diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index 02c88a58e09933..3fa27608ead948 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -935,7 +935,8 @@ bool ConstructDecompositionT::applyClause( // Apply clause without modifier. leaf.clauses.push_back(unmodified); } - applied = true; + // The modifier must be applied to some construct. + applied = effectiveApplied; } if (!applied) From 7c8176ebd39c357fc4fa488861318409cd87d8f2 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 15 May 2024 22:08:17 +0800 Subject: [PATCH 25/71] [Coroutines] Remove unused function (NFC) llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp:1223:1: error: unused function 'scanPHIsAndUpdateValueMap' [-Werror,-Wunused-function] scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, ^ 1 error generated. --- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 450ea823437157..1d9cf185b75a71 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1217,23 +1217,6 @@ static void postSplitCleanup(Function &F) { #endif } -// Assuming we arrived at the block NewBlock from Prev instruction, store -// PHI's incoming values in the ResolvedValues map. -static void -scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, - DenseMap &ResolvedValues) { - auto *PrevBB = Prev->getParent(); - for (PHINode &PN : NewBlock->phis()) { - auto V = PN.getIncomingValueForBlock(PrevBB); - // See if we already resolved it. - auto VI = ResolvedValues.find(V); - if (VI != ResolvedValues.end()) - V = VI->second; - // Remember the value. - ResolvedValues[&PN] = V; - } -} - // Coroutine has no suspend points. Remove heap allocation for the coroutine // frame if possible. 
static void handleNoSuspendCoroutine(coro::Shape &Shape) {

From 466d266945196ebbdefd8d72f654551d54d68600 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Wed, 15 May 2024 15:13:53 +0100
Subject: [PATCH 26/71] [AMDGPU] Fix GFX90x check prefixes in tests (#92254)

---
 llvm/test/CodeGen/AMDGPU/agpr-register-count.ll | 6 ++----
 llvm/test/CodeGen/AMDGPU/spill-vgpr.ll          | 4 ++--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index f84d476fc12271..bd5dc6e2070986 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -157,10 +157,8 @@ declare void @undef_func()
 ; GFX908: .amdhsa_next_free_vgpr 32
 ; GFX90A: .amdhsa_next_free_vgpr 64
 ; GFX90A: .amdhsa_accum_offset 32
-; GCN908: NumVgprs: 128
-; GCN908: NumAgprs: 128
-; GCN90A: NumVgprs: 256
-; GCN90A: NumAgprs: 256
+; GCN: NumVgprs: 32
+; GCN: NumAgprs: 32
 ; GFX908: TotalNumVgprs: 32
 ; GFX90A: TotalNumVgprs: 64
 ; GFX908: VGPRBlocks: 7
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
index 9eacb88066c0f3..1cc5b7f7d14eec 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr.ll
@@ -88,8 +88,8 @@ define amdgpu_kernel void @max_10_vgprs_spill_v32(ptr addrspace(1) %p) #0 {
 ; GFX900: ScratchSize: 132
 ; GFX908: NumVgprs: 252
 ; GFX908: ScratchSize: 0
-; GCN900: VGPRBlocks: 63
-; GCN908: VGPRBlocks: 62
+; GFX900: VGPRBlocks: 63
+; GFX908: VGPRBlocks: 62
 ; GFX900: NumVGPRsForWavesPerEU: 256
 ; GFX908: NumVGPRsForWavesPerEU: 252
 define amdgpu_kernel void @max_256_vgprs_spill_9x32(ptr addrspace(1) %p) #1 {

From 61da6366d043792d7db280ce9edd2db62516e0e8 Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Wed, 15 May 2024 15:20:27 +0100
Subject: [PATCH 27/71] [flang] Initial debug info support for local variables. (#90905)

We need the information in the `DeclareOp` to generate debug information
for variables. Currently, cg-rewrite removes the `DeclareOp`. As
`AddDebugInfo` runs after that, it cannot process the `DeclareOp`. My
initial plan was to make the `AddDebugInfo` pass run before the
cg-rewrite, but that has a few issues.

1. Initially I was thinking of using the memref op to carry the variable
attr. But as @tblah suggested in #86939, it makes more sense to carry
that information on the `DeclareOp`. It also makes it easy to handle it
in codegen, and no special handling is needed for arguments. For this
reason, we need to preserve the `DeclareOp` until codegen.

2. Running earlier, we would miss the changes made in passes that run
between cg-rewrite and codegen.

But not removing the DeclareOp in cg-rewrite has the issue that ShapeOp
remains and causes errors during codegen. To solve this problem, I
convert DeclareOp to XDeclareOp in cg-rewrite instead of removing it.
This was mentioned as a possible solution by @jeanPerier in
https://reviews.llvm.org/D136254

The conversion follows similar logic to that used for other operators in
that file. The FortranAttr and CudaAttr are currently not converted but
left as a TODO for when the need arises.

Now the `AddDebugInfo` pass can extract information about local
variables from `XDeclareOp` and create `DILocalVariableAttr`s. These are
attached to the `XDeclareOp` using the `FusedLoc` approach. Codegen can
use them to create `DbgDeclareOp`s. I have added tests that check the
debug information both in MLIR form and in LLVM IR.

Currently we only handle very limited types.
The rest are given a placeholder type. The previous placeholder type was
a basic type with `DW_ATE_address` encoding. When variables were added,
it started causing assertions in the LLVM debug info generation logic
for some types. It has been changed to an integer type to prevent these
issues until we handle those types properly.
---
 .../flang}/Optimizer/CodeGen/CGOps.h          |  1 +
 .../include/flang/Optimizer/CodeGen/CGOps.td  | 34 +++++++
 .../flang/Optimizer/CodeGen/CGPasses.td       |  4 +
 .../include/flang/Optimizer/CodeGen/CodeGen.h |  6 +-
 flang/include/flang/Tools/CLOptions.inc       | 11 ++-
 flang/lib/Optimizer/CodeGen/CGOps.cpp         |  2 +-
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 50 +++++++---
 flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp  | 49 ++++++++--
 .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 56 +++++++++++-
 .../Transforms/DebugTypeGenerator.cpp         | 10 +-
 flang/test/Fir/declare-codegen.fir            | 22 +++--
 flang/test/Fir/dummy-scope-codegen.fir        | 11 ++-
 flang/test/Transforms/debug-local-var-2.f90   | 91 +++++++++++++++++++
 flang/test/Transforms/debug-local-var.f90     | 54 +++++++++++
 14 files changed, 354 insertions(+), 47 deletions(-)
 rename flang/{lib => include/flang}/Optimizer/CodeGen/CGOps.h (94%)
 create mode 100644 flang/test/Transforms/debug-local-var-2.f90
 create mode 100644 flang/test/Transforms/debug-local-var.f90

diff --git a/flang/lib/Optimizer/CodeGen/CGOps.h b/flang/include/flang/Optimizer/CodeGen/CGOps.h
similarity index 94%
rename from flang/lib/Optimizer/CodeGen/CGOps.h
rename to flang/include/flang/Optimizer/CodeGen/CGOps.h
index b5a6d5bb9a9e6f..df909d9ee81cb4 100644
--- a/flang/lib/Optimizer/CodeGen/CGOps.h
+++ b/flang/include/flang/Optimizer/CodeGen/CGOps.h
@@ -13,6 +13,7 @@
 #ifndef OPTIMIZER_CODEGEN_CGOPS_H
 #define OPTIMIZER_CODEGEN_CGOPS_H

+#include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/flang/include/flang/Optimizer/CodeGen/CGOps.td b/flang/include/flang/Optimizer/CodeGen/CGOps.td
index 35e70fa2ffa3fb..c375edee1fa77f 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGOps.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGOps.td
@@ -16,6 +16,8 @@
 include "mlir/IR/SymbolInterfaces.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
+include "flang/Optimizer/Dialect/FIRAttr.td"
+include "mlir/IR/BuiltinAttributes.td"

 def fircg_Dialect : Dialect {
   let name = "fircg";
@@ -202,4 +204,36 @@ def fircg_XArrayCoorOp : fircg_Op<"ext_array_coor", [AttrSizedOperandSegments]>
   }];
 }

+// Extended Declare operation.
+def fircg_XDeclareOp : fircg_Op<"ext_declare", [AttrSizedOperandSegments]> {
+  let summary = "for internal conversion only";
+
+  let description = [{
+    Prior to lowering to LLVM IR dialect, a DeclareOp will
+    be converted to an extended DeclareOp.
+  }];
+
+  let arguments = (ins
+    AnyRefOrBox:$memref,
+    Variadic:$shape,
+    Variadic:$shift,
+    Variadic:$typeparams,
+    Optional:$dummy_scope,
+    Builtin_StringAttr:$uniq_name
+  );
+  let results = (outs AnyRefOrBox);
+
+  let assemblyFormat = [{
+    $memref (`(` $shape^ `)`)? (`origin` $shift^)? (`typeparams` $typeparams^)?
+    (`dummy_scope` $dummy_scope^)?
+    attr-dict `:` functional-type(operands, results)
+  }];
+
+  let extraClassDeclaration = [{
+    // Shape is optional, but if it exists, it will be at offset 1.
+ unsigned shapeOffset() { return 1; } + unsigned shiftOffset() { return shapeOffset() + getShape().size(); } + }]; +} + #endif diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td index f524fb42373444..565920e55e6a8d 100644 --- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td +++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td @@ -47,6 +47,10 @@ def CodeGenRewrite : Pass<"cg-rewrite", "mlir::ModuleOp"> { let dependentDialects = [ "fir::FIROpsDialect", "fir::FIRCodeGenDialect" ]; + let options = [ + Option<"preserveDeclare", "preserve-declare", "bool", /*default=*/"false", + "Preserve DeclareOp during pre codegen re-write."> + ]; let statistics = [ Statistic<"numDCE", "num-dce'd", "Number of operations eliminated"> ]; diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h index 26097dabf56c45..4d2b191b46d088 100644 --- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h +++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h @@ -30,7 +30,8 @@ struct NameUniquer; /// Prerequiste pass for code gen. Perform intermediate rewrites to perform /// the code gen (to LLVM-IR dialect) conversion. -std::unique_ptr createFirCodeGenRewritePass(); +std::unique_ptr createFirCodeGenRewritePass( + CodeGenRewriteOptions Options = CodeGenRewriteOptions{}); /// FirTargetRewritePass options. struct TargetRewriteOptions { @@ -88,7 +89,8 @@ void populateFIRToLLVMConversionPatterns(fir::LLVMTypeConverter &converter, fir::FIRToLLVMPassOptions &options); /// Populate the pattern set with the PreCGRewrite patterns. -void populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns); +void populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns, + bool preserveDeclare); // declarative passes #define GEN_PASS_REGISTRATION diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index cc3431d5b71d29..761315e0abc812 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -169,9 +169,11 @@ inline void addMemoryAllocationOpt(mlir::PassManager &pm) { } #if !defined(FLANG_EXCLUDE_CODEGEN) -inline void addCodeGenRewritePass(mlir::PassManager &pm) { - addPassConditionally( - pm, disableCodeGenRewrite, fir::createFirCodeGenRewritePass); +inline void addCodeGenRewritePass(mlir::PassManager &pm, bool preserveDeclare) { + fir::CodeGenRewriteOptions options; + options.preserveDeclare = preserveDeclare; + addPassConditionally(pm, disableCodeGenRewrite, + [&]() { return fir::createFirCodeGenRewritePass(options); }); } inline void addTargetRewritePass(mlir::PassManager &pm) { @@ -353,7 +355,8 @@ inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig config, llvm::StringRef inputFilename = {}) { fir::addBoxedProcedurePass(pm); addNestedPassToAllTopLevelOperations(pm, fir::createAbstractResultOpt); - fir::addCodeGenRewritePass(pm); + fir::addCodeGenRewritePass( + pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo)); fir::addTargetRewritePass(pm); fir::addExternalNameConversionPass(pm, config.Underscoring); fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename); diff --git a/flang/lib/Optimizer/CodeGen/CGOps.cpp b/flang/lib/Optimizer/CodeGen/CGOps.cpp index 44d07d26dd2b68..6b8ba74525556e 100644 --- a/flang/lib/Optimizer/CodeGen/CGOps.cpp +++ b/flang/lib/Optimizer/CodeGen/CGOps.cpp @@ -10,7 +10,7 @@ // 
//===----------------------------------------------------------------------===// -#include "CGOps.h" +#include "flang/Optimizer/CodeGen/CGOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 21154902d23f8f..72172f63888e1c 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -12,7 +12,7 @@ #include "flang/Optimizer/CodeGen/CodeGen.h" -#include "CGOps.h" +#include "flang/Optimizer/CodeGen/CGOps.h" #include "flang/Optimizer/CodeGen/CodeGenOpenMP.h" #include "flang/Optimizer/CodeGen/FIROpPatterns.h" #include "flang/Optimizer/CodeGen/TypeConverter.h" @@ -170,6 +170,28 @@ genAllocationScaleSize(OP op, mlir::Type ity, return nullptr; } +namespace { +struct DeclareOpConversion : public fir::FIROpConversion { +public: + using FIROpConversion::FIROpConversion; + mlir::LogicalResult + matchAndRewrite(fir::cg::XDeclareOp declareOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto memRef = adaptor.getOperands()[0]; + if (auto fusedLoc = mlir::dyn_cast(declareOp.getLoc())) { + if (auto varAttr = + mlir::dyn_cast_or_null( + fusedLoc.getMetadata())) { + rewriter.create(memRef.getLoc(), memRef, + varAttr, nullptr); + } + } + rewriter.replaceOp(declareOp, memRef); + return mlir::success(); + } +}; +} // namespace + namespace { /// convert to LLVM IR dialect `alloca` struct AllocaOpConversion : public fir::FIROpConversion { @@ -3714,19 +3736,19 @@ void fir::populateFIRToLLVMConversionPatterns( BoxOffsetOpConversion, BoxProcHostOpConversion, BoxRankOpConversion, BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion, CmpcOpConversion, ConstcOpConversion, ConvertOpConversion, - CoordinateOpConversion, DTEntryOpConversion, DivcOpConversion, - EmboxOpConversion, EmboxCharOpConversion, EmboxProcOpConversion, - ExtractValueOpConversion, FieldIndexOpConversion, FirEndOpConversion, - FreeMemOpConversion, GlobalLenOpConversion, GlobalOpConversion, - HasValueOpConversion, InsertOnRangeOpConversion, InsertValueOpConversion, - IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion, - MulcOpConversion, NegcOpConversion, NoReassocOpConversion, - SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion, - SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion, - ShiftOpConversion, SliceOpConversion, StoreOpConversion, - StringLitOpConversion, SubcOpConversion, TypeDescOpConversion, - TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion, - UndefOpConversion, UnreachableOpConversion, + CoordinateOpConversion, DTEntryOpConversion, DeclareOpConversion, + DivcOpConversion, EmboxOpConversion, EmboxCharOpConversion, + EmboxProcOpConversion, ExtractValueOpConversion, FieldIndexOpConversion, + FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, + GlobalOpConversion, HasValueOpConversion, InsertOnRangeOpConversion, + InsertValueOpConversion, IsPresentOpConversion, LenParamIndexOpConversion, + LoadOpConversion, MulcOpConversion, NegcOpConversion, + NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion, + SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion, + ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion, + StoreOpConversion, StringLitOpConversion, SubcOpConversion, + TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion, + 
UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, UnrealizedConversionCastOpConversion, XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter, options); diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp index 5bd3ec8d18450e..c54a7457db7610 100644 --- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp +++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp @@ -12,8 +12,8 @@ #include "flang/Optimizer/CodeGen/CodeGen.h" -#include "CGOps.h" #include "flang/Optimizer/Builder/Todo.h" // remove when TODO's are done +#include "flang/Optimizer/CodeGen/CGOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" @@ -270,13 +270,43 @@ class ArrayCoorConversion : public mlir::OpRewritePattern { }; class DeclareOpConversion : public mlir::OpRewritePattern { + bool preserveDeclare; + public: using OpRewritePattern::OpRewritePattern; + DeclareOpConversion(mlir::MLIRContext *ctx, bool preserveDecl) + : OpRewritePattern(ctx), preserveDeclare(preserveDecl) {} mlir::LogicalResult matchAndRewrite(fir::DeclareOp declareOp, mlir::PatternRewriter &rewriter) const override { - rewriter.replaceOp(declareOp, declareOp.getMemref()); + if (!preserveDeclare) { + rewriter.replaceOp(declareOp, declareOp.getMemref()); + return mlir::success(); + } + auto loc = declareOp.getLoc(); + llvm::SmallVector shapeOpers; + llvm::SmallVector shiftOpers; + if (auto shapeVal = declareOp.getShape()) { + if (auto shapeOp = mlir::dyn_cast(shapeVal.getDefiningOp())) + populateShape(shapeOpers, shapeOp); + else if (auto shiftOp = + mlir::dyn_cast(shapeVal.getDefiningOp())) + populateShapeAndShift(shapeOpers, shiftOpers, shiftOp); + else if (auto shiftOp = + mlir::dyn_cast(shapeVal.getDefiningOp())) + populateShift(shiftOpers, shiftOp); + else + return mlir::failure(); + } + // FIXME: Add FortranAttrs and CudaAttrs + auto xDeclOp = rewriter.create( + loc, declareOp.getType(), declareOp.getMemref(), shapeOpers, shiftOpers, + declareOp.getTypeparams(), declareOp.getDummyScope(), + declareOp.getUniqName()); + LLVM_DEBUG(llvm::dbgs() + << "rewriting " << declareOp << " to " << xDeclOp << '\n'); + rewriter.replaceOp(declareOp, xDeclOp.getOperation()->getResults()); return mlir::success(); } }; @@ -297,6 +327,7 @@ class DummyScopeOpConversion class CodeGenRewrite : public fir::impl::CodeGenRewriteBase { public: + CodeGenRewrite(fir::CodeGenRewriteOptions opts) : Base(opts) {} void runOnOperation() override final { mlir::ModuleOp mod = getOperation(); @@ -314,7 +345,7 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase { mlir::cast(embox.getType()).getEleTy())); }); mlir::RewritePatternSet patterns(&context); - fir::populatePreCGRewritePatterns(patterns); + fir::populatePreCGRewritePatterns(patterns, preserveDeclare); if (mlir::failed( mlir::applyPartialConversion(mod, target, std::move(patterns)))) { mlir::emitError(mlir::UnknownLoc::get(&context), @@ -330,12 +361,14 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase { } // namespace -std::unique_ptr fir::createFirCodeGenRewritePass() { - return std::make_unique(); +std::unique_ptr +fir::createFirCodeGenRewritePass(fir::CodeGenRewriteOptions Options) { + return std::make_unique(Options); } -void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns) { +void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns, + bool preserveDeclare) { patterns.insert( - 
patterns.getContext()); + DummyScopeOpConversion>(patterns.getContext()); + patterns.add(patterns.getContext(), preserveDeclare); } diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index 908c8fc96f633e..cfad366cb5cb5a 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -15,6 +15,7 @@ #include "flang/Common/Version.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Todo.h" +#include "flang/Optimizer/CodeGen/CGOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" @@ -45,13 +46,59 @@ namespace fir { namespace { class AddDebugInfoPass : public fir::impl::AddDebugInfoBase { + void handleDeclareOp(fir::cg::XDeclareOp declOp, + mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scopeAttr, + fir::DebugTypeGenerator &typeGen); + public: AddDebugInfoPass(fir::AddDebugInfoOptions options) : Base(options) {} void runOnOperation() override; }; +static uint32_t getLineFromLoc(mlir::Location loc) { + uint32_t line = 1; + if (auto fileLoc = mlir::dyn_cast(loc)) + line = fileLoc.getLine(); + return line; +} + } // namespace +void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp, + mlir::LLVM::DIFileAttr fileAttr, + mlir::LLVM::DIScopeAttr scopeAttr, + fir::DebugTypeGenerator &typeGen) { + mlir::MLIRContext *context = &getContext(); + mlir::OpBuilder builder(context); + auto result = fir::NameUniquer::deconstruct(declOp.getUniqName()); + + if (result.first != fir::NameUniquer::NameKind::VARIABLE) + return; + + // Only accept local variables. + if (result.second.procs.empty()) + return; + + // FIXME: There may be cases where an argument is processed a bit before + // DeclareOp is generated. In that case, DeclareOp may point to an + // intermediate op and not to BlockArgument. We need to find those cases and + // walk the chain to get to the actual argument. 
+ + unsigned argNo = 0; + if (auto Arg = llvm::dyn_cast(declOp.getMemref())) + argNo = Arg.getArgNumber() + 1; + + auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()), + fileAttr, scopeAttr, declOp.getLoc()); + + auto localVarAttr = mlir::LLVM::DILocalVariableAttr::get( + context, scopeAttr, mlir::StringAttr::get(context, result.second.name), + fileAttr, getLineFromLoc(declOp.getLoc()), argNo, /* alignInBits*/ 0, + tyAttr); + declOp->setLoc(builder.getFusedLoc({declOp->getLoc()}, localVarAttr)); +} + void AddDebugInfoPass::runOnOperation() { mlir::ModuleOp module = getOperation(); mlir::MLIRContext *context = &getContext(); @@ -144,14 +191,15 @@ void AddDebugInfoPass::runOnOperation() { subprogramFlags = subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition; } - unsigned line = 1; - if (auto funcLoc = mlir::dyn_cast(l)) - line = funcLoc.getLine(); - + unsigned line = getLineFromLoc(l); auto spAttr = mlir::LLVM::DISubprogramAttr::get( context, id, compilationUnit, fileAttr, funcName, fullName, funcFileAttr, line, line, subprogramFlags, subTypeAttr); funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr)); + + funcOp.walk([&](fir::cg::XDeclareOp declOp) { + handleDeclareOp(declOp, fileAttr, spAttr, typeGen); + }); }); } diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index e5b4050dfb2426..64c6547e06e0f9 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -24,11 +24,6 @@ DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m) LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n"); } -static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) { - return mlir::LLVM::DIBasicTypeAttr::get( - context, llvm::dwarf::DW_TAG_base_type, "void", 32, 1); -} - static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context, mlir::StringAttr name, unsigned bitSize, @@ -37,6 +32,11 @@ static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context, context, llvm::dwarf::DW_TAG_base_type, name, bitSize, decoding); } +static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) { + return genBasicType(context, mlir::StringAttr::get(context, "integer"), 32, + llvm::dwarf::DW_ATE_signed); +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, diff --git a/flang/test/Fir/declare-codegen.fir b/flang/test/Fir/declare-codegen.fir index 9d68d3b2f9d4de..c5879facb1572f 100644 --- a/flang/test/Fir/declare-codegen.fir +++ b/flang/test/Fir/declare-codegen.fir @@ -1,5 +1,7 @@ // Test rewrite of fir.declare. The result is replaced by the memref operand. 
-// RUN: fir-opt --cg-rewrite %s -o - | FileCheck %s +// RUN: fir-opt --cg-rewrite="preserve-declare=true" %s -o - | FileCheck %s --check-prefixes DECL +// RUN: fir-opt --cg-rewrite="preserve-declare=false" %s -o - | FileCheck %s --check-prefixes NODECL +// RUN: fir-opt --cg-rewrite %s -o - | FileCheck %s --check-prefixes NODECL func.func @test(%arg0: !fir.ref>) { @@ -15,9 +17,14 @@ func.func @test(%arg0: !fir.ref>) { func.func private @bar(%arg0: !fir.ref>) -// CHECK-LABEL: func.func @test( -// CHECK-SAME: %[[arg0:.*]]: !fir.ref>) { -// CHECK-NEXT: fir.call @bar(%[[arg0]]) : (!fir.ref>) -> () +// NODECL-LABEL: func.func @test( +// NODECL-SAME: %[[arg0:.*]]: !fir.ref>) { +// NODECL-NEXT: fir.call @bar(%[[arg0]]) : (!fir.ref>) -> () + +// DECL-LABEL: func.func @test( +// DECL-SAME: %[[arg0:.*]]: !fir.ref>) { +// DECL: fircg.ext_declare + func.func @useless_shape_with_duplicate_extent_operand(%arg0: !fir.ref>) { %c3 = arith.constant 3 : index @@ -26,5 +33,8 @@ func.func @useless_shape_with_duplicate_extent_operand(%arg0: !fir.ref) { %scope = fir.dummy_scope : !fir.dscope %0 = fir.declare %arg0 dummy_scope %scope {uniq_name = "x"} : (!fir.ref, !fir.dscope) -> !fir.ref return } -// CHECK-LABEL: func.func @dummy_scope( -// CHECK-NEXT: return +// DECL-LABEL: func.func @dummy_scope( +// DECL: fircg.ext_declare + +// NODECL-LABEL: func.func @dummy_scope( +// NODECL-NEXT: return \ No newline at end of file diff --git a/flang/test/Transforms/debug-local-var-2.f90 b/flang/test/Transforms/debug-local-var-2.f90 new file mode 100644 index 00000000000000..15b9b148492e14 --- /dev/null +++ b/flang/test/Transforms/debug-local-var-2.f90 @@ -0,0 +1,91 @@ +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s + +! This tests checks the debug information for local variables in llvm IR. + +! CHECK-LABEL: define void @_QQmain +! CHECK-DAG: %[[AL11:.*]] = alloca i32 +! CHECK-DAG: %[[AL12:.*]] = alloca i64 +! CHECK-DAG: %[[AL13:.*]] = alloca i8 +! CHECK-DAG: %[[AL14:.*]] = alloca i32 +! CHECK-DAG: %[[AL15:.*]] = alloca float +! CHECK-DAG: %[[AL16:.*]] = alloca double +! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL11]], metadata ![[I4:.*]], metadata !DIExpression()) +! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL12]], metadata ![[I8:.*]], metadata !DIExpression()) +! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL13]], metadata ![[L1:.*]], metadata !DIExpression()) +! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL14]], metadata ![[L4:.*]], metadata !DIExpression()) +! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL15]], metadata ![[R4:.*]], metadata !DIExpression()) +! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL16]], metadata ![[R8:.*]], metadata !DIExpression()) +! CHECK-LABEL: } + +! CHECK-LABEL: define {{.*}}i64 @_QFPfn1 +! CHECK-SAME: (ptr %[[ARG1:.*]], ptr %[[ARG2:.*]], ptr %[[ARG3:.*]]) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression()) +! CHECK-DAG: %[[AL2:.*]] = alloca i64 +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression()) +! CHECK-LABEL: } + +! CHECK-LABEL: define {{.*}}i32 @_QFPfn2 +! 
CHECK-SAME: (ptr %[[FN2ARG1:.*]], ptr %[[FN2ARG2:.*]], ptr %[[FN2ARG3:.*]]) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression()) +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression()) +! CHECK-DAG: %[[AL3:.*]] = alloca i32 +! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression()) +! CHECK-LABEL: } + +program mn +! CHECK-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "_QQmain", {{.*}}) + +! CHECK-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) +! CHECK-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) +! CHECK-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical", size: 8, encoding: DW_ATE_boolean) +! CHECK-DAG: ![[TYL32:.*]] = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean) +! CHECK-DAG: ![[TYR32:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float) +! CHECK-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real", size: 64, encoding: DW_ATE_float) + +! CHECK-DAG: ![[I4]] = !DILocalVariable(name: "i4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI32]]) +! CHECK-DAG: ![[I8]] = !DILocalVariable(name: "i8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI64]]) +! CHECK-DAG: ![[R4]] = !DILocalVariable(name: "r4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYR32]]) +! CHECK-DAG: ![[R8]] = !DILocalVariable(name: "r8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYR64]]) +! CHECK-DAG: ![[L1]] = !DILocalVariable(name: "l1", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYL8]]) +! CHECK-DAG: ![[L4]] = !DILocalVariable(name: "l4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYL32]]) + integer(kind=4) :: i4 + integer(kind=8) :: i8 + real(kind=4) :: r4 + real(kind=8) :: r8 + logical(kind=1) :: l1 + logical(kind=4) :: l4 + + i8 = fn1(i4, r8, l1) + i4 = fn2(i8, r4, l4) +contains +! CHECK-DAG: ![[FN1:.*]] = distinct !DISubprogram(name: "fn1", {{.*}}) +! CHECK-DAG: ![[A1]] = !DILocalVariable(name: "a1", arg: 1, scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI32]]) +! CHECK-DAG: ![[B1]] = !DILocalVariable(name: "b1", arg: 2, scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYR64]]) +! CHECK-DAG: ![[C1]] = !DILocalVariable(name: "c1", arg: 3, scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYL8]]) +! CHECK-DAG: ![[RES1]] = !DILocalVariable(name: "res1", scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI64]]) + function fn1(a1, b1, c1) result (res1) + integer(kind=4), intent(in) :: a1 + real(kind=8), intent(in) :: b1 + logical(kind=1), intent(in) :: c1 + integer(kind=8) :: res1 + + res1 = a1 + b1 + end function + +! CHECK-DAG: ![[FN2:.*]] = distinct !DISubprogram(name: "fn2", {{.*}}) +! CHECK-DAG: ![[A2]] = !DILocalVariable(name: "a2", arg: 1, scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI64]]) +! CHECK-DAG: ![[B2]] = !DILocalVariable(name: "b2", arg: 2, scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYR32]]) +! CHECK-DAG: ![[C2]] = !DILocalVariable(name: "c2", arg: 3, scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYL32]]) +! 
CHECK-DAG: ![[RES2]] = !DILocalVariable(name: "res2", scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI32]]) + function fn2(a2, b2, c2) result (res2) + integer(kind=8), intent(in) :: a2 + real(kind=4), intent(in) :: b2 + logical(kind=4), intent(in) :: c2 + integer(kind=4) :: res2 + + res2 = a2 + b2 + end function +end program diff --git a/flang/test/Transforms/debug-local-var.f90 b/flang/test/Transforms/debug-local-var.f90 new file mode 100644 index 00000000000000..96dc111ad308e1 --- /dev/null +++ b/flang/test/Transforms/debug-local-var.f90 @@ -0,0 +1,54 @@ +! RUN: %flang_fc1 -emit-fir -debug-info-kind=standalone -mmlir --mlir-print-debuginfo %s -o - | \ +! RUN: fir-opt --cg-rewrite="preserve-declare=true" --mlir-print-debuginfo | fir-opt --add-debug-info --mlir-print-debuginfo | FileCheck %s + +! CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type +! CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type +! CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type +! CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type +! CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type +! CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type +! CHECK-DAG: #[[MAIN:.*]] = #llvm.di_subprogram<{{.*}}name = "_QQmain"{{.*}}> +! CHECK-DAG: #[[FN1:.*]] = #llvm.di_subprogram<{{.*}}name = "fn1"{{.*}}> +! CHECK-DAG: #[[FN2:.*]] = #llvm.di_subprogram<{{.*}}name = "fn2"{{.*}}> + +program mn +! CHECK-DAG: #[[I4:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[I8:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[R4:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[R8:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[L1:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[L4:.*]] = #llvm.di_local_variable + integer(kind=4) :: i4 + integer(kind=8) :: i8 + real(kind=4) :: r4 + real(kind=8) :: r8 + logical(kind=1) :: l1 + logical(kind=4) :: l4 + i8 = fn1(i4, r8, l1) + i4 = fn2(i8, r4, l4) +contains + function fn1(a1, b1, c1) result (res1) +! CHECK-DAG: #[[A1:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[B1:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[C1:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[RES1:.*]] = #llvm.di_local_variable + integer(kind=4), intent(in) :: a1 + real(kind=8), intent(in) :: b1 + logical(kind=1), intent(in) :: c1 + integer(kind=8) :: res1 + res1 = a1 + b1 + end function + + function fn2(a2, b2, c2) result (res2) + implicit none +! CHECK-DAG: #[[A2:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[B2:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[C2:.*]] = #llvm.di_local_variable +! CHECK-DAG: #[[RES2:.*]] = #llvm.di_local_variable + integer(kind=8), intent(in) :: a2 + real(kind=4), intent(in) :: b2 + logical(kind=4), intent(in) :: c2 + integer(kind=4) :: res2 + res2 = a2 + b2 + end function +end program From eda098aadea3e542f95b5f0d4173f00eae42dc72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Endre=20F=C3=BCl=C3=B6p?= Date: Wed, 15 May 2024 16:26:17 +0200 Subject: [PATCH 28/71] [clang][analyzer] Fix a crash in alpha.unix.BlockInCriticalSection (#90030) When analyzing C code with function pointers the checker crashes because of how the implementation extracts `IdentifierInfo`. Without the fix, this test crashes. 
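As a rough sketch of the failure mode, with stand-in types rather than
the analyzer's real classes: a call through a function pointer has no
callee declaration, so any context reached through the callee is a null
pointer, whereas the context reached through the program state is always
available.

    #include <cassert>

    struct ASTContext {};

    struct CallEvent {
      ASTContext *calleeASTContext; // via the callee decl; null here
      ASTContext *stateASTContext;  // via the program state; never null
    };

    int main() {
      ASTContext ctx;
      // Models `int (*a)(void); a();` -- there is no callee decl.
      CallEvent call{/*calleeASTContext=*/nullptr, &ctx};
      assert(call.calleeASTContext == nullptr); // old code dereferenced
      assert(call.stateASTContext != nullptr);  // new code asks the state
    }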
--- .../Checkers/BlockInCriticalSectionChecker.cpp | 5 ++--- clang/test/Analysis/block-in-critical-section.c | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 clang/test/Analysis/block-in-critical-section.c diff --git a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp index e138debd1361ca..92347f8fafc000 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BlockInCriticalSectionChecker.cpp @@ -103,9 +103,8 @@ class RAIIMutexDescriptor { // this function is called instead of early returning it. To avoid this, a // bool variable (IdentifierInfoInitialized) is used and the function will // be run only once. - Guard = &Call.getCalleeAnalysisDeclContext()->getASTContext().Idents.get( - GuardName); - IdentifierInfoInitialized = true; + const auto &ASTCtx = Call.getState()->getStateManager().getContext(); + Guard = &ASTCtx.Idents.get(GuardName); } } diff --git a/clang/test/Analysis/block-in-critical-section.c b/clang/test/Analysis/block-in-critical-section.c new file mode 100644 index 00000000000000..1e174af541b183 --- /dev/null +++ b/clang/test/Analysis/block-in-critical-section.c @@ -0,0 +1,6 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.BlockInCriticalSection -verify %s +// expected-no-diagnostics + +// This should not crash +int (*a)(void); +void b(void) { a(); } From da116bd82c0a78d2022c34b56e45cf6e4f91eaed Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 15 May 2024 15:37:51 +0100 Subject: [PATCH 29/71] [Clang] Use ULL for std::max constant argument to fix build failure. getKnownMinValue returns uint64_t, use ULL to make sure the second arg is also 64 bit. --- clang/lib/CodeGen/Targets/Sparc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/Targets/Sparc.cpp b/clang/lib/CodeGen/Targets/Sparc.cpp index b82e9a69e19671..13e9550781d170 100644 --- a/clang/lib/CodeGen/Targets/Sparc.cpp +++ b/clang/lib/CodeGen/Targets/Sparc.cpp @@ -266,7 +266,7 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const { // All structs, even empty ones, should take up a register argument slot, // so pin the minimum struct size to one bit. CB.pad(llvm::alignTo( - std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), 1UL), 64)); + std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), 1ULL), 64)); // Try to use the original type for coercion. llvm::Type *CoerceTy = CB.isUsableType(StrTy) ? StrTy : CB.getType(); From b42d245b77a83f8f6ca88c2dc441a96a5e8d5b52 Mon Sep 17 00:00:00 2001 From: AdityaK Date: Wed, 15 May 2024 07:44:34 -0700 Subject: [PATCH 30/71] [GVNHoist] Replace combineKnownMetadata with combineMetadataForCSE (#92197) There is no reason to call combineMetadata directly with a list of MD_ nodes. 
The combineMetadataForCSE function handles all the metadata correctly Partially fixes: #30866 --- llvm/lib/Transforms/Scalar/GVNHoist.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 261c1259c9c961..b5333c532280ca 100644 --- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -238,18 +238,6 @@ class CallInfo { const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; } }; -static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) { - static const unsigned KnownIDs[] = {LLVMContext::MD_tbaa, - LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, - LLVMContext::MD_range, - LLVMContext::MD_fpmath, - LLVMContext::MD_invariant_load, - LLVMContext::MD_invariant_group, - LLVMContext::MD_access_group}; - combineMetadata(ReplInst, I, KnownIDs, true); -} - // This pass hoists common computations across branches sharing common // dominator. The primary goal is to reduce the code size, and in some // cases reduce critical path (by exposing more ILP). @@ -996,8 +984,8 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl, MSSAUpdater->removeMemoryAccess(OldMA); } + combineMetadataForCSE(Repl, I, true); Repl->andIRFlags(I); - combineKnownMetadata(Repl, I); I->replaceAllUsesWith(Repl); // Also invalidate the Alias Analysis cache. MD->removeInstruction(I); From 8a4cbeada930bf11fe740a2038bd5a3230712284 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 15 May 2024 15:48:22 +0100 Subject: [PATCH 31/71] [Clang] Unbreak build take 2 using uint64_t() explicitly. --- clang/lib/CodeGen/Targets/Sparc.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/Targets/Sparc.cpp b/clang/lib/CodeGen/Targets/Sparc.cpp index 13e9550781d170..561f0b514d909b 100644 --- a/clang/lib/CodeGen/Targets/Sparc.cpp +++ b/clang/lib/CodeGen/Targets/Sparc.cpp @@ -266,7 +266,8 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const { // All structs, even empty ones, should take up a register argument slot, // so pin the minimum struct size to one bit. CB.pad(llvm::alignTo( - std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), 1ULL), 64)); + std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), uint64_t(1)), + 64)); // Try to use the original type for coercion. llvm::Type *CoerceTy = CB.isUsableType(StrTy) ? StrTy : CB.getType(); From dceaa0f4491ebe30c0b0f1bc7fa5ec365b60ced6 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 15 May 2024 10:55:34 -0400 Subject: [PATCH 32/71] [Support] Use malloc instead of non-throwing new (#92157) When allocating a memory buffer, we use a non-throwing new so that we can explicitly handle memory buffers that are too large to fit into memory. However, when exceptions are disabled, LLVM installs a custom new handler (https://github.com/llvm/llvm-project/blob/90109d444839683b09f0aafdc50b749cb4b3203b/llvm/lib/Support/InitLLVM.cpp#L61) that explicitly crashes when we run out of memory (https://github.com/llvm/llvm-project/blob/de14b749fee41d4ded711e771e43043ae3100cb3/llvm/lib/Support/ErrorHandling.cpp#L188) and that means this particular out-of-memory situation cannot be gracefully handled. This was discovered while working on #embed (https://github.com/llvm/llvm-project/pull/68620) on Windows and resulted in a crash rather than the preprocessor issuing a diagnostic as expected. 
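The effect is easy to reproduce outside of LLVM. The following
stand-alone sketch shows why a non-throwing allocation cannot report
failure once such a handler is installed; the oversized request is only
meant to force an allocation failure on typical configurations.

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <new>

    // Stand-in for LLVM's out-of-memory handler: report and abort.
    [[noreturn]] static void crashingHandler() {
      std::fputs("out of memory\n", stderr);
      std::abort();
    }

    int main() {
      std::set_new_handler(crashingHandler);
      // On failure the runtime invokes the handler instead of
      // returning nullptr, so a graceful bail-out is never reached.
      char *p = new (std::nothrow) char[SIZE_MAX / 2];
      if (!p) // unreachable while the handler is installed
        std::puts("allocation failed gracefully");
      delete[] p;
    }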
This patch switches away from the non-throwing new to a call to malloc (and free), which will return a null pointer without calling a custom new handler. It is the only instance in Clang or LLVM that I could find which used a non-throwing new, so I did not think we would need anything more involved than this change. Testing this would be highly platform dependent and so it does not come with test coverage. And because it doesn't change behavior that users are likely to be able to observe, it does not come with a release note. --- llvm/lib/Support/MemoryBuffer.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 4cc4fe019b75b1..50308bd2bf4a3b 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -98,7 +98,7 @@ class MemoryBufferMem : public MB { /// Disable sized deallocation for MemoryBufferMem, because it has /// tail-allocated data. - void operator delete(void *p) { ::operator delete(p); } + void operator delete(void *p) { std::free(p); } StringRef getBufferIdentifier() const override { // The name is stored after the class itself. @@ -315,7 +315,14 @@ WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, size_t RealLen = StringLen + Size + 1 + BufAlign.value(); if (RealLen <= Size) // Check for rollover. return nullptr; - char *Mem = static_cast(operator new(RealLen, std::nothrow)); + // We use a call to malloc() rather than a call to a non-throwing operator + // new() because LLVM unconditionally installs an out of memory new handler + // when exceptions are disabled. This new handler intentionally crashes to + // aid with debugging, but that makes non-throwing new calls unhelpful. + // See MemoryBufferMem::operator delete() for the paired call to free(), and + // llvm::install_out_of_memory_new_handler() for the installation of the + // custom new handler. + char *Mem = static_cast(std::malloc(RealLen)); if (!Mem) return nullptr; From 8ab753c121447c1388c4cb1af08ab27b2cd62a82 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 15 May 2024 10:10:49 -0500 Subject: [PATCH 33/71] [flang][OpenMP] Add `REQUIRES: asserts` to test that relies on it This should fix failures in release builds. --- flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 index 817c5b731c62f8..53871276761fa4 100644 --- a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 +++ b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 @@ -1,4 +1,5 @@ !Remove the --crash below once we can diagnose the issue more gracefully. +!REQUIRES: asserts !RUN: not --crash %flang_fc1 -fopenmp -emit-hlfir -o - %s ! Check that we reject the "task" reduction modifier on the "simd" directive. 
From 413aaf11cd74f422f05b990613f822dc10db4391 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Wed, 15 May 2024 13:03:35 +0200
Subject: [PATCH 34/71] [clang][Interp][NFC] Support IntAP(S) in emitPrimCast

---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 1da74ac7c8bd1d..7cdc1c6d1947c6 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -3685,12 +3685,22 @@ bool ByteCodeExprGen::emitPrimCast(PrimType FromT, PrimType ToT,
       return this->emitCastFP(ToSem, getRoundingMode(E), E);
     }

+    if (ToT == PT_IntAP)
+      return this->emitCastFloatingIntegralAP(Ctx.getBitWidth(ToQT), E);
+    if (ToT == PT_IntAPS)
+      return this->emitCastFloatingIntegralAPS(Ctx.getBitWidth(ToQT), E);
+
     // Float to integral.
     if (isIntegralType(ToT) || ToT == PT_Bool)
       return this->emitCastFloatingIntegral(ToT, E);
   }

   if (isIntegralType(FromT) || FromT == PT_Bool) {
+    if (ToT == PT_IntAP)
+      return this->emitCastAP(FromT, Ctx.getBitWidth(ToQT), E);
+    if (ToT == PT_IntAPS)
+      return this->emitCastAPS(FromT, Ctx.getBitWidth(ToQT), E);
+
     // Integral to integral.
     if (isIntegralType(ToT) || ToT == PT_Bool)
       return FromT != ToT ? this->emitCast(FromT, ToT, E) : true;

From 28d5f7907e8c3adb6f0e2e16c9673a99f5e07522 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Wed, 15 May 2024 15:15:07 +0200
Subject: [PATCH 35/71] [clang][Interp][NFC] Use a smaller default size for IntegralAP

Since we may later initialize the value by using operator-new, we need
the default value to _not_ allocate memory.

---
 clang/lib/AST/Interp/IntegralAP.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/IntegralAP.h b/clang/lib/AST/Interp/IntegralAP.h
index fb7ee14515715a..7464f15cdb03b4 100644
--- a/clang/lib/AST/Interp/IntegralAP.h
+++ b/clang/lib/AST/Interp/IntegralAP.h
@@ -61,7 +61,7 @@ template class IntegralAP final {
   IntegralAP(APInt V) : V(V) {}

   /// Arbitrary value for uninitialized variables.
-  IntegralAP() : IntegralAP(-1, 1024) {}
+  IntegralAP() : IntegralAP(-1, 3) {}

   IntegralAP operator-() const { return IntegralAP(-V); }
   IntegralAP operator-(const IntegralAP &Other) const {

From 4527adc500ea0dc4b942a51dc7209da4ea26d9a2 Mon Sep 17 00:00:00 2001
From: Daniel Kuts
Date: Wed, 15 May 2024 18:15:13 +0300
Subject: [PATCH 36/71] Fix null pointer dereference in logging in mlir TransformOps (#92237)

The variable `typeConverterOp` may be nullptr after the dynamic cast.
There is a guard for this, but the variable was dereferenced while
logging the error message. Found with static analysis.
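The shape of the defect, distilled into a stand-alone example; here
`dynamic_cast` stands in for `llvm::dyn_cast` and the types are toy
versions invented for illustration:

    #include <cstdio>

    struct Operation {
      const char *loc = "module.mlir:1:1";
      virtual ~Operation() = default;
    };
    struct TypeConverterOp : Operation {};

    static void verifyConverter(Operation *maybeTypeConverter) {
      auto *op = dynamic_cast<TypeConverterOp *>(maybeTypeConverter);
      if (!op) {
        // Buggy: op->loc was read here -- a null dereference.
        // Fixed: report the location of the pre-cast operation.
        std::printf("error at %s: op without interface\n",
                    maybeTypeConverter->loc);
      }
    }

    int main() {
      Operation notAConverter;
      verifyConverter(&notAConverter); // diagnoses instead of crashing
    }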
--- mlir/lib/Dialect/Transform/IR/TransformOps.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index eb09f007fbca88..247759e21efb16 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -648,13 +648,14 @@ LogicalResult transform::ApplyConversionPatternsOp::verify() { if (!llvm::hasSingleElement(typeConverterRegion.front())) return emitOpError() << "expected exactly one op in default type converter region"; + Operation *maybeTypeConverter = &typeConverterRegion.front().front(); auto typeConverterOp = dyn_cast( - &typeConverterRegion.front().front()); + maybeTypeConverter); if (!typeConverterOp) { InFlightDiagnostic diag = emitOpError() << "expected default converter child op to " "implement TypeConverterBuilderOpInterface"; - diag.attachNote(typeConverterOp->getLoc()) << "op without interface"; + diag.attachNote(maybeTypeConverter->getLoc()) << "op without interface"; return diag; } // Check default type converter type. From b576a6b0452b9bfb634feaa215506d8a1afe857d Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 15 May 2024 23:15:48 +0800 Subject: [PATCH 37/71] [X86][AMX] Fix a bug after #83628 (#91207) We need to check if `GR64Cand` a valid register before using it. Test is not needed since it's covered in llvm-test-suite. Fixes #90954 --- llvm/lib/Target/X86/X86LowerTileCopy.cpp | 4 +- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll | 153 ++++++++++++++++++++ 2 files changed, 155 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp index fd05e16ac1ceff..60c024556ff13f 100644 --- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp +++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp @@ -146,7 +146,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS) .addReg(SrcReg, getKillRegState(SrcMO.isKill())); MachineOperand &MO = NewMI->getOperand(2); - MO.setReg(GR64Cand); + MO.setReg(GR64Cand ? 
GR64Cand : X86::RAX); MO.setIsKill(true); // tileloadd (%sp, %idx), %tmm Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -157,7 +157,7 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { // restore %rax // mov (%sp) %rax addFrameReference( - BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand), StrideSS); + BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), X86::RAX), StrideSS); } MI.eraseFromParent(); Changed = true; diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll index 4a9f9d3bf77aac..7511e5953dac14 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -51,3 +51,156 @@ declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_ declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) + +define void @PR90954(ptr %0, ptr %1, i32 %2) { +; CHECK-LABEL: PR90954: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $16, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $64, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: shll $4, %edx +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movw $64, %cx +; CHECK-NEXT: movw $16, %di +; CHECK-NEXT: movb $1, %r8b +; CHECK-NEXT: movl $64, %r9d +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %r11 +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %r14d, %r14d +; CHECK-NEXT: jmp .LBB1_1 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB1_5: # in Loop: Header=BB1_1 Depth=1 +; CHECK-NEXT: incq %r14 +; CHECK-NEXT: addl %edx, %ebx +; CHECK-NEXT: .LBB1_1: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB1_2 Depth 2 +; CHECK-NEXT: movslq %ebx, %r15 +; CHECK-NEXT: leaq (%rsi,%r15,4), %r15 +; CHECK-NEXT: xorl %r12d, %r12d +; CHECK-NEXT: xorl %r13d, %r13d +; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB1_4: # in Loop: Header=BB1_2 Depth=2 +; CHECK-NEXT: tilestored %tmm1, (%r15,%rax) +; CHECK-NEXT: incq %r13 +; CHECK-NEXT: addq $64, %r15 +; CHECK-NEXT: decq %r12 +; CHECK-NEXT: je .LBB1_5 +; CHECK-NEXT: .LBB1_2: # Parent Loop BB1_1 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-NEXT: testb %r8b, %r8b +; CHECK-NEXT: jne .LBB1_4 +; CHECK-NEXT: # %bb.3: # in Loop: Header=BB1_2 Depth=2 +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; 
CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloadd (%r10,%r9), %tmm1 +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloadd (%r11,%r9), %tmm2 +; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tilestored %tmm0, 3072(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm1 # 1024-byte Folded Reload +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-NEXT: jmp .LBB1_4 + %4 = shl i32 %2, 4 + %5 = icmp eq i64 0, 0 + br label %6 + +6: ; preds = %31, %3 + %7 = phi i64 [ 0, %3 ], [ %32, %31 ] + %8 = trunc nuw nsw i64 %7 to i32 + %9 = mul i32 %4, %8 + %10 = mul i32 0, %8 + %11 = sext i32 %9 to i64 + %12 = getelementptr inbounds i32, ptr %1, i64 %11 + br label %13 + +13: ; preds = %25, %6 + %14 = phi i64 [ %29, %25 ], [ 0, %6 ] + %15 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64) + %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15) + %17 = shl nsw i64 %14, 4 + %18 = getelementptr i32, ptr %0, i64 %17 + br i1 %5, label %25, label %19 + +19: ; preds = %13 + %20 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16) + %21 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer) + %22 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> zeroinitializer) + %23 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %20, x86_amx %21, x86_amx %22) + %24 = tail call noundef <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %23) + br label %25 + +25: ; preds = %19, %13 + %26 = phi <256 x i32> [ undef, %13 ], [ %24, %19 ] + %27 = getelementptr inbounds i32, ptr %12, i64 %17 + %28 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %26) + tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %27, i64 0, x86_amx %28) + %29 = add nuw nsw i64 %14, 1 + %30 = icmp eq i64 %29, 0 + br i1 %30, label %31, label %13 + +31: ; preds = %25 + %32 = add nuw nsw i64 %7, 1 + br label %6 +} + +declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) +declare <256 x i32> 
@llvm.x86.cast.tile.to.vector.v256i32(x86_amx) From bed5546bb53bdb231b62f569b67f449019426ce8 Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Wed, 15 May 2024 20:46:15 +0530 Subject: [PATCH 38/71] [DebugInfo] Get rid of redundant conditional checks in `/DebugInfo` (#92111) Resolves #90326 --- llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h index 3f7f8c7838fd16..efc8db12a69720 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h @@ -246,20 +246,17 @@ class LVObject { virtual void setName(StringRef ObjectName) {} LVElement *getParent() const { - assert((!Parent.Element || - (Parent.Element && static_cast(Parent.Element))) && + assert((!Parent.Element || static_cast(Parent.Element)) && "Invalid element"); return Parent.Element; } LVScope *getParentScope() const { - assert((!Parent.Scope || - (Parent.Scope && static_cast(Parent.Scope))) && + assert((!Parent.Scope || static_cast(Parent.Scope)) && "Invalid scope"); return Parent.Scope; } LVSymbol *getParentSymbol() const { - assert((!Parent.Symbol || - (Parent.Symbol && static_cast(Parent.Symbol))) && + assert((!Parent.Symbol || static_cast(Parent.Symbol)) && "Invalid symbol"); return Parent.Symbol; } From dcf3102be8458fe7588f9d11315beddfca4323b0 Mon Sep 17 00:00:00 2001 From: Elvina Yakubova Date: Wed, 15 May 2024 16:16:39 +0100 Subject: [PATCH 39/71] [BOLT][NFC] Add documentation on BOLT options (#92117) Add .md file documentation with all BOLT options to display it more conveniently. --- bolt/docs/CommandLineArgumentReference.md | 1213 +++++++++++++++++++++ 1 file changed, 1213 insertions(+) create mode 100644 bolt/docs/CommandLineArgumentReference.md diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md new file mode 100644 index 00000000000000..1951ad5a2dc5eb --- /dev/null +++ b/bolt/docs/CommandLineArgumentReference.md @@ -0,0 +1,1213 @@ +# BOLT - a post-link optimizer developed to speed up large applications + +## SYNOPSIS + +`llvm-bolt [-o outputfile] .bolt [-data=perf.fdata] [options]` + +## OPTIONS + +### Generic options + +- `-h` + + Alias for `--help` + +- `--help` + + Display available options (`--help-hidden` for more). + +- `--help-hidden` + + Display all available options. + +- `--help-list` + + Display list of available options (`--help-list-hidden` for more). + +- `--help-list-hidden` + + Display list of all available options. + +- `--print-all-options` + + Print all option values after command line parsing. + +- `--print-options` + + Print non-default options after command line parsing. + +- `--version` + + Display the version of this program. + +### Output options + +- `-o ` + + output file + +- `-w ` + + Save recorded profile to a file + +### BOLT generic options + +- `--align-text=` + + Alignment of .text section + +- `--allow-stripped` + + Allow processing of stripped binaries + +- `--asm-dump[=]` + + Dump function into assembly + +- `-b` + + Alias for -data + +- `--bolt-id=` + + Add any string to tag this execution in the output binary via bolt info section + +- `--break-funcs=` + + List of functions to core dump on (debugging) + +- `--check-encoding` + + Perform verification of LLVM instruction encoding/decoding. Every instruction + in the input is decoded and re-encoded. 
If the resulting bytes do not match + the input, a warning message is printed. + +- `--cu-processing-batch-size=` + + Specifies the size of batches for processing CUs. Higher number has better + performance, but more memory usage. Default value is 1. + +- `--data=` + + + +- `--debug-skeleton-cu` + + Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched. + +- `--deterministic-debuginfo` + + Disables parallel execution of tasks that may produce nondeterministic debug info + +- `--dot-tooltip-code` + + Add basic block instructions as tool tips on nodes + +- `--dump-cg=` + + Dump callgraph to the given file + +- `--dump-data` + + Dump parsed bolt data for debugging + +- `--dump-dot-all` + + Dump function CFGs to graphviz format after each stage; enable '-print-loops' + for color-coded blocks + +- `--dump-orc` + + Dump raw ORC unwind information (sorted) + +- `--dwarf-output-path=` + + Path to where .dwo files or dwp file will be written out to. + +- `--dwp=` + + Path and name to DWP file. + +- `--dyno-stats` + + Print execution info based on profile + +- `--dyno-stats-all` + + Print dyno stats after each stage + +- `--dyno-stats-scale=` + + Scale to be applied while reporting dyno stats + +- `--enable-bat` + + Write BOLT Address Translation tables + +- `--force-data-relocations` + + Force relocations to data sections to always be processed + +- `--force-patch` + + Force patching of original entry points + +- `--funcs=` + + Limit optimizations to functions from the list + +- `--funcs-file=` + + File with list of functions to optimize + +- `--funcs-file-no-regex=` + + File with list of functions to optimize (non-regex) + +- `--funcs-no-regex=` + + Limit optimizations to functions from the list (non-regex) + +- `--hot-data` + + Hot data symbols support (relocation mode) + +- `--hot-functions-at-end` + + If reorder-functions is used, order functions putting hottest last + +- `--hot-text` + + Generate hot text symbols. Apply this option to a precompiled binary that + manually calls into hugify, such that at runtime hugify call will put hot + code into 2M pages. This requires relocation. + +- `--hot-text-move-sections=` + + List of sections containing functions used for hugifying hot text. BOLT makes + sure these functions are not placed on the same page as the hot text. + (default='.stub,.mover'). 
+ +- `--insert-retpolines` + + Run retpoline insertion pass + +- `--keep-aranges` + + Keep or generate .debug_aranges section if .gdb_index is written + +- `--keep-tmp` + + Preserve intermediate .o file + +- `--lite` + + Skip processing of cold functions + +- `--max-data-relocations=` + + Maximum number of data relocations to process + +- `--max-funcs=` + + Maximum number of functions to process + +- `--no-huge-pages` + + Use regular size pages for code alignment + +- `--no-threads` + + Disable multithreading + +- `--pad-funcs=` + + List of functions to pad with amount of bytes + +- `--profile-format=` + + Format to dump profile output in aggregation mode, default is fdata + - `=fdata`: offset-based plaintext format + - `=yaml`: dense YAML representation + +- `--r11-availability=` + + Determine the availability of r11 before indirect branches + - `=never`: r11 not available + - `=always`: r11 available before calls and jumps + - `=abi`r11 available before calls but not before jumps + +- `--relocs` + + Use relocations in the binary (default=autodetect) + +- `--remove-symtab` + + Remove .symtab section + +- `--reorder-skip-symbols=` + + List of symbol names that cannot be reordered + +- `--reorder-symbols=` + + List of symbol names that can be reordered + +- `--retpoline-lfence` + + Determine if lfence instruction should exist in the retpoline + +- `--skip-funcs=` + + List of functions to skip + +- `--skip-funcs-file=` + + File with list of functions to skip + +- `--strict` + + Trust the input to be from a well-formed source + +- `--tasks-per-thread=` + + Number of tasks to be created per thread + +- `--thread-count=` + + Number of threads + +- `--top-called-limit=` + + Maximum number of functions to print in top called functions section + +- `--trap-avx512` + + In relocation mode trap upon entry to any function that uses AVX-512 instructions + +- `--trap-old-code` + + Insert traps in old function bodies (relocation mode) + +- `--update-debug-sections` + + Update DWARF debug sections of the executable + +- `--use-gnu-stack` + + Use GNU_STACK program header for new segment (workaround for issues with + strip/objcopy) + +- `--use-old-text` + + Re-use space in old .text if possible (relocation mode) + +- `-v ` + + Set verbosity level for diagnostic output + +- `--write-dwp` + + Output a single dwarf package file (dwp) instead of multiple non-relocatable + dwarf object files (dwo). + +### BOLT optimization options + +- `--align-blocks` + + Align basic blocks + +- `--align-blocks-min-size=` + + Minimal size of the basic block that should be aligned + +- `--align-blocks-threshold=` + + Align only blocks with frequency larger than containing function execution + frequency specified in percent. E.g. 1000 means aligning blocks that are 10 + times more frequently executed than the containing function. 
+ +- `--align-functions=` + + Align functions at a given value (relocation mode) + +- `--align-functions-max-bytes=` + + Maximum number of bytes to use to align functions + +- `--assume-abi` + + Assume the ABI is never violated + +- `--block-alignment=` + + Boundary to use for alignment of basic blocks + +- `--bolt-seed=` + + Seed for randomization + +- `--cg-from-perf-data` + + Use perf data directly when constructing the call graph for stale functions + +- `--cg-ignore-recursive-calls` + + Ignore recursive calls when constructing the call graph + +- `--cg-use-split-hot-size` + + Use hot/cold data on basic blocks to determine hot sizes for call graph functions + +- `--cold-threshold=` + + Tenths of percents of main entry frequency to use as a threshold when + evaluating whether a basic block is cold (0 means it is only considered + cold if the block has zero samples). Default: 0 + +- `--elim-link-veneers` + + Run veneer elimination pass + +- `--eliminate-unreachable` + + Eliminate unreachable code + +- `--equalize-bb-counts` + + Use same count for BBs that should have equivalent count (used in non-LBR + and shrink wrapping) + +- `--execution-count-threshold=` + + Perform profiling accuracy-sensitive optimizations only if function execution + count >= the threshold (default: 0) + +- `--fix-block-counts` + + Adjust block counts based on outgoing branch counts + +- `--fix-func-counts` + + Adjust function counts based on basic blocks execution count + +- `--force-inline=` + + List of functions to always consider for inlining + +- `--frame-opt=` + + Optimize stack frame accesses + - `none`: do not perform frame optimization + - `hot`: perform FOP on hot functions + - `all`: perform FOP on all functions + +- `--frame-opt-rm-stores` + + Apply additional analysis to remove stores (experimental) + +- `--function-order=` + + File containing an ordered list of functions to use for function reordering + +- `--generate-function-order=` + + File to dump the ordered list of functions to use for function reordering + +- `--generate-link-sections=` + + Generate a list of function sections in a format suitable for inclusion in a + linker script + +- `--group-stubs` + + Share stubs across functions + +- `--hugify` + + Automatically put hot code on 2MB page(s) (hugify) at runtime. No manual call + to hugify is needed in the binary (which is what --hot-text relies on). 
+ +- `--icf` + + Fold functions with identical code + +- `--icp` + + Alias for --indirect-call-promotion + +- `--icp-calls-remaining-percent-threshold=` + + The percentage threshold against remaining unpromoted indirect call count + for the promotion for calls + +- `--icp-calls-topn` + + Alias for --indirect-call-promotion-calls-topn + +- `--icp-calls-total-percent-threshold=` + + The percentage threshold against total count for the promotion for calls + +- `--icp-eliminate-loads` + + Enable load elimination using memory profiling data when performing ICP + +- `--icp-funcs=` + + List of functions to enable ICP for + +- `--icp-inline` + + Only promote call targets eligible for inlining + +- `--icp-jt-remaining-percent-threshold=` + + The percentage threshold against remaining unpromoted indirect call count for + the promotion for jump tables + +- `--icp-jt-targets` + + Alias for --icp-jump-tables-targets + +- `--icp-jt-topn` + + Alias for --indirect-call-promotion-jump-tables-topn + +- `--icp-jt-total-percent-threshold=` + + The percentage threshold against total count for the promotion for jump tables + +- `--icp-jump-tables-targets` + + For jump tables, optimize indirect jmp targets instead of indices + +- `--icp-mp-threshold` + + Alias for --indirect-call-promotion-mispredict-threshold + +- `--icp-old-code-sequence` + + Use old code sequence for promoted calls + +- `--icp-top-callsites=` + + Optimize hottest calls until at least this percentage of all indirect calls + frequency is covered. 0 = all callsites + +- `--icp-topn` + + Alias for --indirect-call-promotion-topn + +- `--icp-use-mp` + + Alias for --indirect-call-promotion-use-mispredicts + +- `--indirect-call-promotion=` + + Indirect call promotion + - `none`: do not perform indirect call promotion + - `calls`: perform ICP on indirect calls + - `jump-tables`: perform ICP on jump tables + - `all`: perform ICP on calls and jump tables + +- `--indirect-call-promotion-calls-topn=` + + Limit number of targets to consider when doing indirect call promotion on + calls. 0 = no limit + +- `--indirect-call-promotion-jump-tables-topn=` + + Limit number of targets to consider when doing indirect call promotion on + jump tables. 0 = no limit + +- `--indirect-call-promotion-mispredict-threshold=` + + Misprediction threshold for skipping ICP on an indirect call + +- `--indirect-call-promotion-topn=` + + Limit number of targets to consider when doing indirect call promotion. + 0 = no limit + +- `--indirect-call-promotion-use-mispredicts` + + Use misprediction frequency for determining whether or not ICP should be + applied at a callsite. The `-indirect-call-promotion-mispredict-threshold` + value will be used by this heuristic + +- `--infer-fall-throughs` + + Infer execution count for fall-through blocks + +- `--infer-stale-profile` + + Infer counts from stale profile data. 
+ +- `--inline-all` + + Inline all functions + +- `--inline-ap` + + Adjust function profile after inlining + +- `--inline-limit=` + + Maximum number of call sites to inline + +- `--inline-max-iters=` + + Maximum number of inline iterations + +- `--inline-memcpy` + + Inline memcpy using 'rep movsb' instruction (X86-only) + +- `--inline-small-functions` + + Inline functions if increase in size is less than defined by `-inline-small-functions-bytes` + +- `--inline-small-functions-bytes=` + + Max number of bytes for the function to be considered small for inlining purposes + +- `--instrument` + + Instrument code to generate accurate profile data + +- `--iterative-guess` + + In non-LBR mode, guess edge counts using iterative technique + +- `--jt-footprint-optimize-for-icache` + + With jt-footprint-reduction, only process PIC jumptables and turn off other + transformations that increase code size + +- `--jt-footprint-reduction` + + Make jump tables size smaller at the cost of using more instructions at jump + sites + +- `-jump-tables=` + + Jump tables support (default=basic) + - `none`: do not optimize functions with jump tables + - `basic`: optimize functions with jump tables + - `move`: move jump tables to a separate section + - `split`: split jump tables section into hot and cold based on function + execution frequency + - `aggressive`: aggressively split jump tables section based on usage of the + tables + +- `--keep-nops` + + Keep no-op instructions. By default they are removed. + +- `--lite-threshold-count=` + + Similar to '-lite-threshold-pct' but specify threshold using absolute function + call count. I.e. limit processing to functions executed at least the specified + number of times. + +- `--lite-threshold-pct=` + + Threshold (in percent) for selecting functions to process in lite mode. Higher + threshold means fewer functions to process. E.g threshold of 90 means only top + 10 percent of functions with profile will be processed. 
+ +- `--mcf-use-rarcs` + + In MCF, consider the possibility of cancelling flow to balance edges + +- `--memcpy1-spec=` + + List of functions with call sites for which to specialize memcpy() for size 1 + +- `--min-branch-clusters` + + Use a modified clustering algorithm geared towards minimizing branches + +- `--no-inline` + + Disable all inlining (overrides other inlining options) + +- `--no-scan` + + Do not scan cold functions for external references (may result in slower binary) + +- `--peepholes=` + + Enable peephole optimizations + - `none`: disable peepholes + - `double-jumps`: remove double jumps when able + - `tailcall-traps`: insert tail call traps + - `useless-branches`: remove useless conditional branches + - `all`: enable all peephole optimizations + +- `--plt=` + + Optimize PLT calls (requires linking with -znow) + - `none`: do not optimize PLT calls + - `hot`: optimize executed (hot) PLT calls + - `all`: optimize all PLT calls + +- `--preserve-blocks-alignment` + + Try to preserve basic block alignment + +- `--profile-ignore-hash` + + Ignore hash while reading function profile + +- `--profile-use-dfs` + + Use DFS order for YAML profile + +- `--reg-reassign` + + Reassign registers so as to avoid using REX prefixes in hot code + +- `--reorder-blocks=` + + Change layout of basic blocks in a function + - `none`: do not reorder basic blocks + - `reverse`: layout blocks in reverse order + - `normal`: perform optimal layout based on profile + - `branch-predictor`: perform optimal layout prioritizing branch predictions + - `cache`: perform optimal layout prioritizing I-cache behavior + - `cache+`: perform layout optimizing I-cache behavior + - `ext-tsp`: perform layout optimizing I-cache behavior + - `cluster-shuffle`: perform random layout of clusters + +- `--reorder-data=` + + List of sections to reorder + +- `--reorder-data-algo=` + + Algorithm used to reorder data sections + - `count`: sort hot data by read counts + - `funcs`: sort hot data by hot function usage and count + +- `--reorder-data-inplace` + + Reorder data sections in place + +- `--reorder-data-max-bytes=` + + Maximum number of bytes to reorder + +- `--reorder-data-max-symbols=` + + Maximum number of symbols to reorder + +- `--reorder-functions=` + + Reorder and cluster functions (works only with relocations) + - `none`: do not reorder functions + - `exec-count`: order by execution count + - `hfsort`: use hfsort algorithm + - `hfsort+`: use hfsort+ algorithm + - `cdsort`: use cache-directed sort + - `pettis-hansen`: use Pettis-Hansen algorithm + - `random`: reorder functions randomly + - `user`: use function order specified by -function-order + +- `--reorder-functions-use-hot-size` + + Use a function's hot size when doing clustering + +- `--report-bad-layout=` + + Print top functions with suboptimal code layout on input + +- `--report-stale` + + Print the list of functions with stale profile + +- `--runtime-hugify-lib=` + + Specify file name of the runtime hugify library + +- `--runtime-instrumentation-lib=` + + Specify file name of the runtime instrumentation library + +- `--sctc-mode=` + + Mode for simplify conditional tail calls + - `always`: always perform sctc + - `preserve`: only perform sctc when branch direction is preserved + - `heuristic`: use branch prediction data to control sctc + +- `--sequential-disassembly` + + Performs disassembly sequentially + +- `--shrink-wrapping-threshold=` + + Percentage of prologue execution count to use as threshold when evaluating + whether a block is cold enough to be 
profitable to move eligible spills there + +- `--simplify-conditional-tail-calls` + + Simplify conditional tail calls by removing unnecessary jumps + +- `--simplify-rodata-loads` + + Simplify loads from read-only sections by replacing the memory operand with + the constant found in the corresponding section + +- `--split-align-threshold=` + + When deciding to split a function, apply this alignment while doing the size + comparison (see -split-threshold). Default value: 2. + +- `--split-all-cold` + + Outline as many cold basic blocks as possible + +- `--split-eh` + + Split C++ exception handling code + +- `--split-functions` + + Split functions into fragments + +- `--split-strategy=` + + Strategy used to partition blocks into fragments + + - `profile2`: split each function into a hot and cold fragment using + profiling information + - `cdsplit`: split each function into a hot, warm, and cold fragment using + profiling information + - `random2`: split each function into a hot and cold fragment at a randomly + chosen split point (ignoring any available profiling information) + - `randomN`: split each function into N fragments at randomly chosen split + points (ignoring any available profiling information) + - `all`: split all basic blocks of each function into fragments such that + each fragment contains exactly a single basic block + +- `--split-threshold=` + + Split function only if its main size is reduced by more than given amount of + bytes. Default value: 0, i.e. split iff the size is reduced. Note that on + some architectures the size can increase after splitting. + +- `--stale-matching-max-func-size=` + + The maximum size of a function to consider for inference. + +- `--stale-threshold=` + + Maximum percentage of stale functions to tolerate (default: 100) + +- `--stoke` + + Turn on the stoke analysis + +- `--strip-rep-ret` + + Strip 'repz' prefix from 'repz retq' sequence (on by default) + +- `--tail-duplication=` + + Duplicate unconditional branches that cross a cache line + + - `none` do not apply + - `aggressive` aggressive strategy + - `moderate` moderate strategy + - `cache` cache-aware duplication strategy + +- `--tsp-threshold=` + + Maximum number of hot basic blocks in a function for which to use a precise TSP solution while re-ordering basic blocks + +- `--use-aggr-reg-reassign` + + Use register liveness analysis to try to find more opportunities for -reg-reassign optimization + +- `--use-compact-aligner` + + Use compact approach for aligning functions + +- `--use-edge-counts` + + Use edge count data when doing clustering + +- `--verify-cfg` + + Verify the CFG after every pass + +- `--x86-align-branch-boundary-hot-only` + + Only apply branch boundary alignment in hot code + +- `--x86-strip-redundant-address-size` + + Remove redundant Address-Size override prefix + +### BOLT options in relocation mode + +- `-align-macro-fusion=` + + Fix instruction alignment for macro-fusion (x86 relocation mode) + + - `none`: do not insert alignment no-ops for macro-fusion + - `hot`: only insert alignment no-ops on hot execution paths (default) + - `all`: always align instructions to allow macro-fusion + +### BOLT instrumentation options + +`llvm-bolt -instrument [-o outputfile] ` + +- `--conservative-instrumentation` + + Disable instrumentation optimizations that sacrifice profile accuracy (for + debugging, default: false) + +- `--instrument-calls` + + Record profile for inter-function control flow activity (default: true) + +- `--instrument-hot-only` + + Only insert instrumentation on hot 
functions (needs profile, default: false) + +- `--instrumentation-binpath=` + + Path to instrumented binary in case if /proc/self/map_files is not accessible + due to access restriction issues + +- `--instrumentation-file=` + + File name where instrumented profile will be saved (default: /tmp/prof.fdata) + +- `--instrumentation-file-append-pid` + + Append PID to saved profile file name (default: false) + +- `--instrumentation-no-counters-clear` + + Don't clear counters across dumps (use with `instrumentation-sleep-time` option) + +- `--instrumentation-sleep-time=` + + Interval between profile writes (default: 0 = write only at program end). + This is useful for service workloads when you want to dump profile every X + minutes or if you are killing the program and the profile is not being + dumped at the end. + +- `--instrumentation-wait-forks` + + Wait until all forks of instrumented process will finish (use with + `instrumentation-sleep-time` option) + +### Data aggregation options (perf2bolt) + +`perf2bolt -p perf.data [-o outputfile] perf.fdata ` + +- `--autofdo` + + Generate autofdo textual data instead of bolt data + +- `--filter-mem-profile` + + If processing a memory profile, filter out stack or heap accesses that won't + be useful for BOLT to reduce profile file size + +- `--ignore-build-id` + + Continue even if build-ids in input binary and perf.data mismatch + +- `--ignore-interrupt-lbr` + + Ignore kernel interrupt LBR that happens asynchronously + +- `--itrace=` + + Generate LBR info with perf itrace argument + +- `--nl` + + Aggregate basic samples (without LBR info) + +- `--pa` + + Skip perf and read data from a pre-aggregated file format + +- `--perfdata=` + + Data file + +- `--pid=` + + Only use samples from process with specified PID + +- `--time-aggr` + + Time BOLT aggregator + +- `--use-event-pc` + + Use event PC in combination with LBR sampling + +### BOLT printing options + +#### Generic options + +- `--print-aliases` + + Print aliases when printing objects + +- `--print-all` + + Print functions after each stage + +- `--print-cfg` + + Print functions after CFG construction + +- `--print-debug-info` + + Print debug info when printing functions + +- `--print-disasm` + + Print function after disassembly + +- `--print-dyno-opcode-stats=` + + Print per instruction opcode dyno stats and the functionnames:BB offsets of + the nth highest execution counts + +- `--print-dyno-stats-only` + + While printing functions output dyno-stats and skip instructions + +- `--print-exceptions` + + Print exception handling data + +- `--print-globals` + + Print global symbols after disassembly + +- `--print-jump-tables` + + Print jump tables + +- `--print-loops` + + Print loop related information + +- `--print-mem-data` + + Print memory data annotations when printing functions + +- `--print-normalized` + + Print functions after CFG is normalized + +- `--print-only=` + + List of functions to print + +- `--print-orc` + + Print ORC unwind information for instructions + +- `--print-profile` + + Print functions after attaching profile + +- `--print-profile-stats` + + Print profile quality/bias analysis + +- `--print-pseudo-probes=` + + Print pseudo probe info + - `=decode`: decode probes section from binary + - `=address_conversion`: update address2ProbesMap with output block address + - `=encoded_probes`: display the encoded probes in binary section + - `=all`: enable all debugging printout + +- `--print-relocations` + + Print relocations when printing functions/objects + +- `--print-reordered-data` 
+ + Print section contents after reordering + +- `--print-retpoline-insertion` + + Print functions after retpoline insertion pass + +- `--print-sdt` + + Print all SDT markers + +- `--print-sections` + + Print all registered sections + +- `--print-unknown` + + Print names of functions with unknown control flow + +- `--time-opts` + + Print time spent in each optimization + +#### Optimization options + +- `--print-after-branch-fixup` + + Print function after fixing local branches + +- `--print-after-jt-footprint-reduction` + + Print function after jt-footprint-reduction pass + +- `--print-after-lowering` + + Print function after instruction lowering + +- `--print-cache-metrics` + + Calculate and print various metrics for instruction cache + +- `--print-clusters` + + Print clusters + +- `--print-finalized` + + Print function after CFG is finalized + +- `--print-fix-relaxations` + + Print functions after fix relaxations pass + +- `--print-fix-riscv-calls` + + Print functions after fix RISCV calls pass + +- `--print-fop` + + Print functions after frame optimizer pass + +- `--print-function-statistics=` + + Print statistics about basic block ordering + +- `--print-icf` + + Print functions after ICF optimization + +- `--print-icp` + + Print functions after indirect call promotion + +- `--print-inline` + + Print functions after inlining optimization + +- `--print-longjmp` + + Print functions after longjmp pass + +- `--print-optimize-bodyless` + + Print functions after bodyless optimization + +- `--print-output-address-range` + + Print output address range for each basic block in the function + whenBinaryFunction::print is called + +- `--print-peepholes` + + Print functions after peephole optimization + +- `--print-plt` + + Print functions after PLT optimization + +- `--print-regreassign` + + Print functions after regreassign pass + +- `--print-reordered` + + Print functions after layout optimization + +- `--print-reordered-functions` + + Print functions after clustering + +- `--print-sctc` + + Print functions after conditional tail call simplification + +- `--print-simplify-rodata-loads` + + Print functions after simplification of RO data loads + +- `--print-sorted-by=` + + Print functions sorted by order of dyno stats + - `executed-forward-branches`: executed forward branches + - `taken-forward-branches`: taken forward branches + - `executed-backward-branches`: executed backward branches + - `taken-backward-branches`: taken backward branches + - `executed-unconditional-branches`: executed unconditional branches + - `all-function-calls`: all function calls + - `indirect-calls`: indirect calls + - `PLT-calls`: PLT calls + - `executed-instructions`: executed instructions + - `executed-load-instructions`: executed load instructions + - `executed-store-instructions`: executed store instructions + - `taken-jump-table-branches`: taken jump table branches + - `taken-unknown-indirect-branches`: taken unknown indirect branches + - `total-branches`: total branches + - `taken-branches`: taken branches + - `non-taken-conditional-branches`: non-taken conditional branches + - `taken-conditional-branches`: taken conditional branches + - `all-conditional-branches`: all conditional branches + - `linker-inserted-veneer-calls`: linker-inserted veneer calls + - `all`: sorted by all names + +- `--print-sorted-by-order=` + + Use ascending or descending order when printing functions ordered by dyno stats + +- `--print-split` + + Print functions after code splitting + +- `--print-stoke` + + Print functions after stoke 
analysis
+
+- `--print-uce`
+
+  Print functions after unreachable code elimination
+
+- `--print-veneer-elimination`
+
+  Print functions after veneer elimination pass
+
+- `--time-build`
+
+  Print time spent constructing binary functions
+
+- `--time-rewrite`
+
+  Print time spent in rewriting passes

From 8e00703be9ceb41d9b80c2bc8f024a9610b9aaa1 Mon Sep 17 00:00:00 2001
From: jyu2-git
Date: Wed, 15 May 2024 08:20:25 -0700
Subject: [PATCH 40/71] [Clang][OpenMP] Fix runtime problem when explicit map
 both pointer and pointee (#92210)

For a pointer `int *p`, tests using the following maps currently crash:

  map(p, p[:100]) or map(p, p[1])

The IR currently generated looks like:

  // &p, &p, sizeof(int), TARGET_PARAM | TO | FROM
  // &p, p[0], 100*sizeof(float), TO | FROM

The working IR is:

  // map(p, p[0:100]) is treated as map(p[0:100])
  // &p, &p[0], 100*sizeof(float), TARGET_PARAM | TO | FROM | PTR_AND_OBJ

The change adds a new argument, AreBothBasePtrAndPteeMapped, to
generateInfoForComponentList and uses it to skip the entry for map(p),
so that the entry generated while processing map(p[:100]) carries the
right flags.
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |  37 +++--
 ...arget_map_both_pointer_pointee_codegen.cpp | 150 ++++++++++++++++++
 .../test/mapping/map_both_pointer_pointee.c  |  42 +++++
 3 files changed, 220 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index e39c7c58d2780e..f56af318ff6ae1 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6830,7 +6830,8 @@ class MappableExprsHandler {
       const ValueDecl *Mapper = nullptr, bool ForDeviceAddr = false,
       const ValueDecl *BaseDecl = nullptr, const Expr *MapExpr = nullptr,
       ArrayRef<OMPClauseMappableExprCommon::MappableExprComponentListRef>
-          OverlappedElements = std::nullopt) const {
+          OverlappedElements = std::nullopt,
+      bool AreBothBasePtrAndPteeMapped = false) const {
     // The following summarizes what has to be generated for each map and the
     // types below. The generated information is expressed in this order:
     // base pointer, section pointer, size, flags
@@ -7006,6 +7007,10 @@ class MappableExprsHandler {
     // &(ps->p), &(ps->p[0]), 33*sizeof(double), MEMBER_OF(4) | PTR_AND_OBJ | TO
     // (*) the struct this entry pertains to is the 4th element in the list
     // of arguments, hence MEMBER_OF(4)
+    //
+    // map(p, p[:100])
+    // ===> map(p[:100])
+    // &p, &p[0], 100*sizeof(float), TARGET_PARAM | PTR_AND_OBJ | TO | FROM

     // Track if the map information being generated is the first for a capture.
     bool IsCaptureFirstInfo = IsFirstComponentList;
@@ -7029,6 +7034,8 @@ class MappableExprsHandler {
     const auto *OASE = dyn_cast<ArraySectionExpr>(AssocExpr);
     const auto *OAShE = dyn_cast<OMPArrayShapingExpr>(AssocExpr);

+    if (AreBothBasePtrAndPteeMapped && std::next(I) == CE)
+      return;
     if (isa<MemberExpr>(AssocExpr)) {
       // The base is the 'this' pointer. The content of the pointer is going
       // to be the base of the field being mapped.
@@ -7071,8 +7078,9 @@ class MappableExprsHandler {
       // can be associated with the combined storage if shared memory mode is
       // active or the base declaration is not global variable.
const auto *VD = dyn_cast(I->getAssociatedDeclaration()); - if (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() || - !VD || VD->hasLocalStorage()) + if (!AreBothBasePtrAndPteeMapped && + (CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory() || + !VD || VD->hasLocalStorage())) BP = CGF.EmitLoadOfPointer(BP, Ty->castAs()); else FirstPointerInComplexData = true; @@ -7394,11 +7402,13 @@ class MappableExprsHandler { // same expression except for the first one. We also need to signal // this map is the first one that relates with the current capture // (there is a set of entries for each capture). - OpenMPOffloadMappingFlags Flags = getMapTypeBits( - MapType, MapModifiers, MotionModifiers, IsImplicit, - !IsExpressionFirstInfo || RequiresReference || - FirstPointerInComplexData || IsMemberReference, - IsCaptureFirstInfo && !RequiresReference, IsNonContiguous); + OpenMPOffloadMappingFlags Flags = + getMapTypeBits(MapType, MapModifiers, MotionModifiers, IsImplicit, + !IsExpressionFirstInfo || RequiresReference || + FirstPointerInComplexData || IsMemberReference, + AreBothBasePtrAndPteeMapped || + (IsCaptureFirstInfo && !RequiresReference), + IsNonContiguous); if (!IsExpressionFirstInfo || IsMemberReference) { // If we have a PTR_AND_OBJ pair where the OBJ is a pointer as well, @@ -8492,6 +8502,8 @@ class MappableExprsHandler { assert(CurDir.is() && "Expect a executable directive"); const auto *CurExecDir = CurDir.get(); + bool HasMapBasePtr = false; + bool HasMapArraySec = false; for (const auto *C : CurExecDir->getClausesOfKind()) { const auto *EI = C->getVarRefs().begin(); for (const auto L : C->decl_component_lists(VD)) { @@ -8503,6 +8515,11 @@ class MappableExprsHandler { assert(VDecl == VD && "We got information for the wrong declaration??"); assert(!Components.empty() && "Not expecting declaration with no component lists."); + if (VD && E && VD->getType()->isAnyPointerType() && isa(E)) + HasMapBasePtr = true; + if (VD && E && VD->getType()->isAnyPointerType() && + (isa(E) || isa(E))) + HasMapArraySec = true; DeclComponentLists.emplace_back(Components, C->getMapType(), C->getMapTypeModifiers(), C->isImplicit(), Mapper, E); @@ -8685,7 +8702,9 @@ class MappableExprsHandler { MapType, MapModifiers, std::nullopt, Components, CombinedInfo, StructBaseCombinedInfo, PartialStruct, IsFirstComponentList, IsImplicit, /*GenerateAllInfoForClauses*/ false, Mapper, - /*ForDeviceAddr=*/false, VD, VarRef); + /*ForDeviceAddr=*/false, VD, VarRef, + /*OverlappedElements*/ std::nullopt, + HasMapBasePtr && HasMapArraySec); IsFirstComponentList = false; } } diff --git a/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp new file mode 100644 index 00000000000000..e2c27f37f5b9db --- /dev/null +++ b/clang/test/OpenMP/target_map_both_pointer_pointee_codegen.cpp @@ -0,0 +1,150 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple 
powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +extern void *malloc (int __size) throw () __attribute__ ((__malloc__)); + +void foo() { + int *ptr = (int *) malloc(3 * sizeof(int)); + + #pragma omp target map(ptr, ptr[0:2]) + { + ptr[1] = 6; + } + #pragma omp target map(ptr, ptr[2]) + { + ptr[2] = 8; + } +} +#endif +// CHECK-LABEL: define {{[^@]+}}@_Z3foov +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS2:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_PTRS3:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS4:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[KERNEL_ARGS5:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noalias noundef ptr @_Z6malloci(i32 noundef signext 12) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: store ptr [[CALL]], ptr [[PTR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[PTR]], ptr [[TMP2]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[ARRAYIDX]], ptr [[TMP3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i32 1, ptr [[TMP8]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP5]], ptr [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP6]], ptr [[TMP10]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr [[TMP13]], align 8 +// CHECK-NEXT: 
[[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-NEXT: store i64 0, ptr [[TMP15]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr [[TMP16]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP18]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr [[TMP19]], align 4 +// CHECK-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l15.region_id, ptr [[KERNEL_ARGS]]) +// CHECK-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0 +// CHECK-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK: omp_offload.failed: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l15(ptr [[TMP0]]) #[[ATTR3]] +// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK: omp_offload.cont: +// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[PTR]], align 8 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 2 +// CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[PTR]], ptr [[TMP24]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[ARRAYIDX1]], ptr [[TMP25]], align 8 +// CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS4]], i64 0, i64 0 +// CHECK-NEXT: store ptr null, ptr [[TMP26]], align 8 +// CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS2]], i32 0, i32 0 +// CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS3]], i32 0, i32 0 +// CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr [[TMP29]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 1 +// CHECK-NEXT: store i32 1, ptr [[TMP30]], align 4 +// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP27]], ptr [[TMP31]], align 8 +// CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP28]], ptr [[TMP32]], align 8 +// CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 4 +// CHECK-NEXT: store ptr @.offload_sizes.1, ptr [[TMP33]], align 8 +// CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP34]], align 8 +// CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr [[TMP35]], align 8 +// CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 7 +// CHECK-NEXT: store ptr null, ptr [[TMP36]], align 8 +// CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 8 +// CHECK-NEXT: store i64 0, ptr [[TMP37]], align 8 +// CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr [[TMP38]], align 8 +// CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP39]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP40]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS5]], i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr [[TMP41]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l19.region_id, ptr [[KERNEL_ARGS5]]) +// CHECK-NEXT: [[TMP43:%.*]] = icmp ne i32 [[TMP42]], 0 +// CHECK-NEXT: br i1 [[TMP43]], label [[OMP_OFFLOAD_FAILED6:%.*]], label [[OMP_OFFLOAD_CONT7:%.*]] +// CHECK: omp_offload.failed6: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l19(ptr [[TMP22]]) #[[ATTR3]] +// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT7]] +// CHECK: omp_offload.cont7: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l15 +// CHECK-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 +// CHECK-NEXT: store i32 6, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l19 +// CHECK-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[PTR]], ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +// CHECK-NEXT: store i32 8, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: ret void +// diff --git a/offload/test/mapping/map_both_pointer_pointee.c b/offload/test/mapping/map_both_pointer_pointee.c new file mode 100644 index 00000000000000..4b724823e7a40f --- /dev/null +++ b/offload/test/mapping/map_both_pointer_pointee.c @@ -0,0 +1,42 @@ +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: 
%libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu
+// RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
+
+// REQUIRES: unified_shared_memory
+// UNSUPPORTED: amdgcn-amd-amdhsa
+
+#pragma omp declare target
+int *ptr1;
+#pragma omp end declare target
+
+#include <stdio.h>
+#include <stdlib.h>
+int main() {
+  ptr1 = (int *)malloc(sizeof(int) * 100);
+  int *ptr2;
+  ptr2 = (int *)malloc(sizeof(int) * 100);
+#pragma omp target map(ptr1, ptr1[ : 100])
+  { ptr1[1] = 6; }
+  // CHECK: 6
+  printf(" %d \n", ptr1[1]);
+#pragma omp target data map(ptr1[ : 5])
+  {
+#pragma omp target map(ptr1[2], ptr1, ptr1[3]) map(ptr2, ptr2[2])
+    {
+      ptr1[2] = 7;
+      ptr1[3] = 9;
+      ptr2[2] = 7;
+    }
+  }
+  // CHECK: 7 7 9
+  printf(" %d %d %d \n", ptr2[2], ptr1[2], ptr1[3]);
+  free(ptr1);
+#pragma omp target map(ptr2, ptr2[ : 100])
+  { ptr2[1] = 6; }
+  // CHECK: 6
+  printf(" %d \n", ptr2[1]);
+  free(ptr2);
+  return 0;
+}

From ff313ee70a4f27e3555ee4baef53b9b51c5aa27e Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 15 May 2024 23:37:31 +0800
Subject: [PATCH 41/71] [RISCV] Remove hasSideEffects=1 for vsetvli pseudos
 (#91319)

In a similar vein to #90049, we currently model all of the effects of a
vsetvli pseudo:

* VL and VTYPE are marked as defs.

* VL-preserving x0,x0 vsetvlis don't get emitted until
  RISCVInsertVSETVLI, and when they are, they have implicit uses on VL.

* Regular vector pseudos are fully modelled too: before
  RISCVInsertVSETVLI they can be moved between vsetvli pseudos because
  we will eventually insert vsetvlis to correct VL and VTYPE.
  Afterwards, they will have implicit uses on VL and VTYPE.

Since we model everything, we can remove hasSideEffects=1. This gives us
some improvements like sinking in vsetvli-insert-crossbb.ll.

We need to update RISCVDeadRegisterDefinitions to keep handling vsetvli
pseudos, since it only operates on instructions with unmodelled side
effects.
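To see why dropping the flag enables optimizations such as sinking, here
is a small, self-contained C++ toy model (an analogy only, not LLVM's
actual scheduling machinery): an unmodelled side effect forces a pass to
treat the instruction as a barrier, while explicitly modelled defs and
uses only block moves that violate a real dependency.

```cpp
#include <cstdio>
#include <vector>

struct Inst {
  const char *Name;
  bool HasUnmodeledSideEffects;  // like hasSideEffects=1 on the old pseudos
  std::vector<int> Defs, Uses;   // explicitly modelled registers (VL, VTYPE)
};

// Can instruction A be sunk past instruction B?
bool canSinkPast(const Inst &A, const Inst &B) {
  if (A.HasUnmodeledSideEffects || B.HasUnmodeledSideEffects)
    return false;  // unmodelled effects: be maximally conservative
  for (int D : A.Defs)
    for (int U : B.Uses)
      if (D == U)
        return false;  // B reads a register A writes: order matters
  for (int U : A.Uses)
    for (int D : B.Defs)
      if (U == D)
        return false;  // A reads a register B writes: order matters
  return true;
}

int main() {
  enum { VL, VTYPE, X10 };
  Inst Vsetvli{"vsetvli", /*HasUnmodeledSideEffects=*/false, {VL, VTYPE}, {}};
  Inst Add{"add", false, {X10}, {X10}};
  // With the side-effect flag cleared, the independent add can move freely;
  // with it set (the old modelling), this would print 0.
  std::printf("can sink add past vsetvli: %d\n", canSinkPast(Add, Vsetvli));
  return 0;
}
```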
--- .../RISCV/RISCVDeadRegisterDefinitions.cpp | 4 +- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 2 +- .../CodeGen/RISCV/rvv/calling-conv-fastcc.ll | 38 +- llvm/test/CodeGen/RISCV/rvv/calling-conv.ll | 8 +- ...d-vectors-fnearbyint-constrained-sdnode.ll | 24 +- .../RISCV/rvv/fixed-vectors-fp2i-sat.ll | 24 +- .../CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll | 80 ++-- .../RISCV/rvv/fixed-vectors-int-buildvec.ll | 154 ++++---- ...fixed-vectors-interleaved-access-zve32x.ll | 38 +- .../CodeGen/RISCV/rvv/fixed-vectors-lrint.ll | 20 +- .../RISCV/rvv/fixed-vectors-masked-gather.ll | 372 +++++++++--------- .../RISCV/rvv/fixed-vectors-masked-scatter.ll | 69 ++-- .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll | 56 +-- .../RISCV/rvv/fixed-vectors-vselect.ll | 24 +- .../CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll | 2 +- .../RISCV/rvv/fixed-vectors-vwmulsu.ll | 6 +- .../CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll | 8 +- llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll | 154 +++++--- llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll | 154 +++++--- .../rvv/fnearbyint-constrained-sdnode.ll | 30 +- .../CodeGen/RISCV/rvv/fnearbyint-sdnode.ll | 30 +- llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll | 294 ++++++-------- llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 19 +- .../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll | 197 +++++----- llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 268 ++++++------- llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 40 +- llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 34 +- llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll | 34 +- llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll | 14 +- llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 6 +- .../RISCV/rvv/vsetvli-insert-crossbb.ll | 39 +- .../CodeGen/RISCV/rvv/vsetvli-regression.ll | 5 +- 36 files changed, 1107 insertions(+), 1166 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 5e6b7891449fec..7de48d8218f06e 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -72,7 +72,9 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { // are reserved for HINT instructions. const MCInstrDesc &Desc = MI.getDesc(); if (!Desc.mayLoad() && !Desc.mayStore() && - !Desc.hasUnmodeledSideEffects()) + !Desc.hasUnmodeledSideEffects() && + MI.getOpcode() != RISCV::PseudoVSETVLI && + MI.getOpcode() != RISCV::PseudoVSETIVLI) continue; // For PseudoVSETVLIX0, Rd = X0 has special meaning. if (MI.getOpcode() == RISCV::PseudoVSETVLIX0) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4adc26f6289144..317a6d7d4c52f3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6181,7 +6181,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 0, //===----------------------------------------------------------------------===// // Pseudos. -let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in { // Due to rs1=X0 having special meaning, we need a GPRNoX0 register class for // the when we aren't using one of the special X0 encodings. 
Otherwise it could // be accidentally be made X0 by MachineIR optimizations. To satisfy the diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll index 187f758b780204..0a7fa38b0c8abb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -236,11 +236,12 @@ define fastcc @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32 ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill @@ -248,29 +249,40 @@ define fastcc @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: vl8re32.v v8, (a3) -; CHECK-NEXT: addi a3, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; CHECK-NEXT: add a1, a0, a1 -; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vl8re32.v v8, (a1) -; CHECK-NEXT: vl8re32.v v16, (a2) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v0, (a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vl8re32.v v8, (a3) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vl8re32.v v16, (a2) ; CHECK-NEXT: vadd.vv v0, v24, v0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v24, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v24, v24, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v8, v24, v8 ; CHECK-NEXT: vadd.vv v24, v0, v16 ; CHECK-NEXT: vadd.vx v16, v8, a4 ; CHECK-NEXT: vadd.vx v8, v24, a4 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll index 647d3158b6167f..fa62143546df60 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll @@ -39,11 +39,11 @@ define @caller_scalable_vector_split_indirect( @caller_scalable_vector_split_indirect( @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, 
zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x half> %r @@ -42,9 +42,9 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x half> @llvm.experimental.constrained.nearbyint.v4f16(<4 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x half> %r @@ -65,9 +65,9 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x half> @llvm.experimental.constrained.nearbyint.v8f16(<8 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x half> %r @@ -88,9 +88,9 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <16 x half> @llvm.experimental.constrained.nearbyint.v16f16(<16 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x half> %r @@ -112,9 +112,9 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <32 x half> @llvm.experimental.constrained.nearbyint.v32f16(<32 x half> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <32 x half> %r @@ -135,9 +135,9 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x float> %r @@ -158,9 +158,9 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x float> %r @@ -181,9 +181,9 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, 
e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x float> @llvm.experimental.constrained.nearbyint.v8f32(<8 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x float> %r @@ -204,9 +204,9 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <16 x float> @llvm.experimental.constrained.nearbyint.v16f32(<16 x float> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <16 x float> %r @@ -227,9 +227,9 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x double> %r @@ -250,9 +250,9 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x double> %r @@ -273,9 +273,9 @@ define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x double> %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll index a8e4af2d7368e8..6320b07125bb0b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll @@ -359,13 +359,13 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: feq.d a0, fa3, fa3 ; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV32-NEXT: fld fa2, 40(sp) ; RV32-NEXT: fcvt.w.d a2, fa3, rtz +; RV32-NEXT: fld fa3, 40(sp) ; RV32-NEXT: neg a0, a0 ; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: feq.d a2, fa2, fa2 -; RV32-NEXT: fmax.d fa3, fa2, fa5 +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV32-NEXT: feq.d a2, fa3, fa3 +; RV32-NEXT: fmax.d fa3, fa3, fa5 ; RV32-NEXT: fmin.d fa3, fa3, fa4 ; RV32-NEXT: fcvt.w.d a3, fa3, rtz ; RV32-NEXT: fld fa3, 32(sp) @@ -460,13 +460,13 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: feq.d a0, fa3, fa3 ; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fcvt.l.d a2, fa3, rtz +; RV64-NEXT: 
fld fa3, 40(sp) ; RV64-NEXT: neg a0, a0 ; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: feq.d a2, fa2, fa2 -; RV64-NEXT: fmax.d fa3, fa2, fa5 +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; RV64-NEXT: feq.d a2, fa3, fa3 +; RV64-NEXT: fmax.d fa3, fa3, fa5 ; RV64-NEXT: fmin.d fa3, fa3, fa4 ; RV64-NEXT: fcvt.l.d a3, fa3, rtz ; RV64-NEXT: fld fa3, 32(sp) @@ -557,7 +557,6 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: vslidedown.vi v8, v8, 3 ; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: fld fa2, 40(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 ; RV32-NEXT: fcvt.wu.d a0, fa4, rtz @@ -566,9 +565,10 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV32-NEXT: fmin.d fa2, fa2, fa5 ; RV32-NEXT: fcvt.wu.d a2, fa2, rtz ; RV32-NEXT: fmax.d fa4, fa4, fa3 -; RV32-NEXT: fld fa2, 48(sp) ; RV32-NEXT: fmin.d fa4, fa4, fa5 +; RV32-NEXT: fld fa2, 48(sp) ; RV32-NEXT: fcvt.wu.d a3, fa4, rtz +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV32-NEXT: vslide1down.vx v8, v10, a0 ; RV32-NEXT: fmax.d fa4, fa2, fa3 ; RV32-NEXT: fmin.d fa4, fa4, fa5 @@ -633,7 +633,6 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: vslidedown.vi v8, v8, 3 ; RV64-NEXT: vfmv.f.s fa4, v8 ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: fld fa2, 40(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 ; RV64-NEXT: fcvt.lu.d a0, fa4, rtz @@ -642,9 +641,10 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) { ; RV64-NEXT: fmin.d fa2, fa2, fa5 ; RV64-NEXT: fcvt.lu.d a2, fa2, rtz ; RV64-NEXT: fmax.d fa4, fa4, fa3 -; RV64-NEXT: fld fa2, 48(sp) ; RV64-NEXT: fmin.d fa4, fa4, fa5 +; RV64-NEXT: fld fa2, 48(sp) ; RV64-NEXT: fcvt.lu.d a3, fa4, rtz +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; RV64-NEXT: vslide1down.vx v8, v10, a0 ; RV64-NEXT: fmax.d fa4, fa2, fa3 ; RV64-NEXT: fmin.d fa4, fa4, fa5 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll index 6ffa6ac250ed7f..9c76b83d0974af 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll @@ -132,12 +132,12 @@ define <3 x float> @si2fp_v3i1_v3f32(<3 x i1> %x) { define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ZVFH32-LABEL: si2fp_v3i7_v3f32: ; ZVFH32: # %bb.0: -; ZVFH32-NEXT: lw a1, 4(a0) -; ZVFH32-NEXT: lw a2, 0(a0) -; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: lw a1, 0(a0) +; ZVFH32-NEXT: lw a2, 4(a0) ; ZVFH32-NEXT: lw a0, 8(a0) -; ZVFH32-NEXT: vmv.v.x v8, a2 -; ZVFH32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: vmv.v.x v8, a1 +; ZVFH32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH32-NEXT: vadd.vv v8, v8, v8 @@ -149,12 +149,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFH64-LABEL: si2fp_v3i7_v3f32: ; ZVFH64: # %bb.0: -; ZVFH64-NEXT: ld a1, 8(a0) -; ZVFH64-NEXT: ld a2, 0(a0) -; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: ld a1, 0(a0) +; ZVFH64-NEXT: ld a2, 8(a0) ; ZVFH64-NEXT: ld a0, 16(a0) -; ZVFH64-NEXT: vmv.v.x v8, a2 -; ZVFH64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: vmv.v.x v8, a1 +; ZVFH64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH64-NEXT: vadd.vv v8, v8, v8 @@ -166,12 +166,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x 
i7> %x) { ; ; ZVFHMIN32-LABEL: si2fp_v3i7_v3f32: ; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: lw a1, 4(a0) -; ZVFHMIN32-NEXT: lw a2, 0(a0) -; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: lw a1, 0(a0) +; ZVFHMIN32-NEXT: lw a2, 4(a0) ; ZVFHMIN32-NEXT: lw a0, 8(a0) -; ZVFHMIN32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN32-NEXT: vadd.vv v8, v8, v8 @@ -183,12 +183,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN64-LABEL: si2fp_v3i7_v3f32: ; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: ld a1, 8(a0) -; ZVFHMIN64-NEXT: ld a2, 0(a0) -; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: ld a1, 0(a0) +; ZVFHMIN64-NEXT: ld a2, 8(a0) ; ZVFHMIN64-NEXT: ld a0, 16(a0) -; ZVFHMIN64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: vmv.v.x v8, a1 +; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN64-NEXT: vadd.vv v8, v8, v8 @@ -205,12 +205,12 @@ define <3 x float> @si2fp_v3i7_v3f32(<3 x i7> %x) { define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ZVFH32-LABEL: ui2fp_v3i7_v3f32: ; ZVFH32: # %bb.0: -; ZVFH32-NEXT: lw a1, 4(a0) -; ZVFH32-NEXT: lw a2, 0(a0) -; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: lw a1, 0(a0) +; ZVFH32-NEXT: lw a2, 4(a0) ; ZVFH32-NEXT: lw a0, 8(a0) -; ZVFH32-NEXT: vmv.v.x v8, a2 -; ZVFH32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH32-NEXT: vmv.v.x v8, a1 +; ZVFH32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH32-NEXT: li a0, 127 @@ -222,12 +222,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFH64-LABEL: ui2fp_v3i7_v3f32: ; ZVFH64: # %bb.0: -; ZVFH64-NEXT: ld a1, 8(a0) -; ZVFH64-NEXT: ld a2, 0(a0) -; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: ld a1, 0(a0) +; ZVFH64-NEXT: ld a2, 8(a0) ; ZVFH64-NEXT: ld a0, 16(a0) -; ZVFH64-NEXT: vmv.v.x v8, a2 -; ZVFH64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFH64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFH64-NEXT: vmv.v.x v8, a1 +; ZVFH64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFH64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFH64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFH64-NEXT: li a0, 127 @@ -239,12 +239,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN32-LABEL: ui2fp_v3i7_v3f32: ; ZVFHMIN32: # %bb.0: -; ZVFHMIN32-NEXT: lw a1, 4(a0) -; ZVFHMIN32-NEXT: lw a2, 0(a0) -; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: lw a1, 0(a0) +; ZVFHMIN32-NEXT: lw a2, 4(a0) ; ZVFHMIN32-NEXT: lw a0, 8(a0) -; ZVFHMIN32-NEXT: vmv.v.x v8, a2 -; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN32-NEXT: vmv.v.x v8, a1 +; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN32-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN32-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN32-NEXT: li a0, 127 @@ -256,12 +256,12 @@ define <3 x float> @ui2fp_v3i7_v3f32(<3 x i7> %x) { ; ; ZVFHMIN64-LABEL: ui2fp_v3i7_v3f32: ; ZVFHMIN64: # %bb.0: -; ZVFHMIN64-NEXT: ld a1, 8(a0) -; ZVFHMIN64-NEXT: ld a2, 0(a0) -; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: ld a1, 
0(a0) +; ZVFHMIN64-NEXT: ld a2, 8(a0) ; ZVFHMIN64-NEXT: ld a0, 16(a0) -; ZVFHMIN64-NEXT: vmv.v.x v8, a2 -; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a1 +; ZVFHMIN64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVFHMIN64-NEXT: vmv.v.x v8, a1 +; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a2 ; ZVFHMIN64-NEXT: vslide1down.vx v8, v8, a0 ; ZVFHMIN64-NEXT: vslidedown.vi v8, v8, 1 ; ZVFHMIN64-NEXT: li a0, 127 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 592ce6fc5be0bd..4f4f0a09de7484 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -1183,38 +1183,38 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_loads_contigous: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 1(a0) -; CHECK-NEXT: lbu a2, 2(a0) -; CHECK-NEXT: lbu a3, 3(a0) -; CHECK-NEXT: lbu a4, 4(a0) -; CHECK-NEXT: lbu a5, 5(a0) -; CHECK-NEXT: lbu a6, 6(a0) -; CHECK-NEXT: lbu a7, 7(a0) -; CHECK-NEXT: lbu t0, 9(a0) -; CHECK-NEXT: lbu t1, 10(a0) -; CHECK-NEXT: lbu t2, 11(a0) -; CHECK-NEXT: lbu t3, 12(a0) -; CHECK-NEXT: lbu t4, 13(a0) -; CHECK-NEXT: lbu t5, 14(a0) -; CHECK-NEXT: lbu t6, 15(a0) +; CHECK-NEXT: addi a1, a0, 8 +; CHECK-NEXT: lbu a2, 1(a0) +; CHECK-NEXT: lbu a3, 2(a0) +; CHECK-NEXT: lbu a4, 3(a0) +; CHECK-NEXT: lbu a5, 4(a0) +; CHECK-NEXT: lbu a6, 5(a0) +; CHECK-NEXT: lbu a7, 6(a0) +; CHECK-NEXT: lbu t0, 7(a0) +; CHECK-NEXT: lbu t1, 9(a0) +; CHECK-NEXT: lbu t2, 10(a0) +; CHECK-NEXT: lbu t3, 11(a0) +; CHECK-NEXT: lbu t4, 12(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero -; CHECK-NEXT: addi a0, a0, 8 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: lbu t5, 13(a0) +; CHECK-NEXT: lbu t6, 14(a0) +; CHECK-NEXT: lbu a0, 15(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vlse8.v v9, (a0), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vlse8.v v9, (a1), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v10, v8, a7 -; CHECK-NEXT: vslide1down.vx v8, v9, t0 -; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v10, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v9, t1 ; CHECK-NEXT: vslide1down.vx v8, v8, t2 ; CHECK-NEXT: vslide1down.vx v8, v8, t3 ; CHECK-NEXT: vslide1down.vx v8, v8, t4 ; CHECK-NEXT: vslide1down.vx v8, v8, t5 ; CHECK-NEXT: vslide1down.vx v8, v8, t6 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 @@ -1277,38 +1277,38 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_loads_gather: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 1(a0) -; CHECK-NEXT: lbu a2, 22(a0) -; CHECK-NEXT: lbu a3, 31(a0) -; CHECK-NEXT: lbu a4, 44(a0) -; CHECK-NEXT: lbu a5, 55(a0) -; CHECK-NEXT: lbu a6, 623(a0) -; CHECK-NEXT: lbu a7, 75(a0) -; CHECK-NEXT: lbu t0, 93(a0) -; CHECK-NEXT: lbu t1, 105(a0) -; CHECK-NEXT: lbu t2, 161(a0) -; CHECK-NEXT: lbu t3, 124(a0) -; CHECK-NEXT: lbu t4, 163(a0) -; CHECK-NEXT: lbu t5, 144(a0) -; CHECK-NEXT: lbu t6, 154(a0) +; CHECK-NEXT: addi a1, a0, 82 +; CHECK-NEXT: lbu a2, 1(a0) +; CHECK-NEXT: lbu a3, 22(a0) +; CHECK-NEXT: lbu a4, 31(a0) 
+; CHECK-NEXT: lbu a5, 44(a0) +; CHECK-NEXT: lbu a6, 55(a0) +; CHECK-NEXT: lbu a7, 623(a0) +; CHECK-NEXT: lbu t0, 75(a0) +; CHECK-NEXT: lbu t1, 93(a0) +; CHECK-NEXT: lbu t2, 105(a0) +; CHECK-NEXT: lbu t3, 161(a0) +; CHECK-NEXT: lbu t4, 124(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero -; CHECK-NEXT: addi a0, a0, 82 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 +; CHECK-NEXT: lbu t5, 163(a0) +; CHECK-NEXT: lbu t6, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vlse8.v v9, (a0), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vlse8.v v9, (a1), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v10, v8, a7 -; CHECK-NEXT: vslide1down.vx v8, v9, t0 -; CHECK-NEXT: vslide1down.vx v8, v8, t1 +; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v10, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v9, t1 ; CHECK-NEXT: vslide1down.vx v8, v8, t2 ; CHECK-NEXT: vslide1down.vx v8, v8, t3 ; CHECK-NEXT: vslide1down.vx v8, v8, t4 ; CHECK-NEXT: vslide1down.vx v8, v8, t5 ; CHECK-NEXT: vslide1down.vx v8, v8, t6 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 @@ -1375,17 +1375,17 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) { ; CHECK-NEXT: lbu a3, 105(a0) ; CHECK-NEXT: lbu a4, 161(a0) ; CHECK-NEXT: lbu a5, 124(a0) -; CHECK-NEXT: lbu a6, 163(a0) -; CHECK-NEXT: lbu a7, 144(a0) -; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: lbu a1, 163(a0) +; CHECK-NEXT: lbu a6, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 ; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v8, v8, a7 ; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: ret %p9 = getelementptr i8, ptr %p, i32 82 @@ -1424,18 +1424,18 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) { ; CHECK-NEXT: lbu a2, 22(a0) ; CHECK-NEXT: lbu a3, 31(a0) ; CHECK-NEXT: lbu a4, 44(a0) -; CHECK-NEXT: lbu a5, 55(a0) -; CHECK-NEXT: lbu a6, 623(a0) -; CHECK-NEXT: lbu a7, 75(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero +; CHECK-NEXT: lbu a5, 55(a0) +; CHECK-NEXT: lbu a6, 623(a0) +; CHECK-NEXT: lbu a0, 75(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 ; CHECK-NEXT: vslide1down.vx v8, v8, a4 ; CHECK-NEXT: vslide1down.vx v8, v8, a5 ; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 8 ; CHECK-NEXT: ret %p2 = getelementptr i8, ptr %p, i32 1 @@ -1470,24 +1470,24 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_undef_edges: ; CHECK: # %bb.0: ; CHECK-NEXT: addi a1, a0, 31 -; CHECK-NEXT: lbu a2, 44(a0) -; CHECK-NEXT: lbu a3, 55(a0) -; CHECK-NEXT: lbu a4, 623(a0) -; CHECK-NEXT: lbu a5, 75(a0) -; CHECK-NEXT: lbu a6, 93(a0) -; CHECK-NEXT: lbu a7, 105(a0) -; CHECK-NEXT: lbu t0, 161(a0) +; CHECK-NEXT: addi a2, a0, 82 +; CHECK-NEXT: lbu a3, 44(a0) +; CHECK-NEXT: lbu a4, 55(a0) +; CHECK-NEXT: lbu a5, 623(a0) +; 
CHECK-NEXT: lbu a6, 75(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a1), zero -; CHECK-NEXT: addi a0, a0, 82 -; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vlse8.v v9, (a0), zero +; CHECK-NEXT: lbu a1, 93(a0) +; CHECK-NEXT: lbu a7, 105(a0) +; CHECK-NEXT: lbu a0, 161(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vlse8.v v9, (a2), zero ; CHECK-NEXT: vslide1down.vx v8, v8, a4 -; CHECK-NEXT: vslide1down.vx v10, v8, a5 -; CHECK-NEXT: vslide1down.vx v8, v9, a6 +; CHECK-NEXT: vslide1down.vx v8, v8, a5 +; CHECK-NEXT: vslide1down.vx v10, v8, a6 +; CHECK-NEXT: vslide1down.vx v8, v9, a1 ; CHECK-NEXT: vslide1down.vx v8, v8, a7 -; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 4 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -1530,30 +1530,30 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) { define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; CHECK-LABEL: buildvec_v16i8_loads_undef_scattered: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 1(a0) -; CHECK-NEXT: lbu a2, 44(a0) -; CHECK-NEXT: lbu a3, 55(a0) -; CHECK-NEXT: lbu a4, 75(a0) -; CHECK-NEXT: lbu a5, 93(a0) -; CHECK-NEXT: lbu a6, 124(a0) -; CHECK-NEXT: lbu a7, 144(a0) -; CHECK-NEXT: lbu t0, 154(a0) +; CHECK-NEXT: addi a1, a0, 82 +; CHECK-NEXT: lbu a2, 1(a0) +; CHECK-NEXT: lbu a3, 44(a0) +; CHECK-NEXT: lbu a4, 55(a0) +; CHECK-NEXT: lbu a5, 75(a0) +; CHECK-NEXT: lbu a6, 93(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v8, (a0), zero -; CHECK-NEXT: addi a0, a0, 82 -; CHECK-NEXT: vslide1down.vx v8, v8, a1 -; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: lbu a7, 124(a0) +; CHECK-NEXT: lbu t0, 144(a0) +; CHECK-NEXT: lbu a0, 154(a0) ; CHECK-NEXT: vslide1down.vx v8, v8, a2 -; CHECK-NEXT: vlse8.v v9, (a0), zero +; CHECK-NEXT: vslidedown.vi v8, v8, 2 ; CHECK-NEXT: vslide1down.vx v8, v8, a3 +; CHECK-NEXT: vlse8.v v9, (a1), zero +; CHECK-NEXT: vslide1down.vx v8, v8, a4 ; CHECK-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-NEXT: vslide1down.vx v10, v8, a4 -; CHECK-NEXT: vslide1down.vx v8, v9, a5 +; CHECK-NEXT: vslide1down.vx v10, v8, a5 +; CHECK-NEXT: vslide1down.vx v8, v9, a6 ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vslide1down.vx v8, v8, a6 -; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vslide1down.vx v8, v8, a7 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vslide1down.vx v8, v8, t0 +; CHECK-NEXT: vslide1down.vx v8, v8, a0 ; CHECK-NEXT: li a0, 255 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll index 8acc70faaa1fc9..eb95d86e34045b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll @@ -7,25 +7,23 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-LABEL: load_large_vector: ; ZVE32X: # %bb.0: -; ZVE32X-NEXT: ld a1, 80(a0) -; ZVE32X-NEXT: ld a2, 72(a0) -; ZVE32X-NEXT: ld a3, 56(a0) -; ZVE32X-NEXT: ld a4, 32(a0) -; ZVE32X-NEXT: ld a5, 24(a0) -; ZVE32X-NEXT: ld a6, 48(a0) -; ZVE32X-NEXT: ld a7, 8(a0) -; ZVE32X-NEXT: ld a0, 0(a0) -; ZVE32X-NEXT: xor a4, a5, a4 -; ZVE32X-NEXT: snez a4, a4 +; ZVE32X-NEXT: ld a1, 56(a0) +; ZVE32X-NEXT: ld a2, 32(a0) +; ZVE32X-NEXT: ld a3, 24(a0) +; ZVE32X-NEXT: ld a4, 48(a0) +; ZVE32X-NEXT: ld a5, 8(a0) +; 
ZVE32X-NEXT: ld a6, 0(a0) +; ZVE32X-NEXT: xor a2, a3, a2 +; ZVE32X-NEXT: snez a2, a2 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; ZVE32X-NEXT: vmv.s.x v8, a4 +; ZVE32X-NEXT: vmv.s.x v8, a2 ; ZVE32X-NEXT: vand.vi v8, v8, 1 ; ZVE32X-NEXT: vmsne.vi v0, v8, 0 ; ZVE32X-NEXT: vmv.s.x v8, zero ; ZVE32X-NEXT: vmerge.vim v9, v8, 1, v0 -; ZVE32X-NEXT: xor a0, a0, a7 -; ZVE32X-NEXT: snez a0, a0 -; ZVE32X-NEXT: vmv.s.x v10, a0 +; ZVE32X-NEXT: xor a2, a6, a5 +; ZVE32X-NEXT: snez a2, a2 +; ZVE32X-NEXT: vmv.s.x v10, a2 ; ZVE32X-NEXT: vand.vi v10, v10, 1 ; ZVE32X-NEXT: vmsne.vi v0, v10, 0 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma @@ -35,21 +33,23 @@ define <4 x i1> @load_large_vector(ptr %p) { ; ZVE32X-NEXT: vslideup.vi v11, v9, 1 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v11, 0 +; ZVE32X-NEXT: ld a2, 80(a0) ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0 -; ZVE32X-NEXT: xor a0, a6, a3 -; ZVE32X-NEXT: snez a0, a0 -; ZVE32X-NEXT: vmv.s.x v11, a0 +; ZVE32X-NEXT: xor a1, a4, a1 +; ZVE32X-NEXT: snez a1, a1 +; ZVE32X-NEXT: vmv.s.x v11, a1 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v11, v11, 1 ; ZVE32X-NEXT: vmsne.vi v0, v11, 0 +; ZVE32X-NEXT: ld a0, 72(a0) ; ZVE32X-NEXT: vmerge.vim v11, v8, 1, v0 ; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma ; ZVE32X-NEXT: vslideup.vi v9, v11, 2 ; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; ZVE32X-NEXT: vmsne.vi v0, v9, 0 ; ZVE32X-NEXT: vmerge.vim v9, v10, 1, v0 -; ZVE32X-NEXT: xor a1, a2, a1 -; ZVE32X-NEXT: snez a0, a1 +; ZVE32X-NEXT: xor a0, a0, a2 +; ZVE32X-NEXT: snez a0, a0 ; ZVE32X-NEXT: vmv.s.x v10, a0 ; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma ; ZVE32X-NEXT: vand.vi v10, v10, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll index 35baa6808db603..e2075e074179c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll @@ -812,14 +812,14 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV32-NEXT: vslide1down.vx v10, v10, a0 ; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV32-NEXT: vslidedown.vi v8, v8, 3 -; RV32-NEXT: vfmv.f.s fa5, v8 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: fld fa4, 32(sp) +; RV32-NEXT: fld fa5, 32(sp) +; RV32-NEXT: vfmv.f.s fa4, v8 ; RV32-NEXT: fld fa3, 40(sp) -; RV32-NEXT: fcvt.w.d a0, fa5 +; RV32-NEXT: fcvt.w.d a0, fa4 +; RV32-NEXT: fcvt.w.d a1, fa5 ; RV32-NEXT: fld fa5, 48(sp) -; RV32-NEXT: fcvt.w.d a1, fa4 ; RV32-NEXT: fcvt.w.d a2, fa3 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vslide1down.vx v8, v10, a0 ; RV32-NEXT: fcvt.w.d a0, fa5 ; RV32-NEXT: fld fa5, 56(sp) @@ -865,14 +865,14 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) { ; RV64-i32-NEXT: vslide1down.vx v10, v10, a0 ; RV64-i32-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-i32-NEXT: vslidedown.vi v8, v8, 3 -; RV64-i32-NEXT: vfmv.f.s fa5, v8 -; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-i32-NEXT: fld fa4, 32(sp) +; RV64-i32-NEXT: fld fa5, 32(sp) +; RV64-i32-NEXT: vfmv.f.s fa4, v8 ; RV64-i32-NEXT: fld fa3, 40(sp) -; RV64-i32-NEXT: fcvt.l.d a0, fa5 +; RV64-i32-NEXT: fcvt.l.d a0, fa4 +; RV64-i32-NEXT: fcvt.l.d a1, fa5 ; RV64-i32-NEXT: fld fa5, 48(sp) -; RV64-i32-NEXT: fcvt.l.d a1, fa4 ; RV64-i32-NEXT: fcvt.l.d a2, fa3 +; RV64-i32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-i32-NEXT: vslide1down.vx v8, v10, a0 ; RV64-i32-NEXT: fcvt.l.d a0, fa5 ; RV64-i32-NEXT: fld fa5, 56(sp) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index f42f32e246585e..08cad29ab1b880 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -519,17 +519,17 @@ define <4 x i8> @mgather_truemask_v4i8(<4 x ptr> %ptrs, <4 x i8> %passthru) { ; RV64ZVE32F-LABEL: mgather_truemask_v4i8: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: lbu a1, 0(a1) -; RV64ZVE32F-NEXT: lbu a2, 0(a2) -; RV64ZVE32F-NEXT: lbu a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vlse8.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse8.v v8, (a2), zero +; RV64ZVE32F-NEXT: lbu a0, 0(a0) +; RV64ZVE32F-NEXT: lbu a2, 0(a3) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 1), <4 x i8> %passthru) ret <4 x i8> %v @@ -1208,17 +1208,17 @@ define <4 x i16> @mgather_truemask_v4i16(<4 x ptr> %ptrs, <4 x i16> %passthru) { ; RV64ZVE32F-LABEL: mgather_truemask_v4i16: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: lh a1, 0(a1) -; RV64ZVE32F-NEXT: lh a2, 0(a2) -; RV64ZVE32F-NEXT: lh a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a0, 0(a0) +; RV64ZVE32F-NEXT: lh a2, 0(a3) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1), <4 x i16> %passthru) ret <4 x i16> %v @@ -2257,17 +2257,17 @@ define <4 x i32> @mgather_truemask_v4i32(<4 x ptr> %ptrs, <4 x i32> %passthru) { ; RV64ZVE32F-LABEL: mgather_truemask_v4i32: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: lw a1, 0(a1) -; RV64ZVE32F-NEXT: lw a2, 0(a2) -; RV64ZVE32F-NEXT: lw a3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse32.v v8, (a2), zero +; RV64ZVE32F-NEXT: lw a0, 0(a0) +; RV64ZVE32F-NEXT: lw a2, 0(a3) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32F-NEXT: ret %v = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1), <4 x i32> %passthru) ret <4 x i32> %v @@ -6589,16 +6589,16 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m, ; RV32ZVE32F-NEXT: lw a4, 56(a2) ; RV32ZVE32F-NEXT: lw a5, 48(a2) ; RV32ZVE32F-NEXT: lw a6, 40(a2) -; 
RV32ZVE32F-NEXT: lw a7, 32(a2) -; RV32ZVE32F-NEXT: lw t0, 24(a2) -; RV32ZVE32F-NEXT: lw t1, 16(a2) -; RV32ZVE32F-NEXT: lw t2, 8(a2) +; RV32ZVE32F-NEXT: lw a7, 8(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t2 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: lw t0, 16(a2) +; RV32ZVE32F-NEXT: lw t1, 24(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 @@ -7017,14 +7017,14 @@ define <4 x half> @mgather_truemask_v4f16(<4 x ptr> %ptrs, <4 x half> %passthru) ; RV64ZVE32F-LABEL: mgather_truemask_v4f16: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: flh fa5, 0(a1) -; RV64ZVE32F-NEXT: flh fa4, 0(a2) -; RV64ZVE32F-NEXT: flh fa3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: flh fa4, 0(a0) +; RV64ZVE32F-NEXT: flh fa3, 0(a3) ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa5 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa4 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa3 @@ -7940,14 +7940,14 @@ define <4 x float> @mgather_truemask_v4f32(<4 x ptr> %ptrs, <4 x float> %passthr ; RV64ZVE32F-LABEL: mgather_truemask_v4f32: ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: ld a1, 8(a0) -; RV64ZVE32F-NEXT: ld a2, 16(a0) +; RV64ZVE32F-NEXT: ld a2, 0(a0) ; RV64ZVE32F-NEXT: ld a3, 24(a0) -; RV64ZVE32F-NEXT: ld a0, 0(a0) +; RV64ZVE32F-NEXT: ld a0, 16(a0) ; RV64ZVE32F-NEXT: flw fa5, 0(a1) -; RV64ZVE32F-NEXT: flw fa4, 0(a2) -; RV64ZVE32F-NEXT: flw fa3, 0(a3) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: vlse32.v v8, (a2), zero +; RV64ZVE32F-NEXT: flw fa4, 0(a0) +; RV64ZVE32F-NEXT: flw fa3, 0(a3) ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa5 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa4 ; RV64ZVE32F-NEXT: vfslide1down.vf v8, v8, fa3 @@ -11632,16 +11632,16 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1> ; RV32ZVE32F-NEXT: lw a3, 56(a2) ; RV32ZVE32F-NEXT: lw a4, 48(a2) ; RV32ZVE32F-NEXT: lw a5, 40(a2) -; RV32ZVE32F-NEXT: lw a6, 32(a2) -; RV32ZVE32F-NEXT: lw a7, 24(a2) -; RV32ZVE32F-NEXT: lw t0, 16(a2) -; RV32ZVE32F-NEXT: lw t1, 8(a2) +; RV32ZVE32F-NEXT: lw a6, 8(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t1 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: lw a7, 16(a2) +; RV32ZVE32F-NEXT: lw t0, 24(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 @@ -12881,22 +12881,22 @@ define <8 x i16> 
@mgather_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 8(a0) -; RV64ZVE32F-NEXT: lh a3, 10(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 24(a0) -; RV64ZVE32F-NEXT: lh a6, 26(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 8(a0) +; RV64ZVE32F-NEXT: lh a4, 10(a0) +; RV64ZVE32F-NEXT: lh a5, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 16 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 24(a0) +; RV64ZVE32F-NEXT: lh a0, 26(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -12925,23 +12925,23 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_strided_2xSEW_with_offset: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 4 -; RV64ZVE32F-NEXT: lh a2, 6(a0) -; RV64ZVE32F-NEXT: lh a3, 12(a0) -; RV64ZVE32F-NEXT: lh a4, 14(a0) -; RV64ZVE32F-NEXT: lh a5, 22(a0) -; RV64ZVE32F-NEXT: lh a6, 28(a0) -; RV64ZVE32F-NEXT: lh a7, 30(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 20 +; RV64ZVE32F-NEXT: addi a2, a0, 4 +; RV64ZVE32F-NEXT: lh a3, 6(a0) +; RV64ZVE32F-NEXT: lh a4, 12(a0) +; RV64ZVE32F-NEXT: lh a5, 14(a0) +; RV64ZVE32F-NEXT: lh a6, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 20 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 28(a0) +; RV64ZVE32F-NEXT: lh a0, 30(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -12970,23 +12970,23 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_reverse_unit_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 28 -; RV64ZVE32F-NEXT: lh a2, 30(a0) -; RV64ZVE32F-NEXT: lh a3, 24(a0) -; RV64ZVE32F-NEXT: lh a4, 26(a0) -; RV64ZVE32F-NEXT: lh a5, 22(a0) -; RV64ZVE32F-NEXT: lh a6, 16(a0) -; RV64ZVE32F-NEXT: lh a7, 18(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 20 +; RV64ZVE32F-NEXT: addi a2, a0, 28 +; RV64ZVE32F-NEXT: lh a3, 30(a0) +; RV64ZVE32F-NEXT: lh a4, 24(a0) +; RV64ZVE32F-NEXT: lh a5, 26(a0) +; RV64ZVE32F-NEXT: lh a6, 22(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, 
e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 20 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 16(a0) +; RV64ZVE32F-NEXT: lh a0, 18(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13015,23 +13015,23 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_reverse_strided_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 28 -; RV64ZVE32F-NEXT: lh a2, 30(a0) -; RV64ZVE32F-NEXT: lh a3, 20(a0) -; RV64ZVE32F-NEXT: lh a4, 22(a0) -; RV64ZVE32F-NEXT: lh a5, 14(a0) -; RV64ZVE32F-NEXT: lh a6, 4(a0) -; RV64ZVE32F-NEXT: lh a7, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 12 +; RV64ZVE32F-NEXT: addi a2, a0, 28 +; RV64ZVE32F-NEXT: lh a3, 30(a0) +; RV64ZVE32F-NEXT: lh a4, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 22(a0) +; RV64ZVE32F-NEXT: lh a6, 14(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 12 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13059,22 +13059,22 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 16(a0) -; RV64ZVE32F-NEXT: lh a3, 18(a0) -; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 16(a0) +; RV64ZVE32F-NEXT: lh a4, 18(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; 
RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13105,22 +13105,22 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 18(a0) -; RV64ZVE32F-NEXT: lh a3, 20(a0) -; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 18(a0) +; RV64ZVE32F-NEXT: lh a4, 20(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13152,22 +13152,22 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: addi a1, a0, 2 -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 18(a0) -; RV64ZVE32F-NEXT: lh a4, 20(a0) -; RV64ZVE32F-NEXT: lh a5, 10(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: addi a2, a0, 2 +; RV64ZVE32F-NEXT: lh a3, 4(a0) +; RV64ZVE32F-NEXT: lh a4, 18(a0) +; RV64ZVE32F-NEXT: lh a5, 20(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV64ZVE32F-NEXT: vlse16.v v8, (a1), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32F-NEXT: vlse16.v v8, (a2), zero +; RV64ZVE32F-NEXT: lh a2, 10(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a2 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13202,22 +13202,22 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_4xSEW: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 6(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 20(a0) -; RV64ZVE32F-NEXT: lh a6, 22(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 16 +; 
RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 4(a0) +; RV64ZVE32F-NEXT: lh a4, 6(a0) +; RV64ZVE32F-NEXT: lh a5, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 16 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 20(a0) +; RV64ZVE32F-NEXT: lh a0, 22(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13249,22 +13249,22 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_gather_4xSEW_partial_align: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 2(a0) -; RV64ZVE32F-NEXT: lh a2, 4(a0) -; RV64ZVE32F-NEXT: lh a3, 6(a0) -; RV64ZVE32F-NEXT: lh a4, 18(a0) -; RV64ZVE32F-NEXT: lh a5, 20(a0) -; RV64ZVE32F-NEXT: lh a6, 22(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 16 +; RV64ZVE32F-NEXT: lh a2, 2(a0) +; RV64ZVE32F-NEXT: lh a3, 4(a0) +; RV64ZVE32F-NEXT: lh a4, 6(a0) +; RV64ZVE32F-NEXT: lh a5, 18(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 16 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 20(a0) +; RV64ZVE32F-NEXT: lh a0, 22(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13305,22 +13305,22 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_shuffle_rotate: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 10(a0) -; RV64ZVE32F-NEXT: lh a2, 12(a0) -; RV64ZVE32F-NEXT: lh a3, 14(a0) -; RV64ZVE32F-NEXT: lh a4, 2(a0) -; RV64ZVE32F-NEXT: lh a5, 4(a0) -; RV64ZVE32F-NEXT: lh a6, 6(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 10(a0) +; RV64ZVE32F-NEXT: lh a3, 12(a0) +; RV64ZVE32F-NEXT: lh a4, 14(a0) +; RV64ZVE32F-NEXT: lh a5, 2(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVE32F-NEXT: lh a6, 4(a0) +; RV64ZVE32F-NEXT: lh a0, 6(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 -; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a1 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a2 ; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v9, v9, a4 ; 
RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13352,22 +13352,22 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) { ; ; RV64ZVE32F-LABEL: mgather_shuffle_vrgather: ; RV64ZVE32F: # %bb.0: -; RV64ZVE32F-NEXT: lh a1, 4(a0) -; RV64ZVE32F-NEXT: lh a2, 6(a0) -; RV64ZVE32F-NEXT: lh a3, 2(a0) -; RV64ZVE32F-NEXT: lh a4, 10(a0) -; RV64ZVE32F-NEXT: lh a5, 12(a0) -; RV64ZVE32F-NEXT: lh a6, 14(a0) +; RV64ZVE32F-NEXT: addi a1, a0, 8 +; RV64ZVE32F-NEXT: lh a2, 4(a0) +; RV64ZVE32F-NEXT: lh a3, 6(a0) +; RV64ZVE32F-NEXT: lh a4, 2(a0) +; RV64ZVE32F-NEXT: lh a5, 10(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero -; RV64ZVE32F-NEXT: addi a0, a0, 8 -; RV64ZVE32F-NEXT: vlse16.v v9, (a0), zero -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32F-NEXT: lh a6, 12(a0) +; RV64ZVE32F-NEXT: lh a0, 14(a0) +; RV64ZVE32F-NEXT: vlse16.v v9, (a1), zero ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a3 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a4 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v10, v8, a4 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v9, a5 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: vmv.v.i v0, 15 ; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 4, v0.t ; RV64ZVE32F-NEXT: ret @@ -13853,12 +13853,12 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) { ; RV64ZVE32F: # %bb.0: ; RV64ZVE32F-NEXT: addi a1, a0, 136 ; RV64ZVE32F-NEXT: lw a2, 140(a0) -; RV64ZVE32F-NEXT: lw a3, 0(a0) -; RV64ZVE32F-NEXT: lw a0, 4(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64ZVE32F-NEXT: vlse32.v v8, (a1), zero +; RV64ZVE32F-NEXT: lw a1, 0(a0) +; RV64ZVE32F-NEXT: lw a0, 4(a0) ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a2 -; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32F-NEXT: ret %ptrs = getelementptr i32, ptr %base, <4 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index aa815e18ac1014..42e52436a7da09 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -5598,17 +5598,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i64: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: addi sp, sp, -48 -; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 48 -; RV32ZVE32F-NEXT: sw s0, 44(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s1, 40(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s2, 36(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s3, 32(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s4, 28(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s5, 24(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s6, 20(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s7, 16(sp) # 4-byte Folded Spill -; RV32ZVE32F-NEXT: sw s8, 12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: addi sp, sp, -32 +; RV32ZVE32F-NEXT: .cfi_def_cfa_offset 32 +; RV32ZVE32F-NEXT: sw s0, 28(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s1, 24(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s2, 20(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s3, 16(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s4, 
12(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s5, 8(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: sw s7, 0(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: .cfi_offset s0, -4 ; RV32ZVE32F-NEXT: .cfi_offset s1, -8 ; RV32ZVE32F-NEXT: .cfi_offset s2, -12 @@ -5617,7 +5616,6 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: .cfi_offset s5, -24 ; RV32ZVE32F-NEXT: .cfi_offset s6, -28 ; RV32ZVE32F-NEXT: .cfi_offset s7, -32 -; RV32ZVE32F-NEXT: .cfi_offset s8, -36 ; RV32ZVE32F-NEXT: lw a3, 60(a0) ; RV32ZVE32F-NEXT: lw a4, 56(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) @@ -5635,16 +5633,16 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: lw s2, 56(a2) ; RV32ZVE32F-NEXT: lw s3, 48(a2) ; RV32ZVE32F-NEXT: lw s4, 40(a2) -; RV32ZVE32F-NEXT: lw s5, 32(a2) -; RV32ZVE32F-NEXT: lw s6, 24(a2) -; RV32ZVE32F-NEXT: lw s7, 16(a2) -; RV32ZVE32F-NEXT: lw s8, 8(a2) +; RV32ZVE32F-NEXT: lw s5, 8(a2) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a2), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s8 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 +; RV32ZVE32F-NEXT: lw s6, 16(a2) +; RV32ZVE32F-NEXT: lw s7, 24(a2) +; RV32ZVE32F-NEXT: lw a2, 32(a2) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, s2 @@ -5682,16 +5680,15 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs, ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) ; RV32ZVE32F-NEXT: .LBB51_9: # %else14 -; RV32ZVE32F-NEXT: lw s0, 44(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s1, 40(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s2, 36(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s3, 32(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s4, 28(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s5, 24(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s6, 20(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s7, 16(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: lw s8, 12(sp) # 4-byte Folded Reload -; RV32ZVE32F-NEXT: addi sp, sp, 48 +; RV32ZVE32F-NEXT: lw s0, 28(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s1, 24(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s2, 20(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s3, 16(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s4, 12(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: lw s7, 0(sp) # 4-byte Folded Reload +; RV32ZVE32F-NEXT: addi sp, sp, 32 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.store ; RV32ZVE32F-NEXT: lw a2, 4(a0) @@ -10227,16 +10224,16 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx ; RV32ZVE32F-NEXT: lw a2, 56(a1) ; RV32ZVE32F-NEXT: lw a3, 48(a1) ; RV32ZVE32F-NEXT: lw a4, 40(a1) -; RV32ZVE32F-NEXT: lw a5, 32(a1) -; RV32ZVE32F-NEXT: lw a6, 24(a1) -; RV32ZVE32F-NEXT: lw a7, 16(a1) -; RV32ZVE32F-NEXT: lw t0, 8(a1) +; RV32ZVE32F-NEXT: lw a5, 8(a1) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32ZVE32F-NEXT: vlse32.v v8, (a1), zero -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, t0 -; 
RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 -; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: lw a6, 16(a1) +; RV32ZVE32F-NEXT: lw a7, 24(a1) +; RV32ZVE32F-NEXT: lw a1, 32(a1) ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a5 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a1 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a4 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a3 ; RV32ZVE32F-NEXT: vslide1down.vx v8, v8, a2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index 19f3d3ce19fa4c..7be015e26b0983 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -19,9 +19,9 @@ define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> %m, i32 %evl) ret <2 x half> %v @@ -38,9 +38,9 @@ define <2 x half> @vp_nearbyint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x half> %v @@ -61,9 +61,9 @@ define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> %m, i32 %evl) ret <4 x half> %v @@ -80,9 +80,9 @@ define <4 x half> @vp_nearbyint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x half> %v @@ -103,9 +103,9 @@ define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> %m, i32 %evl) ret <8 x half> %v @@ -122,9 +122,9 @@ define <8 x half> @vp_nearbyint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv 
v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x half> %v @@ -147,9 +147,9 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> %m, i32 %evl) ret <16 x half> %v @@ -166,9 +166,9 @@ define <16 x half> @vp_nearbyint_v16f16_unmasked(<16 x half> %va, i32 zeroext %e ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x half> %v @@ -189,9 +189,9 @@ define <2 x float> @vp_nearbyint_v2f32(<2 x float> %va, <2 x i1> %m, i32 zeroext ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> %m, i32 %evl) ret <2 x float> %v @@ -208,9 +208,9 @@ define <2 x float> @vp_nearbyint_v2f32_unmasked(<2 x float> %va, i32 zeroext %ev ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.nearbyint.v2f32(<2 x float> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x float> %v @@ -231,9 +231,9 @@ define <4 x float> @vp_nearbyint_v4f32(<4 x float> %va, <4 x i1> %m, i32 zeroext ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> %m, i32 %evl) ret <4 x float> %v @@ -250,9 +250,9 @@ define <4 x float> @vp_nearbyint_v4f32_unmasked(<4 x float> %va, i32 zeroext %ev ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.nearbyint.v4f32(<4 x float> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x float> %v @@ -275,9 +275,9 @@ define <8 x float> @vp_nearbyint_v8f32(<8 x float> %va, <8 x i1> %m, i32 zeroext ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x 
i1> %m, i32 %evl) ret <8 x float> %v @@ -294,9 +294,9 @@ define <8 x float> @vp_nearbyint_v8f32_unmasked(<8 x float> %va, i32 zeroext %ev ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.nearbyint.v8f32(<8 x float> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x float> %v @@ -319,9 +319,9 @@ define <16 x float> @vp_nearbyint_v16f32(<16 x float> %va, <16 x i1> %m, i32 zer ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> %m, i32 %evl) ret <16 x float> %v @@ -338,9 +338,9 @@ define <16 x float> @vp_nearbyint_v16f32_unmasked(<16 x float> %va, i32 zeroext ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.nearbyint.v16f32(<16 x float> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x float> %v @@ -361,9 +361,9 @@ define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroe ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v @@ -380,9 +380,9 @@ define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext % ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v @@ -405,9 +405,9 @@ define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroe ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v @@ -424,9 +424,9 @@ define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext % ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v @@ -449,9 +449,9 @@ define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, 
i32 zeroe ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v @@ -468,9 +468,9 @@ define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext % ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v @@ -493,9 +493,9 @@ define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v @@ -512,9 +512,9 @@ define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroex ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v @@ -537,9 +537,9 @@ define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v @@ -556,9 +556,9 @@ define <16 x double> @vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroex ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v @@ -617,9 +617,9 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vmv.v.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -660,9 +660,9 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli 
zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index 7dcd4c41998279..ed2ed2a2ebfaa0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -5,8 +5,8 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 ; RV32-NEXT: srli a1, a1, 31 @@ -35,8 +35,8 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vv_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 ; RV64-NEXT: srli a1, a1, 63 @@ -73,8 +73,8 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vx_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 ; RV32-NEXT: srli a1, a1, 31 @@ -104,8 +104,8 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vx_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 ; RV64-NEXT: srli a1, a1, 63 @@ -144,8 +144,8 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vi_v6i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 ; RV32-NEXT: srli a0, a0, 31 @@ -175,8 +175,8 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vi_v6i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 ; RV64-NEXT: srli a0, a0, 63 @@ -214,8 +214,8 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) { define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vv_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a2, 0(a2) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: slli a1, a2, 30 ; RV32-NEXT: srli a1, a1, 31 @@ -244,8 +244,8 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vv_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a2, 0(a2) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: slli a1, a2, 62 ; RV64-NEXT: srli a1, a1, 63 @@ -282,8 +282,8 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { define void 
@vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vx_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 ; RV32-NEXT: srli a0, a0, 31 @@ -313,8 +313,8 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vx_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 ; RV64-NEXT: srli a0, a0, 63 @@ -353,8 +353,8 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) { define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; RV32-LABEL: vselect_vfpzero_v6f32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: slli a0, a1, 30 ; RV32-NEXT: srli a0, a0, 31 @@ -384,8 +384,8 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) { ; ; RV64-LABEL: vselect_vfpzero_v6f32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: lbu a1, 0(a1) +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma ; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: slli a0, a1, 62 ; RV64-NEXT: srli a0, a0, 63 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index a4a5917fd4f9e6..b1726be941e3ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -768,8 +768,8 @@ define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i8: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lb a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -796,8 +796,8 @@ define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lh a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -824,8 +824,8 @@ define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwadd_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -853,9 +853,9 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwadd_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index bc0bf5dd76ad45..f6d9695c514907 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -769,8 +769,8 @@ define <2 x i64> @vwaddu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i8: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -801,8 +801,8 @@ define <2 x i64> @vwaddu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -833,8 +833,8 @@ define <2 x i64> @vwaddu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -865,9 +865,9 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwaddu_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 2abd34f01c14c0..c87584ab635135 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -883,9 +883,9 @@ define <2 x i64> @vwmul_vx_v2i64_i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index 921037db2ea99e..a56984577ea749 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -793,8 +793,8 @@ define <2 x i64> @vwmulsu_vx_v2i64_i8(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -827,8 +827,8 @@ define <2 x i64> @vwmulsu_vx_v2i64_i16(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -861,8 +861,8 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a1, 
0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index 154093d759d6dd..2782a5fbb1eaed 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -769,8 +769,8 @@ define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i8: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lb a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -798,8 +798,8 @@ define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lh a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -827,8 +827,8 @@ define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: vmv.v.x v8, a1 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma @@ -856,9 +856,9 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsub_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index a084b5383b4030..ccbc26c84d80d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -770,8 +770,8 @@ define <2 x i64> @vwsubu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i8: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -803,8 +803,8 @@ define <2 x i64> @vwsubu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i16: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -836,8 +836,8 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i32: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: sw a1, 8(sp) @@ -868,9 +868,9 @@ define <2 x i64> 
@vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-LABEL: vwsubu_vx_v2i64_i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: lw a2, 4(a1) ; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vle32.v v9, (a0) ; RV32-NEXT: sw a2, 12(sp) ; RV32-NEXT: sw a1, 8(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll index b78b8663eac90b..02cfd3de6b4db7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -1012,77 +1012,99 @@ define @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64( %va, @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: sub a0, a2, a1 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll index 69c76152910e86..72a47ca2a60572 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -1012,77 +1012,99 @@ define @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64( %va, @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: slli a3, a1, 3 ; CHECK-NEXT: add a3, a0, a3 ; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: sub a0, a2, a1 -; CHECK-NEXT: sltu a3, a2, a0 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a0, a3, a0 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index f90237b8d7e95d..f88a9b3081a1a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -19,9 +19,9 @@ define @nearbyint_nxv1f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call 
@llvm.experimental.constrained.nearbyint.nxv1f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -42,9 +42,9 @@ define @nearbyint_nxv2f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -65,9 +65,9 @@ define @nearbyint_nxv4f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -88,9 +88,9 @@ define @nearbyint_nxv8f16( %v) strictfp { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -111,9 +111,9 @@ define @nearbyint_nxv16f16( %v) strictf ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv16f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -134,9 +134,9 @@ define @nearbyint_nxv32f16( %v) strictf ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv32f16( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -157,9 +157,9 @@ define @nearbyint_nxv1f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -180,9 +180,9 @@ define @nearbyint_nxv2f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -203,9 +203,9 @@ define @nearbyint_nxv4f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: 
vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -226,9 +226,9 @@ define @nearbyint_nxv8f32( %v) strictfp ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -249,9 +249,9 @@ define @nearbyint_nxv16f32( %v) stric ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv16f32( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -272,9 +272,9 @@ define @nearbyint_nxv1f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -295,9 +295,9 @@ define @nearbyint_nxv2f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -318,9 +318,9 @@ define @nearbyint_nxv4f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r @@ -341,9 +341,9 @@ define @nearbyint_nxv8f64( %v) strict ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 9aa356b9b65e0b..9e14852305caa1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -15,9 +15,9 @@ define @nearbyint_nxv1f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f16( %x) ret %a @@ -35,9 
+35,9 @@ define @nearbyint_nxv2f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f16( %x) ret %a @@ -55,9 +55,9 @@ define @nearbyint_nxv4f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f16( %x) ret %a @@ -75,9 +75,9 @@ define @nearbyint_nxv8f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f16( %x) ret %a @@ -95,9 +95,9 @@ define @nearbyint_nxv16f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv16f16( %x) ret %a @@ -115,9 +115,9 @@ define @nearbyint_nxv32f16( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv32f16( %x) ret %a @@ -135,9 +135,9 @@ define @nearbyint_nxv1f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f32( %x) ret %a @@ -155,9 +155,9 @@ define @nearbyint_nxv2f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f32( %x) ret %a @@ -175,9 +175,9 @@ define @nearbyint_nxv4f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f32( %x) ret %a @@ -195,9 +195,9 @@ define @nearbyint_nxv8f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f32( %x) ret %a @@ -215,9 +215,9 @@ define @nearbyint_nxv16f32( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t 
+; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv16f32( %x) ret %a @@ -235,9 +235,9 @@ define @nearbyint_nxv1f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv1f64( %x) ret %a @@ -255,9 +255,9 @@ define @nearbyint_nxv2f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv2f64( %x) ret %a @@ -275,9 +275,9 @@ define @nearbyint_nxv4f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv4f64( %x) ret %a @@ -295,9 +295,9 @@ define @nearbyint_nxv8f64( %x) { ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %a = call @llvm.nearbyint.nxv8f64( %x) ret %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll index 277cd7dcdabce5..249f765971b095 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll @@ -960,108 +960,87 @@ define @fshr_v16i64( %a, @fshr_v16i64( %a, @fshl_v16i64( %a, @fshl_v16i64( %a, %ptrs0, %ptr ; ; RV64-LABEL: mgather_nxv16i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 -; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: vl8re64.v v24, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; RV64-NEXT: vmv8r.v v16, v8 -; RV64-NEXT: vl8re64.v v8, (a1) ; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV64-NEXT: vluxei64.v v24, (zero), v16, v0.t +; RV64-NEXT: vluxei64.v v24, (zero), v8, v0.t +; RV64-NEXT: vl8re64.v v8, (a1) ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a1, a0, 3 ; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vx v0, v0, a1 ; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, mu -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vluxei64.v v8, (zero), v16, v0.t ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add a0, a2, a0 ; RV64-NEXT: vs8r.v v8, (a0) ; RV64-NEXT: vs8r.v v24, (a2) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %p0 = call @llvm.vector.insert.nxv8p0.nxv16p0( undef, %ptrs0, i64 0) %p1 = call @llvm.vector.insert.nxv8p0.nxv16p0( %p0, %ptrs1, i64 8) diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index 0e09f59b6a20fe..fc8fdf4aaafe24 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -1714,8 +1714,8 @@ define void @mscatter_nxv16f64( %val0, @vp_nearbyint_nxv1f16( %va, @vp_nearbyint_nxv1f16( %va, @llvm.vp.nearbyint.nxv1f16( %va, %m, i32 %evl) ret %v @@ -63,9 +63,9 @@ define @vp_nearbyint_nxv1f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv1f16_unmasked: @@ -80,11 +80,11 @@ define @vp_nearbyint_nxv1f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f16( %va, splat (i1 true), i32 %evl) ret %v @@ -105,9 +105,9 @@ define @vp_nearbyint_nxv2f16( %va, @vp_nearbyint_nxv2f16( %va, @llvm.vp.nearbyint.nxv2f16( %va, %m, i32 %evl) ret %v @@ -145,9 +145,9 @@ define @vp_nearbyint_nxv2f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv2f16_unmasked: @@ -162,11 +162,11 @@ define @vp_nearbyint_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v9, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v9, v8, v9, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v @@ -187,9 +187,9 @@ define @vp_nearbyint_nxv4f16( %va, @vp_nearbyint_nxv4f16( %va, @llvm.vp.nearbyint.nxv4f16( %va, %m, i32 %evl) ret %v @@ -229,9 +229,9 @@ define @vp_nearbyint_nxv4f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv4f16_unmasked: @@ -246,11 +246,11 @@ define @vp_nearbyint_nxv4f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v10, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v10, v8, v10, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f16( %va, splat (i1 true), i32 %evl) ret %v @@ -273,9 +273,9 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16( %va, @llvm.vp.nearbyint.nxv8f16( %va, %m, i32 %evl) ret %v @@ -315,9 +315,9 @@ define @vp_nearbyint_nxv8f16_unmasked( %v ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; 
ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv8f16_unmasked: @@ -332,11 +332,11 @@ define @vp_nearbyint_nxv8f16_unmasked( %v ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v12, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v12, v8, v12, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f16( %va, splat (i1 true), i32 %evl) ret %v @@ -359,9 +359,9 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16( %va, @llvm.vp.nearbyint.nxv16f16( %va, %m, i32 %evl) ret %v @@ -401,9 +401,9 @@ define @vp_nearbyint_nxv16f16_unmasked( ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv16f16_unmasked: @@ -418,11 +418,11 @@ define @vp_nearbyint_nxv16f16_unmasked( ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v8, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v8, v8, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f16( %va, splat (i1 true), i32 %evl) ret %v @@ -445,9 +445,9 @@ define @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16_unmasked( ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t -; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; ZVFH-NEXT: fsflags a0 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked: @@ -589,11 +590,11 @@ define @vp_nearbyint_nxv32f16_unmasked( ; ZVFHMIN-NEXT: frflags a0 ; ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t -; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: fsflags a0 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 @@ -618,9 +619,9 @@ define @vp_nearbyint_nxv1f32( %va, @llvm.vp.nearbyint.nxv1f32( %va, %m, i32 %evl) ret %v @@ -637,9 +638,9 @@ define @vp_nearbyint_nxv1f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f32( %va, splat (i1 true), i32 %evl) ret %v @@ -660,9 +661,9 @@ define @vp_nearbyint_nxv2f32( %va, @llvm.vp.nearbyint.nxv2f32( %va, %m, i32 %evl) ret %v @@ -679,9 +680,9 @@ define 
@vp_nearbyint_nxv2f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v @@ -704,9 +705,9 @@ define @vp_nearbyint_nxv4f32( %va, @llvm.vp.nearbyint.nxv4f32( %va, %m, i32 %evl) ret %v @@ -723,9 +724,9 @@ define @vp_nearbyint_nxv4f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f32( %va, splat (i1 true), i32 %evl) ret %v @@ -748,9 +749,9 @@ define @vp_nearbyint_nxv8f32( %va, @llvm.vp.nearbyint.nxv8f32( %va, %m, i32 %evl) ret %v @@ -767,9 +768,9 @@ define @vp_nearbyint_nxv8f32_unmasked( ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f32( %va, splat (i1 true), i32 %evl) ret %v @@ -792,9 +793,9 @@ define @vp_nearbyint_nxv16f32( %va, < ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f32( %va, %m, i32 %evl) ret %v @@ -811,9 +812,9 @@ define @vp_nearbyint_nxv16f32_unmasked( @llvm.vp.nearbyint.nxv16f32( %va, splat (i1 true), i32 %evl) ret %v @@ -834,9 +835,9 @@ define @vp_nearbyint_nxv1f64( %va, @llvm.vp.nearbyint.nxv1f64( %va, %m, i32 %evl) ret %v @@ -853,9 +854,9 @@ define @vp_nearbyint_nxv1f64_unmasked( @llvm.vp.nearbyint.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v @@ -878,9 +879,9 @@ define @vp_nearbyint_nxv2f64( %va, @llvm.vp.nearbyint.nxv2f64( %va, %m, i32 %evl) ret %v @@ -897,9 +898,9 @@ define @vp_nearbyint_nxv2f64_unmasked( @llvm.vp.nearbyint.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v @@ -922,9 +923,9 @@ define @vp_nearbyint_nxv4f64( %va, @llvm.vp.nearbyint.nxv4f64( %va, %m, i32 %evl) ret %v @@ -941,9 +942,9 @@ define @vp_nearbyint_nxv4f64_unmasked( @llvm.vp.nearbyint.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v @@ -966,9 +967,9 @@ define @vp_nearbyint_nxv7f64( %va, @llvm.vp.nearbyint.nxv7f64( %va, %m, i32 %evl) ret %v @@ -985,9 +986,9 @@ define @vp_nearbyint_nxv7f64_unmasked( @llvm.vp.nearbyint.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1010,9 +1011,9 @@ define @vp_nearbyint_nxv8f64( %va, @llvm.vp.nearbyint.nxv8f64( %va, %m, i32 %evl) ret %v @@ -1029,9 +1030,9 @@ define @vp_nearbyint_nxv8f64_unmasked( @llvm.vp.nearbyint.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v @@ -1046,16 +1047,15 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: vmv1r.v v24, v0 -; CHECK-NEXT: addi a1, sp, 16 -; 
CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v7, v0 +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: srli a2, a1, 3 ; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vx v25, v0, a2 +; CHECK-NEXT: vslidedown.vx v6, v0, a2 ; CHECK-NEXT: sub a2, a0, a1 ; CHECK-NEXT: sltu a3, a0, a2 ; CHECK-NEXT: addi a3, a3, -1 @@ -1063,60 +1063,41 @@ define @vp_nearbyint_nxv16f64( %va, ; CHECK-NEXT: lui a3, %hi(.LCPI32_0) ; CHECK-NEXT: fld fa5, %lo(.LCPI32_0)(a3) ; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vmv8r.v v8, v16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v6 ; CHECK-NEXT: vfabs.v v16, v16, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v25, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v6, v16, fa5, v0.t ; CHECK-NEXT: frflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v25 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v8, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vfcvt.x.f.v v16, v24, v0.t +; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t ; CHECK-NEXT: fsflags a2 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload -; CHECK-NEXT: vfsgnj.vv v16, v8, v16, v0.t -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: vfsgnj.vv v24, v16, v24, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB32_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vfabs.v v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v24, v16, fa5, v0.t +; CHECK-NEXT: vmflt.vf v7, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: fsflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: fsflags a0 +; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: addi sp, sp, 16 ; CHECK-NEXT: ret @@ -1153,9 +1134,9 @@ define @vp_nearbyint_nxv16f64_unmasked( @llvm.vp.nearbyint.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 
897bfdea69f1f4..cc967396153bc1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2203,17 +2203,17 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vv_nxv32f64( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv128i8( %va, @icmp_eq_vv_nxv32i32( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vfma_vv_nxv16f64( %va, @vpmerge_vv_nxv128i8( %va, @select_nxv32i32( %a, @select_evl_nxv32i32( %a, @select_nxv16f64( %a, @test6(i64 %avl, i8 zeroext %cond, @llvm.riscv.vwadd.w.nxv2i32.nxv2i16(This Inner Loop Header: Depth=1 ; CHECK-NEXT: vle64.v v8, (a2) ; CHECK-NEXT: vle64.v v9, (a3) ; CHECK-NEXT: vfadd.vv v8, v8, v9 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a5, a5, a6 -; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: add a3, a3, a4 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: blt a5, a0, .LBB12_2 +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: add a1, a1, a5 +; CHECK-NEXT: add a3, a3, a5 +; CHECK-NEXT: add a2, a2, a5 +; CHECK-NEXT: blt a4, a0, .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end ; CHECK-NEXT: ret entry: @@ -678,18 +677,18 @@ for.end: ; preds = %for.body, %entry define void @vector_init_vlmax(i64 %N, ptr %c) { ; CHECK-LABEL: vector_init_vlmax: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma ; CHECK-NEXT: blez a0, .LBB13_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e64, m1, ta, ma +; CHECK-NEXT: slli a4, a3, 3 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB13_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: blt a3, a0, .LBB13_2 +; CHECK-NEXT: blt a2, a0, .LBB13_2 ; CHECK-NEXT: .LBB13_3: # %for.end ; CHECK-NEXT: ret entry: @@ -714,20 +713,20 @@ for.end: ; preds = %for.body, %entry define void @vector_init_vsetvli_N(i64 %N, ptr %c) { ; CHECK-LABEL: vector_init_vsetvli_N: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli a2, a0, e64, m1, ta, ma ; CHECK-NEXT: blez a0, .LBB14_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, a0, e64, m1, ta, ma +; CHECK-NEXT: slli a4, a3, 3 ; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB14_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a3, e64, m1, ta, ma ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a1, a1, a4 -; CHECK-NEXT: blt a3, a0, .LBB14_2 +; CHECK-NEXT: blt a2, a0, .LBB14_2 ; CHECK-NEXT: .LBB14_3: # %for.end ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll index f658a2c6b24a6c..c3b19b59ec3d68 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll @@ -11,9 +11,10 @@ define i32 @illegal_preserve_vl( %a, %x, pt ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma ; CHECK-NEXT: vadd.vv v12, v12, v12 -; CHECK-NEXT: vs4r.v v12, (a0) ; CHECK-NEXT: vsetvli zero, zero, 
e32, m2, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: vs4r.v v12, (a0) +; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: ret %index = add %x, %x store %index, ptr %y From 74218a9c8fc4b0bdb4b2a4839455cf2f211a2a30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 15 May 2024 15:43:28 +0200 Subject: [PATCH 42/71] [clang][Interp] Implement __builtin_convertvector --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 33 +++ clang/lib/AST/Interp/ByteCodeExprGen.h | 1 + clang/test/AST/Interp/builtin-functions.cpp | 235 ++++++++++++++++++++ 3 files changed, 269 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 7cdc1c6d1947c6..205e53f02b1a02 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2498,6 +2498,39 @@ bool ByteCodeExprGen::VisitAddrLabelExpr(const AddrLabelExpr *E) { return this->emitGetLocal(PT_Ptr, Offset, E); } +template +bool ByteCodeExprGen::VisitConvertVectorExpr( + const ConvertVectorExpr *E) { + assert(Initializing); + const auto *VT = E->getType()->castAs(); + QualType ElemType = VT->getElementType(); + PrimType ElemT = classifyPrim(ElemType); + const Expr *Src = E->getSrcExpr(); + PrimType SrcElemT = + classifyPrim(Src->getType()->castAs()->getElementType()); + + unsigned SrcOffset = this->allocateLocalPrimitive(Src, PT_Ptr, true, false); + if (!this->visit(Src)) + return false; + if (!this->emitSetLocal(PT_Ptr, SrcOffset, E)) + return false; + + for (unsigned I = 0; I != VT->getNumElements(); ++I) { + if (!this->emitGetLocal(PT_Ptr, SrcOffset, E)) + return false; + if (!this->emitArrayElemPop(SrcElemT, I, E)) + return false; + if (SrcElemT != ElemT) { + if (!this->emitPrimCast(SrcElemT, ElemT, ElemType, E)) + return false; + } + if (!this->emitInitElem(ElemT, I, E)) + return false; + } + + return true; +} + template bool ByteCodeExprGen::discard(const Expr *E) { OptionScope Scope(this, /*NewDiscardResult=*/true, /*NewInitializing=*/false); diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 6039a54d32a5b2..fba4e45b9aa23c 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -123,6 +123,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool VisitPackIndexingExpr(const PackIndexingExpr *E); bool VisitRecoveryExpr(const RecoveryExpr *E); bool VisitAddrLabelExpr(const AddrLabelExpr *E); + bool VisitConvertVectorExpr(const ConvertVectorExpr *E); protected: bool visitExpr(const Expr *E) override; diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index 0cbab1fcd91d09..afdfd25527e4a1 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -639,3 +639,238 @@ void test7(void) { /// the actual implementation uses analyze_os_log::computeOSLogBufferLayout(), which /// is tested elsewhere. static_assert(__builtin_os_log_format_buffer_size("%{mask.xyz}s", "abc") != 0, ""); + +/// Copied from test/Sema/constant_builtins_vector.cpp. +/// Some tests are missing since we run this for multiple targets, +/// some of which do not support _BitInt. 
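+/// (AVR is excluded entirely below, presumably because vector_size is
+/// specified in bytes: AVR's smaller 'int' and 'double' types would change
+/// the element counts that the initializers in these tests assume.)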
+#ifndef __AVR__ +namespace convertvector { + typedef _BitInt(128) BitInt128; + + typedef double vector4double __attribute__((__vector_size__(32))); + typedef float vector4float __attribute__((__vector_size__(16))); + typedef long long vector4long __attribute__((__vector_size__(32))); + typedef int vector4int __attribute__((__vector_size__(16))); + typedef short vector4short __attribute__((__vector_size__(8))); + typedef char vector4char __attribute__((__vector_size__(4))); + typedef BitInt128 vector4BitInt128 __attribute__((__vector_size__(64))); + typedef double vector8double __attribute__((__vector_size__(64))); + typedef float vector8float __attribute__((__vector_size__(32))); + typedef long long vector8long __attribute__((__vector_size__(64))); + typedef int vector8int __attribute__((__vector_size__(32))); + typedef short vector8short __attribute__((__vector_size__(16))); + typedef char vector8char __attribute__((__vector_size__(8))); + typedef BitInt128 vector8BitInt128 __attribute__((__vector_size__(128))); + + constexpr vector4double from_vector4double_to_vector4double_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4double_to_vector4float_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4double_to_vector4long_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4double_to_vector4int_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4double_to_vector4short_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4double_to_vector4char_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4double_to_vector4BitInt128_var = + __builtin_convertvector((vector4double){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4float_to_vector4double_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4float_to_vector4float_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4float_to_vector4long_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4float_to_vector4int_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4float_to_vector4short_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4float_to_vector4char_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4float_to_vector4BitInt128_var = + __builtin_convertvector((vector4float){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4long_to_vector4double_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4long_to_vector4float_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4long_to_vector4long_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4long_to_vector4int_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4long_to_vector4short_var = + 
__builtin_convertvector((vector4long){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4long_to_vector4char_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4long_to_vector4BitInt128_var = + __builtin_convertvector((vector4long){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4int_to_vector4double_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4int_to_vector4float_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4int_to_vector4long_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4int_to_vector4int_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4int_to_vector4short_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4int_to_vector4char_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4int_to_vector4BitInt128_var = + __builtin_convertvector((vector4int){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4short_to_vector4double_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4short_to_vector4float_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4short_to_vector4long_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4short_to_vector4int_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4short_to_vector4short_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4short_to_vector4char_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4char); + constexpr vector4BitInt128 from_vector4short_to_vector4BitInt128_var = + __builtin_convertvector((vector4short){0, 1, 2, 3}, vector4BitInt128); + constexpr vector4double from_vector4char_to_vector4double_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4double); + constexpr vector4float from_vector4char_to_vector4float_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4float); + constexpr vector4long from_vector4char_to_vector4long_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4long); + constexpr vector4int from_vector4char_to_vector4int_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4int); + constexpr vector4short from_vector4char_to_vector4short_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4short); + constexpr vector4char from_vector4char_to_vector4char_var = + __builtin_convertvector((vector4char){0, 1, 2, 3}, vector4char); + constexpr vector8double from_vector8double_to_vector8double_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8double_to_vector8float_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8double_to_vector8long_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8double_to_vector8int_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, 
+ vector8int); + constexpr vector8short from_vector8double_to_vector8short_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8double_to_vector8char_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + constexpr vector8BitInt128 from_vector8double_to_vector8BitInt128_var = + __builtin_convertvector((vector8double){0, 1, 2, 3, 4, 5, 6, 7}, + vector8BitInt128); + constexpr vector8double from_vector8float_to_vector8double_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8float_to_vector8float_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8float_to_vector8long_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8float_to_vector8int_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8float_to_vector8short_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8float_to_vector8char_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + constexpr vector8BitInt128 from_vector8float_to_vector8BitInt128_var = + __builtin_convertvector((vector8float){0, 1, 2, 3, 4, 5, 6, 7}, + vector8BitInt128); + constexpr vector8double from_vector8long_to_vector8double_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8long_to_vector8float_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8long_to_vector8long_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8long); + constexpr vector8int from_vector8long_to_vector8int_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8long_to_vector8short_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8long_to_vector8char_var = + __builtin_convertvector((vector8long){0, 1, 2, 3, 4, 5, 6, 7}, vector8char); + constexpr vector8double from_vector8int_to_vector8double_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8int_to_vector8float_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8float); + constexpr vector8long from_vector8int_to_vector8long_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8long); + constexpr vector8int from_vector8int_to_vector8int_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8int_to_vector8short_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8short); + constexpr vector8char from_vector8int_to_vector8char_var = + __builtin_convertvector((vector8int){0, 1, 2, 3, 4, 5, 6, 7}, vector8char); + constexpr vector8double from_vector8short_to_vector8double_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8short_to_vector8float_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long 
from_vector8short_to_vector8long_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8short_to_vector8int_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8short_to_vector8short_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8short_to_vector8char_var = + __builtin_convertvector((vector8short){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + + constexpr vector8double from_vector8char_to_vector8double_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8char_to_vector8float_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8char_to_vector8long_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8long); + constexpr vector8int from_vector8char_to_vector8int_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8int); + constexpr vector8short from_vector8char_to_vector8short_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8char_to_vector8char_var = + __builtin_convertvector((vector8char){0, 1, 2, 3, 4, 5, 6, 7}, vector8char); + constexpr vector8double from_vector8BitInt128_to_vector8double_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8double); + constexpr vector8float from_vector8BitInt128_to_vector8float_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8float); + constexpr vector8long from_vector8BitInt128_to_vector8long_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8long); + constexpr vector8int from_vector8BitInt128_to_vector8int_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8int); + constexpr vector8short from_vector8BitInt128_to_vector8short_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8short); + constexpr vector8char from_vector8BitInt128_to_vector8char_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8char); + constexpr vector8BitInt128 from_vector8BitInt128_to_vector8BitInt128_var = + __builtin_convertvector((vector8BitInt128){0, 1, 2, 3, 4, 5, 6, 7}, + vector8BitInt128); + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[0] == 0, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[1] == 1, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[2] == 2, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[3] == 3, ""); // ref-error {{not an integral constant expression}} + static_assert(from_vector8BitInt128_to_vector8BitInt128_var[4] == 4, ""); // ref-error {{not an integral constant expression}} +} +#endif From 3a8d176af519e4385652e762c615ace9b80ef045 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 15 May 2024 16:50:52 +0100 Subject: [PATCH 43/71] [utils][filecheck-lint] Add shebang (#92243) --- llvm/utils/filecheck_lint/filecheck_lint.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 llvm/utils/filecheck_lint/filecheck_lint.py diff --git 
a/llvm/utils/filecheck_lint/filecheck_lint.py b/llvm/utils/filecheck_lint/filecheck_lint.py
old mode 100644
new mode 100755
index dc054ab76a098e..837846db833215
--- a/llvm/utils/filecheck_lint/filecheck_lint.py
+++ b/llvm/utils/filecheck_lint/filecheck_lint.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # ===----------------------------------------------------------------------===##
 #
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

From ba3447601c435bb2b24ad9e3c8d146c578f00568 Mon Sep 17 00:00:00 2001
From: Piotr Zegar
Date: Wed, 15 May 2024 17:52:46 +0200
Subject: [PATCH 44/71] [clang-tidy] Fix crash in modernize-use-constraints
 (#92019)

Improved the modernize-use-constraints check by fixing a crash that
occurred in some scenarios and by excluding system headers from analysis.

The problem was a DependentNameTypeLoc whose getQualifierLoc().getTypeLoc()
returned a null type location.

Fixes #91872

---
 .../modernize/UseConstraintsCheck.cpp         |  4 +++
 clang-tools-extra/docs/ReleaseNotes.rst       |  4 +++
 .../checks/modernize/use-constraints.rst      |  4 +++
 .../checkers/modernize/use-constraints.cpp    | 32 +++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
index 1585925ee99672..7a021fe14436a1 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
@@ -41,6 +41,8 @@ AST_MATCHER(FunctionDecl, hasOtherDeclarations) {

 void UseConstraintsCheck::registerMatchers(MatchFinder *Finder) {
   Finder->addMatcher(
       functionTemplateDecl(
+          // Skip external libraries included as system headers
+          unless(isExpansionInSystemHeader()),
           has(functionDecl(unless(hasOtherDeclarations()), isDefinition(),
                            hasReturnTypeLoc(typeLoc().bind("return")))
               .bind("function")))
@@ -57,6 +59,8 @@ matchEnableIfSpecializationImplTypename(TypeLoc TheType) {
       return std::nullopt;
     }
     TheType = Dep.getQualifierLoc().getTypeLoc();
+    if (TheType.isNull())
+      return std::nullopt;
   }

   if (const auto SpecializationLoc =
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 19f8307412956b..b7ef5a860e8b19 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -322,6 +322,10 @@ Changes in existing checks
   don't remove parentheses used in ``sizeof`` calls when they have array index
   accesses as arguments.

+- Improved :doc:`modernize-use-constraints
+  ` check by fixing a crash that
+  occurred in some scenarios and excluding system headers from analysis.
+
 - Improved :doc:`modernize-use-nullptr
   ` check to include support for C23, which also has
   introduced the ``nullptr`` keyword.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst
index be62dd5823d552..a8b31b80e580b0 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-constraints.rst
@@ -68,3 +68,7 @@ The tool will replace the above code with,
   // The tool will not emit a diagnostic or attempt to replace the code.
   template = 0>
   struct my_class {};
+
+.. note::
+
+   System headers are not analyzed by this check.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp index 3ec44be8a1c8c3..3bcd5cd74024ea 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-constraints.cpp @@ -724,3 +724,35 @@ void not_last_param() { } } // namespace enable_if_trailing_type_parameter + + +// Issue fixes: + +namespace PR91872 { + +enum expression_template_option { value1, value2 }; + +template struct number_category { + static const int value = 0; +}; + +constexpr int number_kind_complex = 1; + +template +struct number { + using type = T; +}; + +template struct component_type { + using type = T; +}; + +template +inline typename std::enable_if< + number_category::value == number_kind_complex, + component_type>>::type::type +abs(const number &v) { + return {}; +} + +} From 54c6ee922abbaea7d2f138a209f320c414c1657b Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Wed, 15 May 2024 17:53:03 +0200 Subject: [PATCH 45/71] [clang-tidy] Add AllowImplicitlyDeletedCopyOrMove option to cppcoreguidelines-special-member-functions (#71683) Improved cppcoreguidelines-special-member-functions check with a new option AllowImplicitlyDeletedCopyOrMove, which removes the requirement for explicit copy or move special member functions when they are already implicitly deleted. Closes #62392 --- .../SpecialMemberFunctionsCheck.cpp | 70 ++++++++++++++----- .../SpecialMemberFunctionsCheck.h | 7 +- clang-tools-extra/docs/ReleaseNotes.rst | 6 ++ .../special-member-functions.rst | 28 ++++++-- .../special-member-functions-relaxed.cpp | 26 ++++++- 5 files changed, 107 insertions(+), 30 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp index d2117c67a76d0b..ed76ac665049d1 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.cpp @@ -25,7 +25,9 @@ SpecialMemberFunctionsCheck::SpecialMemberFunctionsCheck( "AllowMissingMoveFunctions", false)), AllowSoleDefaultDtor(Options.get("AllowSoleDefaultDtor", false)), AllowMissingMoveFunctionsWhenCopyIsDeleted( - Options.get("AllowMissingMoveFunctionsWhenCopyIsDeleted", false)) {} + Options.get("AllowMissingMoveFunctionsWhenCopyIsDeleted", false)), + AllowImplicitlyDeletedCopyOrMove( + Options.get("AllowImplicitlyDeletedCopyOrMove", false)) {} void SpecialMemberFunctionsCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { @@ -33,17 +35,34 @@ void SpecialMemberFunctionsCheck::storeOptions( Options.store(Opts, "AllowSoleDefaultDtor", AllowSoleDefaultDtor); Options.store(Opts, "AllowMissingMoveFunctionsWhenCopyIsDeleted", AllowMissingMoveFunctionsWhenCopyIsDeleted); + Options.store(Opts, "AllowImplicitlyDeletedCopyOrMove", + AllowImplicitlyDeletedCopyOrMove); +} + +std::optional +SpecialMemberFunctionsCheck::getCheckTraversalKind() const { + return AllowImplicitlyDeletedCopyOrMove ? 
TK_AsIs + : TK_IgnoreUnlessSpelledInSource; } void SpecialMemberFunctionsCheck::registerMatchers(MatchFinder *Finder) { + auto IsNotImplicitOrDeleted = anyOf(unless(isImplicit()), isDeleted()); + Finder->addMatcher( cxxRecordDecl( - eachOf(has(cxxDestructorDecl().bind("dtor")), - has(cxxConstructorDecl(isCopyConstructor()).bind("copy-ctor")), - has(cxxMethodDecl(isCopyAssignmentOperator()) + unless(isImplicit()), + eachOf(has(cxxDestructorDecl(unless(isImplicit())).bind("dtor")), + has(cxxConstructorDecl(isCopyConstructor(), + IsNotImplicitOrDeleted) + .bind("copy-ctor")), + has(cxxMethodDecl(isCopyAssignmentOperator(), + IsNotImplicitOrDeleted) .bind("copy-assign")), - has(cxxConstructorDecl(isMoveConstructor()).bind("move-ctor")), - has(cxxMethodDecl(isMoveAssignmentOperator()) + has(cxxConstructorDecl(isMoveConstructor(), + IsNotImplicitOrDeleted) + .bind("move-ctor")), + has(cxxMethodDecl(isMoveAssignmentOperator(), + IsNotImplicitOrDeleted) .bind("move-assign")))) .bind("class-def"), this); @@ -127,7 +146,8 @@ void SpecialMemberFunctionsCheck::check( for (const auto &KV : Matchers) if (const auto *MethodDecl = Result.Nodes.getNodeAs(KV.first)) { - StoreMember({KV.second, MethodDecl->isDeleted()}); + StoreMember( + {KV.second, MethodDecl->isDeleted(), MethodDecl->isImplicit()}); } } @@ -144,7 +164,13 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( auto HasMember = [&](SpecialMemberFunctionKind Kind) { return llvm::any_of(DefinedMembers, [Kind](const auto &Data) { - return Data.FunctionKind == Kind; + return Data.FunctionKind == Kind && !Data.IsImplicit; + }); + }; + + auto HasImplicitDeletedMember = [&](SpecialMemberFunctionKind Kind) { + return llvm::any_of(DefinedMembers, [Kind](const auto &Data) { + return Data.FunctionKind == Kind && Data.IsImplicit && Data.IsDeleted; }); }; @@ -154,9 +180,17 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( }); }; - auto RequireMember = [&](SpecialMemberFunctionKind Kind) { - if (!HasMember(Kind)) - MissingMembers.push_back(Kind); + auto RequireMembers = [&](SpecialMemberFunctionKind Kind1, + SpecialMemberFunctionKind Kind2) { + if (AllowImplicitlyDeletedCopyOrMove && HasImplicitDeletedMember(Kind1) && + HasImplicitDeletedMember(Kind2)) + return; + + if (!HasMember(Kind1)) + MissingMembers.push_back(Kind1); + + if (!HasMember(Kind2)) + MissingMembers.push_back(Kind2); }; bool RequireThree = @@ -180,8 +214,8 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( !HasMember(SpecialMemberFunctionKind::NonDefaultDestructor)) MissingMembers.push_back(SpecialMemberFunctionKind::Destructor); - RequireMember(SpecialMemberFunctionKind::CopyConstructor); - RequireMember(SpecialMemberFunctionKind::CopyAssignment); + RequireMembers(SpecialMemberFunctionKind::CopyConstructor, + SpecialMemberFunctionKind::CopyAssignment); } if (RequireFive && @@ -189,14 +223,16 @@ void SpecialMemberFunctionsCheck::checkForMissingMembers( (IsDeleted(SpecialMemberFunctionKind::CopyConstructor) && IsDeleted(SpecialMemberFunctionKind::CopyAssignment)))) { assert(RequireThree); - RequireMember(SpecialMemberFunctionKind::MoveConstructor); - RequireMember(SpecialMemberFunctionKind::MoveAssignment); + RequireMembers(SpecialMemberFunctionKind::MoveConstructor, + SpecialMemberFunctionKind::MoveAssignment); } if (!MissingMembers.empty()) { llvm::SmallVector DefinedMemberKinds; - llvm::transform(DefinedMembers, std::back_inserter(DefinedMemberKinds), - [](const auto &Data) { return Data.FunctionKind; }); + for (const auto &Data : DefinedMembers) { + if 
(!Data.IsImplicit) + DefinedMemberKinds.push_back(Data.FunctionKind); + } diag(ID.first, "class '%0' defines %1 but does not define %2") << ID.second << cppcoreguidelines::join(DefinedMemberKinds, " and ") << cppcoreguidelines::join(MissingMembers, " or "); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h index 6042f0fd6cb054..9ebc03ed2fa139 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h @@ -30,9 +30,8 @@ class SpecialMemberFunctionsCheck : public ClangTidyCheck { void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; void onEndOfTranslationUnit() override; - std::optional getCheckTraversalKind() const override { - return TK_IgnoreUnlessSpelledInSource; - } + std::optional getCheckTraversalKind() const override; + enum class SpecialMemberFunctionKind : uint8_t { Destructor, DefaultDestructor, @@ -46,6 +45,7 @@ class SpecialMemberFunctionsCheck : public ClangTidyCheck { struct SpecialMemberFunctionData { SpecialMemberFunctionKind FunctionKind; bool IsDeleted; + bool IsImplicit = false; bool operator==(const SpecialMemberFunctionData &Other) const { return (Other.FunctionKind == FunctionKind) && @@ -67,6 +67,7 @@ class SpecialMemberFunctionsCheck : public ClangTidyCheck { const bool AllowMissingMoveFunctions; const bool AllowSoleDefaultDtor; const bool AllowMissingMoveFunctionsWhenCopyIsDeleted; + const bool AllowImplicitlyDeletedCopyOrMove; ClassDefiningSpecialMembersMap ClassWithSpecialMembers; }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index b7ef5a860e8b19..71734617bf7aa8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -262,6 +262,12 @@ Changes in existing checks `. Fixed incorrect hints when using list-initialization. +- Improved :doc:`cppcoreguidelines-special-member-functions + ` check with a + new option `AllowImplicitlyDeletedCopyOrMove`, which removes the requirement + for explicit copy or move special member functions when they are already + implicitly deleted. + - Improved :doc:`google-build-namespaces ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst index 176956d6cb2bd7..20f898fdab9305 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/special-member-functions.rst @@ -45,9 +45,10 @@ Options .. option:: AllowMissingMoveFunctions - When set to `true` (default is `false`), this check doesn't flag classes which define no move - operations at all. It still flags classes which define only one of either - move constructor or move assignment operator. With this option enabled, the following class won't be flagged: + When set to `true` (default is `false`), this check doesn't flag classes + which define no move operations at all. It still flags classes which define + only one of either move constructor or move assignment operator. With this + option enabled, the following class won't be flagged: .. 
code-block:: c++ @@ -59,10 +60,11 @@ Options .. option:: AllowMissingMoveFunctionsWhenCopyIsDeleted - When set to `true` (default is `false`), this check doesn't flag classes which define deleted copy - operations but don't define move operations. This flag is related to Google C++ Style Guide - https://google.github.io/styleguide/cppguide.html#Copyable_Movable_Types. With this option enabled, the - following class won't be flagged: + When set to `true` (default is `false`), this check doesn't flag classes + which define deleted copy operations but don't define move operations. This + flag is related to Google C++ Style Guide `Copyable and Movable Types + `_. + With this option enabled, the following class won't be flagged: .. code-block:: c++ @@ -71,3 +73,15 @@ Options A& operator=(const A&) = delete; ~A(); }; + +.. option:: AllowImplicitlyDeletedCopyOrMove + + When set to `true` (default is `false`), this check doesn't flag classes + which implicitly delete copy or move operations. + With this option enabled, the following class won't be flagged: + + .. code-block:: c++ + + struct A : boost::noncopyable { + ~A() { std::cout << "dtor\n"; } + }; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp index 0c17f57968a990..26142ccc835f29 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/special-member-functions-relaxed.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-special-member-functions %t -- -config="{CheckOptions: {cppcoreguidelines-special-member-functions.AllowMissingMoveFunctions: true, cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true}}" -- +// RUN: %check_clang_tidy %s cppcoreguidelines-special-member-functions %t -- -config="{CheckOptions: {cppcoreguidelines-special-member-functions.AllowMissingMoveFunctions: true, cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true, cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true}}" -- // Don't warn on destructors without definitions, they might be defaulted in another TU. 
class DeclaresDestructor {
@@ -34,12 +34,13 @@ class DefinesCopyAssignment {
 class DefinesMoveConstructor {
   DefinesMoveConstructor(DefinesMoveConstructor &&);
 };
-// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveConstructor' defines a move constructor but does not define a destructor, a copy constructor, a copy assignment operator or a move assignment operator [cppcoreguidelines-special-member-functions]
+// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveConstructor' defines a move constructor but does not define a destructor or a move assignment operator [cppcoreguidelines-special-member-functions]

 class DefinesMoveAssignment {
   DefinesMoveAssignment &operator=(DefinesMoveAssignment &&);
 };
-// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveAssignment' defines a move assignment operator but does not define a destructor, a copy constructor, a copy assignment operator or a move constructor [cppcoreguidelines-special-member-functions]
+// CHECK-MESSAGES: [[@LINE-3]]:7: warning: class 'DefinesMoveAssignment' defines a move assignment operator but does not define a destructor or a move constructor [cppcoreguidelines-special-member-functions]
+
 class DefinesNothing {
 };

@@ -81,3 +82,22 @@ struct TemplateClass {
 // This should not cause problems.
 TemplateClass InstantiationWithInt;
 TemplateClass InstantiationWithDouble;
+
+struct NoCopy
+{
+  NoCopy() = default;
+  ~NoCopy() = default;
+
+  NoCopy(const NoCopy&) = delete;
+  NoCopy(NoCopy&&) = delete;
+
+  NoCopy& operator=(const NoCopy&) = delete;
+  NoCopy& operator=(NoCopy&&) = delete;
+};
+
+// CHECK-MESSAGES: [[@LINE+1]]:8: warning: class 'NonCopyable' defines a copy constructor but does not define a destructor or a copy assignment operator [cppcoreguidelines-special-member-functions]
+struct NonCopyable : NoCopy
+{
+  NonCopyable() = default;
+  NonCopyable(const NonCopyable&) = delete;
+};

From 9bbefb7f600019c9d7025281132dd160729bfff2 Mon Sep 17 00:00:00 2001
From: Serge Pavlov
Date: Wed, 15 May 2024 23:12:57 +0700
Subject: [PATCH 46/71] [clang] Store FPOptions earlier when parsing function
 (#92146)

After https://github.com/llvm/llvm-project/pull/85605 ([clang] Set
correct FPOptions if attribute 'optnone' presents), the current FP
options in Sema are saved while parsing a function, because Sema can
modify them if optnone is present. However, they were saved too late,
which caused failures in some cases when precompiled headers are used.
This patch moves the saving earlier.

---
 clang/lib/Parse/Parser.cpp |  4 ++--
 clang/test/PCH/optnone.cpp |  6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/PCH/optnone.cpp

diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index adcbe5858bc78e..869b9c6669c27f 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -1439,6 +1439,8 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D,
     }
   }

+  Sema::FPFeaturesStateRAII SaveFPFeatures(Actions);
+
   // Tell the actions module that we have entered a function definition with the
   // specified Declarator for the function.
SkipBodyInfo SkipBody; @@ -1497,8 +1499,6 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D, return Actions.ActOnFinishFunctionBody(Res, nullptr, false); } - Sema::FPFeaturesStateRAII SaveFPFeatures(Actions); - if (Tok.is(tok::kw_try)) return ParseFunctionTryBlock(Res, BodyScope); diff --git a/clang/test/PCH/optnone.cpp b/clang/test/PCH/optnone.cpp new file mode 100644 index 00000000000000..8351bd9de70dd6 --- /dev/null +++ b/clang/test/PCH/optnone.cpp @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -emit-pch -x c++-header %s -o %t.pch +// RUN: %clang_cc1 -emit-llvm -DMAIN -include-pch %t.pch %s -o /dev/null + +#ifndef MAIN +__attribute__((optnone)) void foo() {} +#endif From 03c53c69a367008da689f0d2940e2197eb4a955c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 15 May 2024 09:18:39 -0700 Subject: [PATCH 47/71] [MC] Remove UseAssemblerInfoForParsing Commit 6c0665e22174d474050e85ca367424f6e02476be (https://reviews.llvm.org/D45164) enabled certain constant expression evaluation for `MCObjectStreamer` at parse time (e.g. `.if` directives, see llvm/test/MC/AsmParser/assembler-expressions.s). `getUseAssemblerInfoForParsing` was added to make `clang -c` handling inline assembly similar to `MCAsmStreamer` (e.g. `llvm-mc -filetype=asm`), where such expression folding (related to `AttemptToFoldSymbolOffsetDifference`) is unavailable. I believe this is overly conservative. We can make some parse-time expression folding work for `clang -c` even if `clang -S` would still report an error, a MCAsmStreamer issue (we cannot print `.if` directives) that should not restrict the functionality of MCObjectStreamer. ``` % cat b.cc asm(R"( .pushsection .text,"ax" .globl _start; _start: ret .if . -_start == 1 ret .endif .popsection )"); % gcc -S b.cc && gcc -c b.cc % clang -S -fno-integrated-as b.cc # succeeded % clang -c b.cc # succeeded with this patch % clang -S b.cc # still failed :4:5: error: expected absolute expression 4 | .if . -_start == 1 | ^ 1 error generated. ``` Close #62520 Link: https://discourse.llvm.org/t/rfc-clang-assembly-object-equivalence-for-files-with-inline-assembly/78841 Pull Request: https://github.com/llvm/llvm-project/pull/91082 --- clang/tools/driver/cc1as_main.cpp | 3 --- llvm/include/llvm/MC/MCStreamer.h | 7 ++----- .../CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 3 --- llvm/lib/MC/MCObjectStreamer.cpp | 9 +-------- llvm/lib/MC/MCStreamer.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 7 ++----- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 3 --- .../AsmParser/assembler-expressions-inlineasm.ll | 16 ++++++++++------ llvm/tools/llvm-mc/llvm-mc.cpp | 3 --- llvm/tools/llvm-ml/llvm-ml.cpp | 3 --- 10 files changed, 16 insertions(+), 40 deletions(-) diff --git a/clang/tools/driver/cc1as_main.cpp b/clang/tools/driver/cc1as_main.cpp index 86afe22fac24cc..4eb753a7297a92 100644 --- a/clang/tools/driver/cc1as_main.cpp +++ b/clang/tools/driver/cc1as_main.cpp @@ -576,9 +576,6 @@ static bool ExecuteAssemblerImpl(AssemblerInvocation &Opts, Str.get()->emitZeros(1); } - // Assembly to object compilation should leverage assembly info. - Str->setUseAssemblerInfoForParsing(true); - bool Failed = false; std::unique_ptr Parser( diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 69867620e1bf8a..50986e6bde8867 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -245,8 +245,6 @@ class MCStreamer { /// requires. 
unsigned NextWinCFIID = 0; - bool UseAssemblerInfoForParsing; - /// Is the assembler allowed to insert padding automatically? For /// correctness reasons, we sometimes need to ensure instructions aren't /// separated in unexpected ways. At the moment, this feature is only @@ -296,11 +294,10 @@ class MCStreamer { MCContext &getContext() const { return Context; } + // MCObjectStreamer has an MCAssembler and allows more expression folding at + // parse time. virtual MCAssembler *getAssemblerPtr() { return nullptr; } - void setUseAssemblerInfoForParsing(bool v) { UseAssemblerInfoForParsing = v; } - bool getUseAssemblerInfoForParsing() { return UseAssemblerInfoForParsing; } - MCTargetStreamer *getTargetStreamer() { return TargetStreamer.get(); } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp index d0ef3e5a19391c..08e3c208ba4d38 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp @@ -102,9 +102,6 @@ void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI, std::unique_ptr Parser( createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum)); - // Do not use assembler-level information for parsing inline assembly. - OutStreamer->setUseAssemblerInfoForParsing(false); - // We create a new MCInstrInfo here since we might be at the module level // and not have a MachineFunction to initialize the TargetInstrInfo from and // we only need MCInstrInfo for asm parsing. We create one unconditionally diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index d2da5d0d3f90f2..a9003a164b306d 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -40,14 +40,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCObjectStreamer::~MCObjectStreamer() = default; -// AssemblerPtr is used for evaluation of expressions and causes -// difference between asm and object outputs. Return nullptr to in -// inline asm mode to limit divergence to assembly inputs. -MCAssembler *MCObjectStreamer::getAssemblerPtr() { - if (getUseAssemblerInfoForParsing()) - return Assembler.get(); - return nullptr; -} +MCAssembler *MCObjectStreamer::getAssemblerPtr() { return Assembler.get(); } void MCObjectStreamer::addPendingLabel(MCSymbol* S) { MCSection *CurSection = getCurrentSectionOnly(); diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 176d55aa890bed..199d865ea3496d 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -93,7 +93,7 @@ void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {} MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), CurrentWinFrameInfo(nullptr), - CurrentProcWinFrameInfoStartIndex(0), UseAssemblerInfoForParsing(false) { + CurrentProcWinFrameInfoStartIndex(0) { SectionStack.push_back(std::pair()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index b7388ed9e85a85..bd48a5f80c8284 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -517,12 +517,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { DumpCodeInstEmitter = nullptr; if (STM.dumpCode()) { - // For -dumpcode, get the assembler out of the streamer, even if it does - // not really want to let us have it. This only works with -filetype=obj. 
- bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); - OutStreamer->setUseAssemblerInfoForParsing(true); + // For -dumpcode, get the assembler out of the streamer. This only works + // with -filetype=obj. MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); - OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); if (Assembler) DumpCodeInstEmitter = Assembler->getEmitterPtr(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 2ebe5bdc47715b..ad015808604487 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -114,12 +114,9 @@ void SPIRVAsmPrinter::emitEndOfAsmFile(Module &M) { // Bound is an approximation that accounts for the maximum used register // number and number of generated OpLabels unsigned Bound = 2 * (ST->getBound() + 1) + NLabels; - bool FlagToRestore = OutStreamer->getUseAssemblerInfoForParsing(); - OutStreamer->setUseAssemblerInfoForParsing(true); if (MCAssembler *Asm = OutStreamer->getAssemblerPtr()) Asm->setBuildVersion(static_cast(0), Major, Minor, Bound, VersionTuple(Major, Minor, 0, Bound)); - OutStreamer->setUseAssemblerInfoForParsing(FlagToRestore); } void SPIRVAsmPrinter::emitFunctionHeader() { diff --git a/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll b/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll index 35f110f37e2fb6..9d9a38f5b5a54b 100644 --- a/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll +++ b/llvm/test/MC/AsmParser/assembler-expressions-inlineasm.ll @@ -1,13 +1,17 @@ -; RUN: not llc -mtriple x86_64-unknown-linux-gnu -o %t.s -filetype=asm %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple x86_64-unknown-linux-gnu -o %t.o -filetype=obj %s 2>&1 | FileCheck %s - -; Assembler-aware expression evaluation should be disabled in inline -; assembly to prevent differences in behavior between object and -; assembly output. +; RUN: not llc -mtriple=x86_64 %s -o /dev/null 2>&1 | FileCheck %s +; RUN: llc -mtriple=x86_64 -no-integrated-as < %s | FileCheck %s --check-prefix=GAS +; RUN: llc -mtriple=x86_64 -filetype=obj %s -o - | llvm-objdump -d - | FileCheck %s --check-prefix=DISASM +; GAS: nop; .if . - foo==1; nop;.endif ; CHECK: :1:17: error: expected absolute expression +; DISASM:
<main>:
+; DISASM-NEXT: nop
+; DISASM-NEXT: nop
+; DISASM-NEXT: xorl %eax, %eax
+; DISASM-NEXT: retq
+
 define i32 @main() local_unnamed_addr {
   tail call void asm sideeffect "foo: nop; .if . - foo==1; nop;.endif", "~{dirflag},~{fpsr},~{flags}"()
   ret i32 0
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index 807071a7b9a16a..506e4f22ef8f54 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -569,9 +569,6 @@ int main(int argc, char **argv) {
     Str->initSections(true, *STI);
   }

-  // Use Assembler information for parsing.
-  Str->setUseAssemblerInfoForParsing(true);
-
   int Res = 1;
   bool disassemble = false;
   switch (Action) {
diff --git a/llvm/tools/llvm-ml/llvm-ml.cpp b/llvm/tools/llvm-ml/llvm-ml.cpp
index 1cac576f54e77f..f1f39af059aa49 100644
--- a/llvm/tools/llvm-ml/llvm-ml.cpp
+++ b/llvm/tools/llvm-ml/llvm-ml.cpp
@@ -428,9 +428,6 @@ int llvm_ml_main(int Argc, char **Argv, const llvm::ToolContext &) {
     Str->emitAssignment(Feat00Sym, MCConstantExpr::create(Feat00Flags, Ctx));
   }

-  // Use Assembler information for parsing.
-  Str->setUseAssemblerInfoForParsing(true);
-
   int Res = 1;
   if (InputArgs.hasArg(OPT_as_lex)) {
     // -as-lex; Lex only, and output a stream of tokens

From 141391ad2f22885342935442642c6c892f43e1ed Mon Sep 17 00:00:00 2001
From: Nuri Amari
Date: Wed, 15 May 2024 09:21:02 -0700
Subject: [PATCH 48/71] [lld] Fix -ObjC load behavior with LTO (#92162)

When -ObjC is passed, the linker must force-load any object files that
contain special sections storing Objective-C / Swift information that
is used at runtime. This should work regardless of whether the input
files are bitcode or native, but it was not working with bitcode.

This is because the sections that identify an object file as one that
should be loaded were inconsistent between native and bitcode files.
In particular, bitcode files were not searched for `__TEXT,__swift`
prefixed sections, while native files were. This meant LLD wasn't
loading certain bitcode files, forcing the user to add --force-load to
their linker invocation for that archive.
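For reference, a simplified sketch of the shared section test, written
as a standalone helper; this is illustrative only (the name
`forcesObjCLoad` is invented here), while the real change is the
one-line addition to the bitcode reader shown in the diff below:

```c++
#include "llvm/ADT/StringRef.h"

// Does this section name mark an object file that -ObjC must force-load?
static bool forcesObjCLoad(llvm::StringRef SectionName) {
  return SectionName.contains("__DATA,__objc_catlist") ||
         SectionName.contains("__OBJC,__category") ||
         SectionName.contains("__TEXT,__swift"); // now also checked for bitcode
}
```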
Co-authored-by: Nuri Amari --- lld/test/MachO/objc.s | 23 ++++++++++++++++++++--- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 3 ++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/lld/test/MachO/objc.s b/lld/test/MachO/objc.s index e7074141f0113f..dbb9f1df275719 100644 --- a/lld/test/MachO/objc.s +++ b/lld/test/MachO/objc.s @@ -5,12 +5,14 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-objc-category.s -o %t/has-objc-category.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-objc-symbol-and-category.s -o %t/has-objc-symbol-and-category.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-swift.s -o %t/has-swift.o +# RUN: llvm-as %t/has-swift-ir-loaded.ll -o %t/has-swift-ir-loaded.o +# RUN: llvm-as %t/has-swift-ir-not-loaded.ll -o %t/has-swift-ir-not-loaded.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/has-swift-proto.s -o %t/has-swift-proto.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/no-objc.s -o %t/no-objc.o ## Make sure we don't mis-parse a 32-bit file as 64-bit # RUN: llvm-mc -filetype=obj -triple=armv7-apple-watchos %t/no-objc.s -o %t/wrong-arch.o -# RUN: llvm-ar rcs %t/libHasSomeObjC.a %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/wrong-arch.o -# RUN: llvm-ar rcs %t/libHasSomeObjC2.a %t/no-objc.o %t/has-objc-symbol-and-category.o %t/has-swift.o %t/has-swift-proto.o %t/wrong-arch.o +# RUN: llvm-ar rcs %t/libHasSomeObjC.a %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/has-swift-ir-loaded.o %t/has-swift-ir-not-loaded.o %t/wrong-arch.o +# RUN: llvm-ar rcs %t/libHasSomeObjC2.a %t/no-objc.o %t/has-objc-symbol-and-category.o %t/has-swift.o %t/has-swift-proto.o %t/has-swift-ir-loaded.o %t/has-swift-ir-not-loaded.o %t/wrong-arch.o # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o @@ -20,7 +22,7 @@ # RUN: %lld -lSystem %t/test.o -o %t/test -L%t -lHasSomeObjC2 -ObjC # RUN: llvm-objdump --section-headers --syms %t/test | FileCheck %s --check-prefix=OBJC -# RUN: %no-fatal-warnings-lld -lSystem %t/test.o -o %t/test --start-lib %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/wrong-arch.o --end-lib -ObjC 2>&1 \ +# RUN: %no-fatal-warnings-lld -lSystem %t/test.o -o %t/test --start-lib %t/no-objc.o %t/has-objc-symbol.o %t/has-objc-category.o %t/has-swift.o %t/has-swift-proto.o %t/has-swift-ir-loaded.o %t/has-swift-ir-not-loaded.o %t/wrong-arch.o --end-lib -ObjC 2>&1 \ # RUN: | FileCheck -check-prefix=WARNING %s # RUN: llvm-objdump --section-headers --syms %t/test | FileCheck %s --check-prefix=OBJC @@ -36,6 +38,7 @@ # OBJC-NEXT: 4 has_objc_symbol {{.*}} DATA # OBJC-EMPTY: # OBJC-NEXT: SYMBOL TABLE: +# OBJC-DAG: g O __TEXT,__swift _foo # OBJC-DAG: g F __TEXT,__text _main # OBJC-DAG: g F __TEXT,__text _OBJC_CLASS_$_MyObject # OBJC-DAG: g O __TEXT,__swift5_fieldmd $s7somelib4Blah_pMF @@ -100,6 +103,20 @@ _has_dup: .section __TEXT,__swift .quad 0x1234 +#--- has-swift-ir-loaded.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "x86_64-apple-darwin" + +@foo = global i64 1234, section "__TEXT,__swift" +@llvm.used = appending global [1 x ptr] [ptr @foo] + +#--- has-swift-ir-not-loaded.ll +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "x86_64-apple-darwin" + +@bar = global i64 1234 +@llvm.used = appending global [1 x ptr] [ptr @bar] + #--- has-swift-proto.s .section __TEXT,__swift5_fieldmd 
.globl $s7somelib4Blah_pMF diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 19a15209f8b661..e64051cf538627 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -306,7 +306,8 @@ static Expected hasObjCCategoryInModule(BitstreamCursor &Stream) { return error("Invalid section name record"); // Check for the i386 and other (x86_64, ARM) conventions if (S.find("__DATA,__objc_catlist") != std::string::npos || - S.find("__OBJC,__category") != std::string::npos) + S.find("__OBJC,__category") != std::string::npos || + S.find("__TEXT,__swift") != std::string::npos) return true; break; } From f0e79db215ada7316b4d4046490ab715194a519a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 15 May 2024 08:58:07 -0700 Subject: [PATCH 49/71] [RISCV] Fix 80 columns in RISCVMatInt.cpp. NFC --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 0a304d4cb7d907..0a857eb96935ea 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -114,11 +114,13 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI, ShiftAmount = llvm::countr_zero((uint64_t)Val); Val >>= ShiftAmount; - // If the remaining bits don't fit in 12 bits, we might be able to reduce the - // shift amount in order to use LUI which will zero the lower 12 bits. + // If the remaining bits don't fit in 12 bits, we might be able to reduce + // the // shift amount in order to use LUI which will zero the lower 12 + // bits. if (ShiftAmount > 12 && !isInt<12>(Val)) { if (isInt<32>((uint64_t)Val << 12)) { - // Reduce the shift amount and add zeros to the LSBs so it will match LUI. + // Reduce the shift amount and add zeros to the LSBs so it will match + // LUI. 
ShiftAmount -= 12; Val = (uint64_t)Val << 12; } else if (isUInt<32>((uint64_t)Val << 12) && From 29c2475f215110d9e6b3955d5eb2832b3f719c2f Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 15 May 2024 18:34:59 +0200 Subject: [PATCH 50/71] [mlir] Fix the build after 03c53c69a367008da689f0d2940e2197eb4a955c --- mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp | 1 - mlir/lib/Target/LLVM/ROCDL/Target.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp index b07addc77b56cc..a4f19981eec38b 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp @@ -363,7 +363,6 @@ LogicalResult SerializeToHsacoPass::assembleIsa(const std::string &isa, mab->createObjectWriter(os), std::unique_ptr(ce), *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false)); - mcStreamer->setUseAssemblerInfoForParsing(true); std::unique_ptr parser( createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp index 66593fd8a55fa6..cc13e5b7436ea7 100644 --- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp +++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp @@ -299,7 +299,6 @@ SerializeGPUModuleBase::assembleIsa(StringRef isa) { mab->createObjectWriter(os), std::unique_ptr(ce), *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, /*DWARFMustBeAtTheEnd*/ false)); - mcStreamer->setUseAssemblerInfoForParsing(true); std::unique_ptr parser( createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); From 3f954f575156bce8ac81d6b4d94de443786befed Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Wed, 15 May 2024 12:33:54 -0400 Subject: [PATCH 51/71] Correct mismatched allocation/deallocation calls This amends dceaa0f4491ebe30c0b0f1bc7fa5ec365b60ced6 because ASAN caught an issue where the allocation and deallocation were not properly paired: https://lab.llvm.org/buildbot/#/builders/239/builds/7001 Use malloc and free throughout this file to ensure that all kinds of memory buffers use the proper pairing. --- llvm/lib/Support/MemoryBuffer.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index 50308bd2bf4a3b..fb7e804fd7e843 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -79,8 +79,16 @@ void *operator new(size_t N, const NamedBufferAlloc &Alloc) { SmallString<256> NameBuf; StringRef NameRef = Alloc.Name.toStringRef(NameBuf); - char *Mem = static_cast(operator new(N + sizeof(size_t) + - NameRef.size() + 1)); + // We use malloc() and manually handle it returning null instead of calling + // operator new because we need all uses of NamedBufferAlloc to be + // deallocated with a call to free() due to needing to use malloc() in + // WritableMemoryBuffer::getNewUninitMemBuffer() to work around the out-of- + // memory handler installed by default in LLVM. See operator delete() member + // functions within this file for the paired call to free(). 
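+  // Note: malloc returns null on failure instead of invoking LLVM's
+  // out-of-memory handler, hence the explicit report_bad_alloc_error call
+  // below.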
+  char *Mem =
+      static_cast<char *>(std::malloc(N + sizeof(size_t) + NameRef.size() + 1));
+  if (!Mem)
+    llvm::report_bad_alloc_error("Allocation failed");
   *reinterpret_cast<size_t *>(Mem + N) = NameRef.size();
   CopyStringRef(Mem + N + sizeof(size_t), NameRef);
   return Mem;
 }
@@ -225,7 +233,7 @@ class MemoryBufferMMapFile : public MB {

   /// Disable sized deallocation for MemoryBufferMMapFile, because it has
   /// tail-allocated data.
-  void operator delete(void *p) { ::operator delete(p); }
+  void operator delete(void *p) { std::free(p); }

   StringRef getBufferIdentifier() const override {
     // The name is stored after the class itself.

From 332de4b2677ce7a95cc2df30d761fbb55376fe07 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Wed, 15 May 2024 11:38:41 -0500
Subject: [PATCH 52/71] [Offload] Correctly reject building on unsupported architectures (#92276)

Summary:
Previously we had this `LIBOMPTARGET_ENABLED` variable which controlled
including `libomptarget`. This is now redundant since it's controlled
by `LLVM_ENABLE_RUNTIMES`. However, this had the extra effect of not
building it when given unsupported targets. This was lost during the
move to `offload`. This patch moves this logic back and makes the
`offload` target just quit without doing anything if used on an
unsupported architecture.

https://github.com/llvm/llvm-project/issues/91881
https://github.com/llvm/llvm-project/issues/91819

---------

Co-authored-by: Sylvestre Ledru
---
 offload/CMakeLists.txt | 30 ++++++++++--------------------
 openmp/CMakeLists.txt  | 12 ------------
 2 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 626df812506383..1d8cab240924e2 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -17,26 +17,16 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
   project(offload C CXX ASM)
 endif()

-set(ENABLE_LIBOMPTARGET ON)
-# Currently libomptarget cannot be compiled on Windows or MacOS X.
-# Since the device plugins are only supported on Linux anyway,
-# there is no point in trying to compile libomptarget on other OSes.
-# 32-bit systems are not supported either.
-if (APPLE OR WIN32 OR NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES OR NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-  set(ENABLE_LIBOMPTARGET OFF)
-endif()
-
-option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading."
-       ${ENABLE_LIBOMPTARGET})
-if (OPENMP_ENABLE_LIBOMPTARGET)
-  # Check that the library can actually be built.
-  if (APPLE OR WIN32)
-    message(FATAL_ERROR "libomptarget cannot be built on Windows and MacOS X!")
-  elseif (NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES)
-    message(FATAL_ERROR "Host compiler must support C++17 to build libomptarget!")
-  elseif (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
-    message(FATAL_ERROR "libomptarget on 32-bit systems are not supported!")
-  endif()
+# Check that the library can actually be built.
+if(APPLE OR WIN32 OR WASM) + message(WARNING "libomptarget cannot be built on Windows and MacOS X!") + return() +elseif(NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message(WARNING "Host compiler must support C++17 to build libomptarget!") + return() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + message(WARNING "libomptarget on 32-bit systems is not supported!") + return() endif() if(OPENMP_STANDALONE_BUILD) diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index 9097ca5623000c..33bfdc8630eff9 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -97,18 +97,6 @@ set(OPENMP_TEST_FLAGS "" CACHE STRING set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING "OpenMP compiler flag to use for testing OpenMP runtime libraries.") -set(ENABLE_LIBOMPTARGET ON) -# Currently libomptarget cannot be compiled on Windows or MacOS X. -# Since the device plugins are only supported on Linux anyway, -# there is no point in trying to compile libomptarget on other OSes. -# 32-bit systems are not supported either. -if (APPLE OR WIN32 OR WASM OR NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES - OR NOT CMAKE_SIZEOF_VOID_P EQUAL 8 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX") - set(ENABLE_LIBOMPTARGET OFF) -endif() - -option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading." - ${ENABLE_LIBOMPTARGET}) option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF) # Header install location From be10746f3a4381456eb5082a968766201c17ab5d Mon Sep 17 00:00:00 2001 From: John Ericson Date: Wed, 15 May 2024 12:43:55 -0400 Subject: [PATCH 53/71] [clang] Don't assume location of compiler-rt for OpenBSD (#92183) If the `/usr/lib/...` path where compiler-rt is conventionally installed on OpenBSD does not exist, fall back to the regular logic to find it. This is a minimal change to allow OpenBSD cross compilation from a toolchain that doesn't adopt all of OpenBSD's monorepo's conventions. --- clang/lib/Driver/ToolChains/OpenBSD.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index e20d9fb1cfc417..3770471bae7c0d 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -375,7 +375,8 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component, if (Component == "builtins") { SmallString<128> Path(getDriver().SysRoot); llvm::sys::path::append(Path, "/usr/lib/libcompiler_rt.a"); - return std::string(Path); + if (getVFS().exists(Path)) + return std::string(Path); } SmallString<128> P(getDriver().ResourceDir); std::string CRTBasename = From e2d74a25eb562b117974add098ba2b9dd4cfc7f5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 May 2024 17:46:49 +0100 Subject: [PATCH 54/71] [X86] EmitCmp - always use cmpw with foldable loads (#92251) By default, EmitCmp avoids cmpw with i16 immediates due to 66/67h length-changing prefixes causing stalls, instead extending the value to i32 and using a cmpl with a i32 immediate, unless it has the TuningFastImm16 flag or we're building for optsize/minsize. However, if we're loading the value for comparison, the performance costs of the decode stalls are likely to be exceeded by the impact of the load latency of the folded load, the shorter encoding and not needing an extra register to store the ext-load. This matches the behaviour of gcc and msvc. 
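
As a rough illustration (a reduced sketch, not part of this change;
the function and file names are made up), the effect on an i16
equality compare against a loaded value:

```c++
// sketch.cpp -- reduced example only.
// Before: the 16-bit load was zero-extended and compared as i32:
//   movzwl (%rdi), %eax
//   cmpl   $1024, %eax
// After: the load stays folded into a single 16-bit compare:
//   cmpw   $1024, (%rdi)
bool cmp16_load_eq(const unsigned short *p) { return *p == 1024; }
```
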
Fixes #90355 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/test/CodeGen/X86/cmp16.ll | 196 ++++-------------- .../CodeGen/X86/memcmp-more-load-pairs-x32.ll | 3 +- .../CodeGen/X86/memcmp-more-load-pairs.ll | 3 +- llvm/test/CodeGen/X86/memcmp-optsize-x32.ll | 3 +- llvm/test/CodeGen/X86/memcmp-optsize.ll | 3 +- llvm/test/CodeGen/X86/memcmp-pgso-x32.ll | 3 +- llvm/test/CodeGen/X86/memcmp-pgso.ll | 3 +- llvm/test/CodeGen/X86/memcmp-x32.ll | 3 +- llvm/test/CodeGen/X86/memcmp.ll | 3 +- 10 files changed, 60 insertions(+), 169 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a57c10e784d9c8..e7c70e3872ad13 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22692,11 +22692,14 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); // Only promote the compare up to I32 if it is a 16 bit operation - // with an immediate. 16 bit immediates are to be avoided. + // with an immediate. 16 bit immediates are to be avoided unless the target + // isn't slowed down by length changing prefixes, we're optimizing for + // codesize or the comparison is with a folded load. if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() && + !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) && !DAG.getMachineFunction().getFunction().hasMinSize()) { - ConstantSDNode *COp0 = dyn_cast(Op0); - ConstantSDNode *COp1 = dyn_cast(Op1); + auto *COp0 = dyn_cast(Op0); + auto *COp1 = dyn_cast(Op1); // Don't do this if the immediate can fit in 8-bits. if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll index 699ea3e4dd4734..fa9e75ff16a5ca 100644 --- a/llvm/test/CodeGen/X86/cmp16.ll +++ b/llvm/test/CodeGen/X86/cmp16.ll @@ -113,8 +113,7 @@ define i1 @cmp16_reg_eq_imm8(i16 %a0) { define i1 @cmp16_reg_eq_imm16(i16 %a0) { ; X86-GENERIC-LABEL: cmp16_reg_eq_imm16: ; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $1024, %eax # imm = 0x400 +; X86-GENERIC-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 ; X86-GENERIC-NEXT: sete %al ; X86-GENERIC-NEXT: retl ; @@ -177,12 +176,11 @@ define i1 @cmp16_reg_eq_imm16_minsize(i16 %a0) minsize { } define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize { -; X86-GENERIC-LABEL: cmp16_reg_eq_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $1024, %eax # imm = 0x400 -; X86-GENERIC-NEXT: sete %al -; X86-GENERIC-NEXT: retl +; X86-LABEL: cmp16_reg_eq_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 +; X86-NEXT: sete %al +; X86-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_reg_eq_imm16_optsize: ; X64-GENERIC: # %bb.0: @@ -191,24 +189,12 @@ define i1 @cmp16_reg_eq_imm16_optsize(i16 %a0) optsize { ; X64-GENERIC-NEXT: sete %al ; X64-GENERIC-NEXT: retq ; -; X86-FAST-LABEL: cmp16_reg_eq_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 -; X86-FAST-NEXT: sete %al -; X86-FAST-NEXT: retl -; ; X64-FAST-LABEL: cmp16_reg_eq_imm16_optsize: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: cmpw $1024, %di # imm = 0x400 ; X64-FAST-NEXT: sete %al ; X64-FAST-NEXT: retq ; -; X86-ATOM-LABEL: cmp16_reg_eq_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: cmpw $1024, {{[0-9]+}}(%esp) # imm = 0x400 -; 
X86-ATOM-NEXT: sete %al -; X86-ATOM-NEXT: retl -; ; X64-ATOM-LABEL: cmp16_reg_eq_imm16_optsize: ; X64-ATOM: # %bb.0: ; X64-ATOM-NEXT: cmpw $1024, %di # imm = 0x400 @@ -269,8 +255,7 @@ define i1 @cmp16_reg_sgt_imm8(i16 %a0) { define i1 @cmp16_reg_sgt_imm16(i16 %a0) { ; X86-GENERIC-LABEL: cmp16_reg_sgt_imm16: ; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $-1023, %eax # imm = 0xFC01 +; X86-GENERIC-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 ; X86-GENERIC-NEXT: setge %al ; X86-GENERIC-NEXT: retl ; @@ -333,12 +318,11 @@ define i1 @cmp16_reg_sgt_imm16_minsize(i16 %a0) minsize { } define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize { -; X86-GENERIC-LABEL: cmp16_reg_sgt_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $-1023, %eax # imm = 0xFC01 -; X86-GENERIC-NEXT: setge %al -; X86-GENERIC-NEXT: retl +; X86-LABEL: cmp16_reg_sgt_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 +; X86-NEXT: setge %al +; X86-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_reg_sgt_imm16_optsize: ; X64-GENERIC: # %bb.0: @@ -347,24 +331,12 @@ define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize { ; X64-GENERIC-NEXT: setge %al ; X64-GENERIC-NEXT: retq ; -; X86-FAST-LABEL: cmp16_reg_sgt_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 -; X86-FAST-NEXT: setge %al -; X86-FAST-NEXT: retl -; ; X64-FAST-LABEL: cmp16_reg_sgt_imm16_optsize: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: cmpw $-1023, %di # imm = 0xFC01 ; X64-FAST-NEXT: setge %al ; X64-FAST-NEXT: retq ; -; X86-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: cmpw $-1023, {{[0-9]+}}(%esp) # imm = 0xFC01 -; X86-ATOM-NEXT: setge %al -; X86-ATOM-NEXT: retl -; ; X64-ATOM-LABEL: cmp16_reg_sgt_imm16_optsize: ; X64-ATOM: # %bb.0: ; X64-ATOM-NEXT: cmpw $-1023, %di # imm = 0xFC01 @@ -377,8 +349,7 @@ define i1 @cmp16_reg_sgt_imm16_optsize(i16 %a0) optsize { define i1 @cmp16_reg_uge_imm16(i16 %a0) { ; X86-GENERIC-LABEL: cmp16_reg_uge_imm16: ; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $64512, %eax # imm = 0xFC00 +; X86-GENERIC-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 ; X86-GENERIC-NEXT: setae %al ; X86-GENERIC-NEXT: retl ; @@ -441,12 +412,11 @@ define i1 @cmp16_reg_uge_imm16_minsize(i16 %a0) minsize { } define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize { -; X86-GENERIC-LABEL: cmp16_reg_uge_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: cmpl $64512, %eax # imm = 0xFC00 -; X86-GENERIC-NEXT: setae %al -; X86-GENERIC-NEXT: retl +; X86-LABEL: cmp16_reg_uge_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 +; X86-NEXT: setae %al +; X86-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_reg_uge_imm16_optsize: ; X64-GENERIC: # %bb.0: @@ -455,24 +425,12 @@ define i1 @cmp16_reg_uge_imm16_optsize(i16 %a0) optsize { ; X64-GENERIC-NEXT: setae %al ; X64-GENERIC-NEXT: retq ; -; X86-FAST-LABEL: cmp16_reg_uge_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 -; X86-FAST-NEXT: setae %al -; X86-FAST-NEXT: retl -; ; X64-FAST-LABEL: cmp16_reg_uge_imm16_optsize: ; X64-FAST: # %bb.0: ; X64-FAST-NEXT: cmpw $-1024, %di # imm = 0xFC00 ; X64-FAST-NEXT: setae %al ; X64-FAST-NEXT: retq ; -; X86-ATOM-LABEL: cmp16_reg_uge_imm16_optsize: -; X86-ATOM: # 
%bb.0: -; X86-ATOM-NEXT: cmpw $-1024, {{[0-9]+}}(%esp) # imm = 0xFC00 -; X86-ATOM-NEXT: setae %al -; X86-ATOM-NEXT: retl -; ; X64-ATOM-LABEL: cmp16_reg_uge_imm16_optsize: ; X64-ATOM: # %bb.0: ; X64-ATOM-NEXT: cmpw $-1024, %di # imm = 0xFC00 @@ -592,15 +550,13 @@ define i1 @cmp16_load_ne_imm16(ptr %p0) { ; X86-GENERIC-LABEL: cmp16_load_ne_imm16: ; X86-GENERIC: # %bb.0: ; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movzwl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X86-GENERIC-NEXT: cmpw $512, (%eax) # imm = 0x200 ; X86-GENERIC-NEXT: setne %al ; X86-GENERIC-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_load_ne_imm16: ; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movzwl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X64-GENERIC-NEXT: cmpw $512, (%rdi) # imm = 0x200 ; X64-GENERIC-NEXT: setne %al ; X64-GENERIC-NEXT: retq ; @@ -694,15 +650,13 @@ define i1 @cmp16_load_slt_imm16(ptr %p0) { ; X86-GENERIC-LABEL: cmp16_load_slt_imm16: ; X86-GENERIC: # %bb.0: ; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movswl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X86-GENERIC-NEXT: cmpw $512, (%eax) # imm = 0x200 ; X86-GENERIC-NEXT: setl %al ; X86-GENERIC-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_load_slt_imm16: ; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movswl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 +; X64-GENERIC-NEXT: cmpw $512, (%rdi) # imm = 0x200 ; X64-GENERIC-NEXT: setl %al ; X64-GENERIC-NEXT: retq ; @@ -761,46 +715,18 @@ define i1 @cmp16_load_slt_imm16_minsize(ptr %p0) minsize { } define i1 @cmp16_load_slt_imm16_optsize(ptr %p0) optsize { -; X86-GENERIC-LABEL: cmp16_load_slt_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movswl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 -; X86-GENERIC-NEXT: setl %al -; X86-GENERIC-NEXT: retl -; -; X64-GENERIC-LABEL: cmp16_load_slt_imm16_optsize: -; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movswl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $512, %eax # imm = 0x200 -; X64-GENERIC-NEXT: setl %al -; X64-GENERIC-NEXT: retq -; -; X86-FAST-LABEL: cmp16_load_slt_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: cmpw $512, (%eax) # imm = 0x200 -; X86-FAST-NEXT: setl %al -; X86-FAST-NEXT: retl -; -; X64-FAST-LABEL: cmp16_load_slt_imm16_optsize: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: cmpw $512, (%rdi) # imm = 0x200 -; X64-FAST-NEXT: setl %al -; X64-FAST-NEXT: retq -; -; X86-ATOM-LABEL: cmp16_load_slt_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-ATOM-NEXT: cmpw $512, (%eax) # imm = 0x200 -; X86-ATOM-NEXT: setl %al -; X86-ATOM-NEXT: retl +; X86-LABEL: cmp16_load_slt_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $512, (%eax) # imm = 0x200 +; X86-NEXT: setl %al +; X86-NEXT: retl ; -; X64-ATOM-LABEL: cmp16_load_slt_imm16_optsize: -; X64-ATOM: # %bb.0: -; X64-ATOM-NEXT: cmpw $512, (%rdi) # imm = 0x200 -; X64-ATOM-NEXT: setl %al -; X64-ATOM-NEXT: retq +; X64-LABEL: cmp16_load_slt_imm16_optsize: +; X64: # %bb.0: +; X64-NEXT: cmpw $512, (%rdi) # imm = 0x200 +; X64-NEXT: setl %al +; X64-NEXT: retq %ld = load i16, ptr %p0 %cmp = icmp slt i16 %ld, 512 ret i1 %cmp @@ -860,15 +786,13 @@ define i1 @cmp16_load_ule_imm16(ptr %p0) { ; X86-GENERIC-LABEL: cmp16_load_ule_imm16: ; X86-GENERIC: # %bb.0: ; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-GENERIC-NEXT: movzwl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 +; X86-GENERIC-NEXT: cmpw $513, (%eax) # imm = 0x201 ; X86-GENERIC-NEXT: setb %al ; X86-GENERIC-NEXT: retl ; ; X64-GENERIC-LABEL: cmp16_load_ule_imm16: ; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movzwl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 +; X64-GENERIC-NEXT: cmpw $513, (%rdi) # imm = 0x201 ; X64-GENERIC-NEXT: setb %al ; X64-GENERIC-NEXT: retq ; @@ -927,46 +851,18 @@ define i1 @cmp16_load_ule_imm16_minsize(ptr %p0) minsize { } define i1 @cmp16_load_ule_imm16_optsize(ptr %p0) optsize { -; X86-GENERIC-LABEL: cmp16_load_ule_imm16_optsize: -; X86-GENERIC: # %bb.0: -; X86-GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-GENERIC-NEXT: movzwl (%eax), %eax -; X86-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 -; X86-GENERIC-NEXT: setb %al -; X86-GENERIC-NEXT: retl -; -; X64-GENERIC-LABEL: cmp16_load_ule_imm16_optsize: -; X64-GENERIC: # %bb.0: -; X64-GENERIC-NEXT: movzwl (%rdi), %eax -; X64-GENERIC-NEXT: cmpl $513, %eax # imm = 0x201 -; X64-GENERIC-NEXT: setb %al -; X64-GENERIC-NEXT: retq -; -; X86-FAST-LABEL: cmp16_load_ule_imm16_optsize: -; X86-FAST: # %bb.0: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: cmpw $513, (%eax) # imm = 0x201 -; X86-FAST-NEXT: setb %al -; X86-FAST-NEXT: retl -; -; X64-FAST-LABEL: cmp16_load_ule_imm16_optsize: -; X64-FAST: # %bb.0: -; X64-FAST-NEXT: cmpw $513, (%rdi) # imm = 0x201 -; X64-FAST-NEXT: setb %al -; X64-FAST-NEXT: retq -; -; X86-ATOM-LABEL: cmp16_load_ule_imm16_optsize: -; X86-ATOM: # %bb.0: -; X86-ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-ATOM-NEXT: cmpw $513, (%eax) # imm = 0x201 -; X86-ATOM-NEXT: setb %al -; X86-ATOM-NEXT: retl +; X86-LABEL: cmp16_load_ule_imm16_optsize: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw $513, (%eax) # imm = 0x201 +; X86-NEXT: setb %al +; X86-NEXT: retl ; -; X64-ATOM-LABEL: cmp16_load_ule_imm16_optsize: -; X64-ATOM: # %bb.0: -; X64-ATOM-NEXT: cmpw $513, (%rdi) # imm = 0x201 -; X64-ATOM-NEXT: setb %al -; X64-ATOM-NEXT: retq +; X64-LABEL: cmp16_load_ule_imm16_optsize: +; X64: # %bb.0: +; X64-NEXT: cmpw $513, (%rdi) # imm = 0x201 +; X64-NEXT: setb %al +; X64-NEXT: retq %ld = load i16, ptr %p0 %cmp = icmp ule i16 %ld, 512 ret i1 %cmp diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll index 0253d131226083..ee5fd78c643793 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -116,8 +116,7 @@ define i1 @length2_eq_const(ptr %X) nounwind { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index da46ea40655791..a46f9ed3d3798d 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -113,8 +113,7 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind { define i1 @length2_eq_const(ptr %X) nounwind { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm 
= 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll index 3db6ae8b76b29e..4a9643c0f4fc89 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll @@ -45,8 +45,7 @@ define i1 @length2_eq_const(ptr %X) nounwind optsize { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll index edd61641ad2aa4..4e27301436c344 100644 --- a/llvm/test/CodeGen/X86/memcmp-optsize.ll +++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll @@ -41,8 +41,7 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize { define i1 @length2_eq_const(ptr %X) nounwind optsize { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll index 1c301da26beaa5..bdb50f5b60c49a 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll @@ -45,8 +45,7 @@ define i1 @length2_eq_const(ptr %X) nounwind !prof !14 { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll index 1ee3317b9c9694..9347e542202208 100644 --- a/llvm/test/CodeGen/X86/memcmp-pgso.ll +++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll @@ -41,8 +41,7 @@ define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 { define i1 @length2_eq_const(ptr %X) nounwind !prof !14 { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll index a63402cea20962..ad9f2a30d75bb7 100644 --- a/llvm/test/CodeGen/X86/memcmp-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-x32.ll @@ -144,8 +144,7 @@ define i1 @length2_eq_const(ptr %X) nounwind { ; X86-LABEL: length2_eq_const: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231 ; X86-NEXT: setne %al ; X86-NEXT: retl %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) 
nounwind diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index 83cb0d6f973be5..8fe1a581cd9c2b 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -139,8 +139,7 @@ define i1 @length2_gt(ptr %X, ptr %Y) nounwind { define i1 @length2_eq_const(ptr %X) nounwind { ; X64-LABEL: length2_eq_const: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: cmpl $12849, %eax # imm = 0x3231 +; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231 ; X64-NEXT: setne %al ; X64-NEXT: retq %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind From 34f33babc28d240d4ceee69f9afe7d6f5e8ac29b Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Wed, 15 May 2024 20:48:16 +0400 Subject: [PATCH 55/71] [lldb] Fixed the TestGdbRemoteCompletion test (#92268) Do not try to run lldb-server on localhost in case of the remote target. --- lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py index 04d6abe9d88c15..58373d2f85bb99 100644 --- a/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py +++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteCompletion.py @@ -26,6 +26,7 @@ def init_lldb_server(self): def generate_hex_path(self, target): return str(os.path.join(self.getBuildDir(), target)).encode().hex() + @skipIfRemote @add_test_categories(["llgs"]) def test_autocomplete_path(self): self.build() From fc1df55bcf9b6cc2dec157bcd188b471bc91b945 Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Wed, 15 May 2024 20:50:58 +0400 Subject: [PATCH 56/71] [lldb][Windows] Fixed the test gdb_remote_client/TestGDBRemotePlatformFile (#92088) The tests `test_file_permissions` and `test_file_permissions_fallback` are disabled for Windows target. These tests use MockGDBServerResponder and do not depend on the real target. These tests failed in case of Windows host and Linux target. Disable them for Windows host too. --- .../gdb_remote_client/TestGDBRemotePlatformFile.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py index 2be5ae3132038d..c902722a2f74bb 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestGDBRemotePlatformFile.py @@ -147,7 +147,9 @@ def vFile(self, packet): log=server2.responder.packetLog, ) - @skipIfWindows + @expectedFailureAll( + hostoslist=["windows"], bugnumber="github.com/llvm/llvm-project/issues/92255" + ) def test_file_permissions(self): """Test 'platform get-permissions'""" @@ -167,7 +169,9 @@ def vFile(self, packet): ] ) - @skipIfWindows + @expectedFailureAll( + hostoslist=["windows"], bugnumber="github.com/llvm/llvm-project/issues/92255" + ) def test_file_permissions_fallback(self): """Test 'platform get-permissions' fallback to fstat""" From 2ec85713bd910c5b22ce090798ca00f742d5eb14 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 15 May 2024 11:57:48 -0500 Subject: [PATCH 57/71] [OpenMP] Add back in `ENABLE_LIBOMPTARGET' definition Summary: Even though we moved `libomptarget` this is still present in `omp.h` and can't be removed. 
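
For context, a hedged sketch of the kind of guard involved
(illustrative declarations only, not the literal contents of the
generated header; the gated functions shown are just examples):

```c++
#include <cstddef>

// A configure-time ENABLE_LIBOMPTARGET definition can gate
// offload-related declarations in the installed omp.h, which is why
// the CMake variable has to stay even though libomptarget itself
// moved to the offload project.
#if ENABLE_LIBOMPTARGET
extern "C" void *omp_target_alloc(std::size_t size, int device_num);
extern "C" void omp_target_free(void *device_ptr, int device_num);
#endif
```
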
--- openmp/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index 33bfdc8630eff9..9097ca5623000c 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -97,6 +97,18 @@ set(OPENMP_TEST_FLAGS "" CACHE STRING set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING "OpenMP compiler flag to use for testing OpenMP runtime libraries.") +set(ENABLE_LIBOMPTARGET ON) +# Currently libomptarget cannot be compiled on Windows or MacOS X. +# Since the device plugins are only supported on Linux anyway, +# there is no point in trying to compile libomptarget on other OSes. +# 32-bit systems are not supported either. +if (APPLE OR WIN32 OR WASM OR NOT "cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES + OR NOT CMAKE_SIZEOF_VOID_P EQUAL 8 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(ENABLE_LIBOMPTARGET OFF) +endif() + +option(OPENMP_ENABLE_LIBOMPTARGET "Enable building libomptarget for offloading." + ${ENABLE_LIBOMPTARGET}) option(OPENMP_ENABLE_LIBOMP_PROFILING "Enable time profiling for libomp." OFF) # Header install location From 4525f442fadb7cc44cc2eaede2c8ac6ba15bdf78 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 15 May 2024 12:01:16 -0500 Subject: [PATCH 58/71] [flang][OpenMP] Don't pass clauses to op-generating functions anymore (#90108) Remove parameter `const List &clauses` from functions that take construct queue. The clauses should now be accessed from the construct queue. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 232 +++++++++++++----------------- 1 file changed, 103 insertions(+), 129 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index f21acdd64d7c3e..f05cf1f5120f8d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1233,8 +1233,7 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, + const ConstructQueue &queue, ConstructQueue::iterator item, const std::optional &name) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::FlatSymbolRefAttr nameAttr; @@ -1245,8 +1244,8 @@ genCriticalOp(Fortran::lower::AbstractConverter &converter, auto global = mod.lookupSymbol(nameStr); if (!global) { mlir::omp::CriticalClauseOps clauseOps; - genCriticalDeclareClauses(converter, semaCtx, clauses, loc, clauseOps, - nameStr); + genCriticalDeclareClauses(converter, semaCtx, item->clauses, loc, + clauseOps, nameStr); mlir::OpBuilder modBuilder(mod.getBodyRegion()); global = modBuilder.create(loc, clauseOps); @@ -1266,8 +1265,7 @@ genDistributeOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Distribute construct"); return nullptr; } @@ -1277,10 +1275,11 @@ genFlushOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const ObjectList &objects, const List &clauses, - const ConstructQueue &queue, ConstructQueue::iterator item) { + 
const ObjectList &objects, const ConstructQueue &queue, + ConstructQueue::iterator item) { llvm::SmallVector operandRange; - genFlushClauses(converter, semaCtx, objects, clauses, loc, operandRange); + genFlushClauses(converter, semaCtx, objects, item->clauses, loc, + operandRange); return converter.getFirOpBuilder().create( converter.getCurrentLocation(), operandRange); @@ -1291,8 +1290,7 @@ genMasterOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_master), @@ -1304,8 +1302,7 @@ genOrderedOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "OMPD_ordered"); return nullptr; } @@ -1315,10 +1312,9 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::OrderedRegionClauseOps clauseOps; - genOrderedRegionClauses(converter, semaCtx, clauses, loc, clauseOps); + genOrderedRegionClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, @@ -1331,15 +1327,15 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, bool outerCombined = false) { + const ConstructQueue &queue, ConstructQueue::iterator item, + bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; mlir::omp::ParallelClauseOps clauseOps; llvm::SmallVector privateSyms; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - genParallelClauses(converter, semaCtx, stmtCtx, clauses, loc, + genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, /*processReduction=*/!outerCombined, clauseOps, reductionTypes, reductionSyms); @@ -1352,7 +1348,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_parallel) .setOuterCombined(outerCombined) - .setClauses(&clauses) + .setClauses(&item->clauses) .setReductions(&reductionSyms, &reductionTypes) .setGenRegionEntryCb(reductionCallback); @@ -1361,7 +1357,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, clauseOps); bool privatize = !outerCombined; - DataSharingProcessor dsp(converter, semaCtx, clauses, eval, + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*useDelayedPrivatization=*/true, &symTable); if (privatize) @@ -1414,14 +1410,13 @@ 
genSectionOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { // Currently only private/firstprivate clause is handled, and // all privatization is done within `omp.section` operations. return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_section) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item); } @@ -1430,22 +1425,21 @@ genSectionsOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::SectionsClauseOps clauseOps; - genSectionsClauses(converter, semaCtx, clauses, loc, clauseOps); + genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps); auto &builder = converter.getFirOpBuilder(); // Insert privatizations before SECTIONS symTable.pushScope(); - DataSharingProcessor dsp(converter, semaCtx, clauses, eval); + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval); dsp.processStep1(); List nonDsaClauses; List lastprivates; - for (const Clause &clause : clauses) { + for (const Clause &clause : item->clauses) { if (clause.id == llvm::omp::Clause::OMPC_lastprivate) { lastprivates.push_back(&std::get(clause.u)); } else { @@ -1508,18 +1502,18 @@ genSimdOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, clauses, eval); + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval); dsp.processStep1(); Fortran::lower::StatementContext stmtCtx; mlir::omp::LoopNestClauseOps loopClauseOps; mlir::omp::SimdClauseOps simdClauseOps; llvm::SmallVector iv; - genLoopNestClauses(converter, semaCtx, eval, clauses, loc, loopClauseOps, iv); - genSimdClauses(converter, semaCtx, clauses, loc, simdClauseOps); + genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, + loopClauseOps, iv); + genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps); // Create omp.simd wrapper. auto simdOp = firOpBuilder.create(loc, simdClauseOps); @@ -1532,7 +1526,8 @@ genSimdOp(Fortran::lower::AbstractConverter &converter, // Create nested omp.loop_nest and fill body with loop contents. 
auto loopOp = firOpBuilder.create(loc, loopClauseOps); - auto *nestedEval = getCollapsedLoopEval(eval, getCollapseValue(clauses)); + auto *nestedEval = + getCollapsedLoopEval(eval, getCollapseValue(item->clauses)); auto ivCallback = [&](mlir::Operation *op) { genLoopVars(op, converter, loc, iv); @@ -1542,7 +1537,7 @@ genSimdOp(Fortran::lower::AbstractConverter &converter, createBodyOfOp(*loopOp, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, *nestedEval, llvm::omp::Directive::OMPD_simd) - .setClauses(&clauses) + .setClauses(&item->clauses) .setDataSharingProcessor(&dsp) .setGenRegionEntryCb(ivCallback), queue, item); @@ -1555,15 +1550,14 @@ genSingleOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::SingleClauseOps clauseOps; - genSingleClauses(converter, semaCtx, clauses, loc, clauseOps); + genSingleClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_single) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1572,8 +1566,8 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, bool outerCombined = false) { + const ConstructQueue &queue, ConstructQueue::iterator item, + bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; @@ -1586,7 +1580,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, deviceAddrSyms; llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; - genTargetClauses(converter, semaCtx, stmtCtx, clauses, loc, + genTargetClauses(converter, semaCtx, stmtCtx, item->clauses, loc, processHostOnlyClauses, /*processReduction=*/outerCombined, clauseOps, mapSyms, mapLocs, mapTypes, deviceAddrSyms, deviceAddrLocs, deviceAddrTypes, devicePtrSyms, @@ -1690,15 +1684,14 @@ genTargetDataOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { Fortran::lower::StatementContext stmtCtx; mlir::omp::TargetDataClauseOps clauseOps; llvm::SmallVector useDeviceTypes; llvm::SmallVector useDeviceLocs; llvm::SmallVector useDeviceSyms; - genTargetDataClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps, - useDeviceTypes, useDeviceLocs, useDeviceSyms); + genTargetDataClauses(converter, semaCtx, stmtCtx, item->clauses, loc, + clauseOps, useDeviceTypes, useDeviceLocs, useDeviceSyms); auto targetDataOp = converter.getFirOpBuilder().create(loc, @@ -1714,8 +1707,7 @@ static OpTy genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location loc, const List &clauses, - const 
ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; @@ -1733,8 +1725,8 @@ genTargetEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, } mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; - genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, clauses, loc, - directive, clauseOps); + genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, + item->clauses, loc, directive, clauseOps); return firOpBuilder.create(loc, clauseOps); } @@ -1744,16 +1736,15 @@ genTaskOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { Fortran::lower::StatementContext stmtCtx; mlir::omp::TaskClauseOps clauseOps; - genTaskClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps); + genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_task) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1762,15 +1753,14 @@ genTaskgroupOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::TaskgroupClauseOps clauseOps; - genTaskgroupClauses(converter, semaCtx, clauses, loc, clauseOps); + genTaskgroupClauses(converter, semaCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_taskgroup) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1779,8 +1769,7 @@ genTaskloopOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Taskloop construct"); } @@ -1789,10 +1778,9 @@ genTaskwaitOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { mlir::omp::TaskwaitClauseOps clauseOps; - genTaskwaitClauses(converter, semaCtx, clauses, loc, clauseOps); + genTaskwaitClauses(converter, semaCtx, item->clauses, loc, clauseOps); return converter.getFirOpBuilder().create(loc, clauseOps); } @@ -1811,17 +1799,17 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item, bool outerCombined = false) { + 
const ConstructQueue &queue, ConstructQueue::iterator item, + bool outerCombined = false) { Fortran::lower::StatementContext stmtCtx; mlir::omp::TeamsClauseOps clauseOps; - genTeamsClauses(converter, semaCtx, stmtCtx, clauses, loc, clauseOps); + genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_teams) .setOuterCombined(outerCombined) - .setClauses(&clauses), + .setClauses(&item->clauses), queue, item, clauseOps); } @@ -1830,10 +1818,9 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, clauses, eval); + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval); dsp.processStep1(); Fortran::lower::StatementContext stmtCtx; @@ -1842,8 +1829,9 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - genLoopNestClauses(converter, semaCtx, eval, clauses, loc, loopClauseOps, iv); - genWsloopClauses(converter, semaCtx, stmtCtx, clauses, loc, wsClauseOps, + genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc, + loopClauseOps, iv); + genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, wsClauseOps, reductionTypes, reductionSyms); // Create omp.wsloop wrapper and populate entry block arguments with reduction @@ -1858,7 +1846,8 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, // Create nested omp.loop_nest and fill body with loop contents. 
auto loopOp = firOpBuilder.create(loc, loopClauseOps); - auto *nestedEval = getCollapsedLoopEval(eval, getCollapseValue(clauses)); + auto *nestedEval = + getCollapsedLoopEval(eval, getCollapseValue(item->clauses)); auto ivCallback = [&](mlir::Operation *op) { genLoopVars(op, converter, loc, iv, reductionSyms, @@ -1869,7 +1858,7 @@ genWsloopOp(Fortran::lower::AbstractConverter &converter, createBodyOfOp(*loopOp, OpWithBodyGenInfo(converter, symTable, semaCtx, loc, *nestedEval, llvm::omp::Directive::OMPD_do) - .setClauses(&clauses) + .setClauses(&item->clauses) .setDataSharingProcessor(&dsp) .setReductions(&reductionSyms, &reductionTypes) .setGenRegionEntryCb(ivCallback), @@ -1886,8 +1875,7 @@ static void genCompositeDistributeParallelDo( Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite DISTRIBUTE PARALLEL DO"); } @@ -1896,8 +1884,7 @@ static void genCompositeDistributeParallelDoSimd( Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, mlir::Location loc, - const List &clauses, const ConstructQueue &queue, - ConstructQueue::iterator item) { + const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite DISTRIBUTE PARALLEL DO SIMD"); } @@ -1906,8 +1893,7 @@ genCompositeDistributeSimd(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, - mlir::Location loc, const List &clauses, - const ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite DISTRIBUTE SIMD"); } @@ -1916,10 +1902,9 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, - mlir::Location loc, const List &clauses, - const ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { - ClauseProcessor cp(converter, semaCtx, clauses); + ClauseProcessor cp(converter, semaCtx, item->clauses); cp.processTODO( loc, llvm::omp::OMPD_do_simd); @@ -1931,7 +1916,7 @@ static void genCompositeDoSimd(Fortran::lower::AbstractConverter &converter, // When support for vectorization is enabled, then we need to add handling of // if clause. Currently if clause can be skipped because we always assume // SIMD length = 1. 
- genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genWsloopOp(converter, symTable, semaCtx, eval, loc, queue, item); } static void @@ -1939,8 +1924,7 @@ genCompositeTaskloopSimd(Fortran::lower::AbstractConverter &converter, Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, - mlir::Location loc, const List &clauses, - const ConstructQueue &queue, + mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { TODO(loc, "Composite TASKLOOP SIMD"); } @@ -1956,18 +1940,16 @@ static void genOMPDispatch(Fortran::lower::AbstractConverter &converter, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::iterator item) { assert(item != queue.end()); - const List &clauses = item->clauses; switch (llvm::omp::Directive dir = item->id) { case llvm::omp::Directive::OMPD_barrier: genBarrierOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_distribute: - genDistributeOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genDistributeOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_do: - genWsloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genWsloopOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_loop: case llvm::omp::Directive::OMPD_masked: @@ -1975,71 +1957,64 @@ static void genOMPDispatch(Fortran::lower::AbstractConverter &converter, llvm::omp::getOpenMPDirectiveName(dir) + ")"); break; case llvm::omp::Directive::OMPD_master: - genMasterOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genMasterOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_ordered: // Block-associated "ordered" construct. 
- genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_parallel: - genParallelOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item, + genParallelOp(converter, symTable, semaCtx, eval, loc, queue, item, /*outerCombined=*/false); break; case llvm::omp::Directive::OMPD_section: - genSectionOp(converter, symTable, semaCtx, eval, loc, /*clauses=*/{}, queue, - item); + genSectionOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_sections: - genSectionsOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genSectionsOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_simd: - genSimdOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genSimdOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_single: - genSingleOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_target: - genTargetOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item, + genTargetOp(converter, symTable, semaCtx, eval, loc, queue, item, /*outerCombined=*/false); break; case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTargetDataOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_enter_data: genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, clauses, queue, item); + converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_exit_data: genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, clauses, queue, item); + converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_update: genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, clauses, queue, item); + converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_task: - genTaskOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genTaskOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskgroup: - genTaskgroupOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTaskgroupOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskloop: - genTaskloopOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTaskloopOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskwait: - genTaskwaitOp(converter, symTable, semaCtx, eval, loc, clauses, queue, - item); + genTaskwaitOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_taskyield: genTaskyieldOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_teams: - genTeamsOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item); + genTeamsOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_tile: case llvm::omp::Directive::OMPD_unroll: @@ -2050,29 +2025,28 @@ static void genOMPDispatch(Fortran::lower::AbstractConverter &converter, // FIXME: Workshare is not a commonly used OpenMP construct, 
an
  // implementation for this feature will come later. For the codes
  // that use this construct, add a single construct for now.
-    genSingleOp(converter, symTable, semaCtx, eval, loc, clauses, queue, item);
+    genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;

   // Composite constructs
   case llvm::omp::Directive::OMPD_distribute_parallel_do:
     genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval, loc,
-                                     clauses, queue, item);
+                                     queue, item);
     break;
   case llvm::omp::Directive::OMPD_distribute_parallel_do_simd:
     genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval,
-                                         loc, clauses, queue, item);
+                                         loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_distribute_simd:
-    genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, clauses,
-                               queue, item);
+    genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, queue,
+                               item);
     break;
   case llvm::omp::Directive::OMPD_do_simd:
-    genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, clauses, queue,
-                       item);
+    genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_taskloop_simd:
-    genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, clauses,
-                             queue, item);
+    genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, queue,
+                             item);
     break;
   default:
     break;
@@ -2194,8 +2168,8 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
       eval, directive.source, directive.v, clauses)};
   if (directive.v == llvm::omp::Directive::OMPD_ordered) {
     // Standalone "ordered" directive.
-    genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, clauses,
-                 queue, queue.begin());
+    genOrderedOp(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
   } else {
     // Dispatch handles the "block-associated" variant of "ordered".
     genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
@@ -2227,7 +2201,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
       converter.getFirOpBuilder().getModule(), semaCtx, eval, verbatim.source,
       llvm::omp::Directive::OMPD_flush, clauses)};
   genFlushOp(converter, symTable, semaCtx, eval, currentLocation, objects,
-             clauses, queue, queue.begin());
+             queue, queue.begin());
 }

 static void
@@ -2399,8 +2373,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
   const auto &name = std::get<std::optional<parser::Name>>(cd.t);
   mlir::Location currentLocation = converter.getCurrentLocation();

-  genCriticalOp(converter, symTable, semaCtx, eval, currentLocation, clauses,
-                queue, queue.begin(), name);
+  genCriticalOp(converter, symTable, semaCtx, eval, currentLocation, queue,
+                queue.begin(), name);
 }

 static void

From eb822dc25853299ea81166f9bb8a43436ab8b0c8 Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev
Date: Wed, 15 May 2024 21:03:15 +0400
Subject: [PATCH 59/71] [lldb] Fixed the TestCompletion test running on a remote target (#92281)

Install the image to the remote target if necessary.
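
The test now goes through the two-file-spec overload of
`SBProcess::LoadImage`. A minimal sketch of the same call through the
C++ SB API (assuming a connected remote platform; the paths and the
wrapper function are examples, not part of this patch):

```c++
#include <cstdint>

#include "lldb/API/SBError.h"
#include "lldb/API/SBFileSpec.h"
#include "lldb/API/SBProcess.h"

// Load libshared.so into the inferior. For a remote platform, pass a
// second file spec so the local copy is installed into the target's
// working directory before the load happens on the remote side.
uint32_t loadSharedLibrary(lldb::SBProcess &process) {
  lldb::SBError error;
  lldb::SBFileSpec local("/host/build/libshared.so");          // example path
  lldb::SBFileSpec remote("/target/cwd/libshared.so", false);  // example path
  uint32_t token = process.LoadImage(local, remote, error);
  return error.Success() ? token : LLDB_INVALID_IMAGE_TOKEN;
}
```
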
---
 .../functionalities/completion/TestCompletion.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/lldb/test/API/functionalities/completion/TestCompletion.py b/lldb/test/API/functionalities/completion/TestCompletion.py
index 0d6907e0c3d229..63842487fc338d 100644
--- a/lldb/test/API/functionalities/completion/TestCompletion.py
+++ b/lldb/test/API/functionalities/completion/TestCompletion.py
@@ -107,9 +107,16 @@ def test_process_unload(self):
             self, "// Break here", lldb.SBFileSpec("main.cpp")
         )
         err = lldb.SBError()
-        self.process().LoadImage(
-            lldb.SBFileSpec(self.getBuildArtifact("libshared.so")), err
+        local_spec = lldb.SBFileSpec(self.getBuildArtifact("libshared.so"))
+        remote_spec = (
+            lldb.SBFileSpec(
+                lldbutil.append_to_process_working_directory(self, "libshared.so"),
+                False,
+            )
+            if lldb.remote_platform
+            else lldb.SBFileSpec()
         )
+        self.process().LoadImage(local_spec, remote_spec, err)
         self.assertSuccess(err)
 
         self.complete_from_to("process unload ", "process unload 0")
@@ -473,7 +480,7 @@ def test_custom_command_completion(self):
         self.complete_from_to("my_test_cmd main.cp", ["main.cpp"])
         self.expect("my_test_cmd main.cpp", substrs=["main.cpp"])
 
-    @skipIfWindows
+    @skipIf(hostoslist=["windows"])
     def test_completion_target_create_from_root_dir(self):
         """Tests source file completion by completing ."""
         root_dir = os.path.abspath(os.sep)

From 7645269710493c188d1d270b9e4e085b3e92b9b0 Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev
Date: Wed, 15 May 2024 21:06:30 +0400
Subject: [PATCH 60/71] [lldb] Fixed the TestNetBSDCore test (#92285)

TestNetBSDCore.py contains three classes with the same test names,
test_aarch64 and test_amd64. This causes conflicts because the tests
share the same build directory. Add suffixes to the test names to avoid
the conflicts.
---
 .../postmortem/netbsd-core/TestNetBSDCore.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py
index 756f4d1e81caa0..ff1ef21e02e319 100644
--- a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py
+++ b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py
@@ -147,12 +147,12 @@ def check_stack(self, process, pid, filename):
             self.check_backtrace(thread, filename, backtrace)
 
     @skipIfLLVMTargetMissing("AArch64")
-    def test_aarch64(self):
+    def test_aarch64_single_threaded(self):
         """Test single-threaded aarch64 core dump."""
         self.do_test("1lwp_SIGSEGV.aarch64", pid=8339, region_count=32)
 
     @skipIfLLVMTargetMissing("X86")
-    def test_amd64(self):
+    def test_amd64_single_threaded(self):
         """Test single-threaded amd64 core dump."""
         self.do_test("1lwp_SIGSEGV.amd64", pid=693, region_count=21)
 
@@ -177,12 +177,12 @@ def check_stack(self, process, pid, filename):
         self.assertEqual(thread.GetStopReasonDataAtIndex(0), 0)
 
     @skipIfLLVMTargetMissing("AArch64")
-    def test_aarch64(self):
+    def test_aarch64_thread_signaled(self):
         """Test double-threaded aarch64 core dump where thread 2 is signalled."""
        self.do_test("2lwp_t2_SIGSEGV.aarch64", pid=14142, region_count=31)
 
     @skipIfLLVMTargetMissing("X86")
-    def test_amd64(self):
+    def test_amd64_thread_signaled(self):
         """Test double-threaded amd64 core dump where thread 2 is signalled."""
         self.do_test("2lwp_t2_SIGSEGV.amd64", pid=622, region_count=24)
 
@@ -207,11 +207,11 @@ def check_stack(self, process, pid, filename):
         self.assertEqual(thread.GetStopReasonDataAtIndex(0), signal.SIGSEGV)
 
     @skipIfLLVMTargetMissing("AArch64")
-    def test_aarch64(self):
+    def test_aarch64_process_signaled(self):
         """Test double-threaded aarch64 core dump where process is signalled."""
         self.do_test("2lwp_process_SIGSEGV.aarch64", pid=1403, region_count=30)
 
     @skipIfLLVMTargetMissing("X86")
-    def test_amd64(self):
+    def test_amd64_process_signaled(self):
         """Test double-threaded amd64 core dump where process is signalled."""
         self.do_test("2lwp_process_SIGSEGV.amd64", pid=665, region_count=24)

From d92c67784f21063d6334a009dbf4f9e0f8217b41 Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev
Date: Wed, 15 May 2024 21:08:35 +0400
Subject: [PATCH 61/71] [lldb][Windows] Fixed the TestIOHandlerResizeNoEditline test (#92286)

This test caused a Python crash on the Windows x86_64 host with exit
code 0xC0000409 (STATUS_STACK_BUFFER_OVERRUN). Close the input stream
before exit to avoid the crash.
---
 lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py b/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py
index 3c07554f6cafd7..bbc2dcbe4e30ad 100644
--- a/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py
+++ b/lldb/test/API/iohandler/resize/TestIOHandlerResizeNoEditline.py
@@ -18,3 +18,4 @@ def test_resize_no_editline(self):
         dbg.RunCommandInterpreter(True, True, opts, 0, False, False)
         # Try resizing the terminal which shouldn't crash.
         dbg.SetTerminalWidth(47)
+        dbg.GetInputFile().Close()

From 217668f641e82f901645f428ae0d07a3c01e9a8a Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 15 May 2024 10:34:47 -0700
Subject: [PATCH 62/71] [nfc] Allow forwarding `Error` returns from `Expected` callers (#92208)

On a few compilers (clang 11/12, for example [1]), the following does
not result in a copy elision, and, since `Error`'s copy constructor is
deleted, results in a compile error:

```
Expected<T> foobar() {
  ...
  if (Error E = aCallReturningError())
    return E;
  ...
}
```

Moving `E` would, conversely, result in the pessimizing-move warning on
more recent clangs ("moving a local object in a return statement
prevents copy elision").

We just need to make the `Expected` ctor taking an `Error` take it as
an rvalue reference.

[1] https://lab.llvm.org/buildbot/#/builders/54/builds/10505
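
Concretely, with the constructor taking `Error &&`, a plain `return E;`
compiles on the older clangs (no copy of `Error` is attempted) and needs no
`std::move()` on the newer ones. A minimal sketch, where `doWork` is a
hypothetical `Error`-returning helper:

```cpp
#include "llvm/Support/Error.h"

using llvm::Error;
using llvm::Expected;

// Hypothetical helper standing in for any Error-returning call.
static Error doWork() { return Error::success(); }

Expected<int> computeValue() {
  if (Error E = doWork())
    return E; // binds to Expected(Error &&) via the implicit move on
              // return, so no deleted copy ctor and no pessimizing-move
              // warning.
  return 42;
}
```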
---
 llvm/include/llvm/Support/Error.h             |  2 +-
 llvm/lib/Bitstream/Reader/BitstreamReader.cpp | 12 ++++++------
 llvm/lib/Object/COFFObjectFile.cpp            |  6 +++---
 llvm/lib/Object/WindowsResource.cpp           |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h
index 894b6484336aef..217130ce293af9 100644
--- a/llvm/include/llvm/Support/Error.h
+++ b/llvm/include/llvm/Support/Error.h
@@ -493,7 +493,7 @@ template <class T> class [[nodiscard]] Expected {
 
 public:
   /// Create an Expected<T> error value from the given Error.
-  Expected(Error Err)
+  Expected(Error &&Err)
       : HasError(true)
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
       // Expected is unchecked upon construction in Debug builds.
diff --git a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp
index 3cc9dfdf7b8586..5b2c76350029be 100644
--- a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp
+++ b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp
@@ -167,7 +167,7 @@ Expected<unsigned> BitstreamCursor::skipRecord(unsigned AbbrevID) {
         if (Error Err =
                 JumpToBit(GetCurrentBitNo() + static_cast<uint64_t>(NumElts) *
                                                   EltEnc.getEncodingData()))
-          return std::move(Err);
+          return Err;
         break;
       case BitCodeAbbrevOp::VBR:
         assert((unsigned)EltEnc.getEncodingData() <= MaxChunkSize);
@@ -180,7 +180,7 @@ Expected<unsigned> BitstreamCursor::skipRecord(unsigned AbbrevID) {
         break;
       case BitCodeAbbrevOp::Char6:
         if (Error Err = JumpToBit(GetCurrentBitNo() + NumElts * 6))
-          return std::move(Err);
+          return Err;
         break;
       }
       continue;
@@ -206,7 +206,7 @@ Expected<unsigned> BitstreamCursor::skipRecord(unsigned AbbrevID) {
 
     // Skip over the blob.
     if (Error Err = JumpToBit(NewEnd))
-      return std::move(Err);
+      return Err;
   }
   return Code;
 }
@@ -344,7 +344,7 @@ Expected<unsigned> BitstreamCursor::readRecord(unsigned AbbrevID,
     // over tail padding first, in case jumping to NewEnd invalidates the Blob
     // pointer.
     if (Error Err = JumpToBit(NewEnd))
-      return std::move(Err);
+      return Err;
     const char *Ptr = (const char *)getPointerToBit(CurBitPos, NumElts);
 
     // If we can return a reference to the data, do so to avoid copying it.
@@ -421,7 +421,7 @@ Error BitstreamCursor::ReadAbbrevRecord() {
 Expected<std::optional<BitstreamBlockInfo>>
 BitstreamCursor::ReadBlockInfoBlock(bool ReadBlockInfoNames) {
   if (llvm::Error Err = EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID))
-    return std::move(Err);
+    return Err;
 
   BitstreamBlockInfo NewBlockInfo;
 
@@ -452,7 +452,7 @@ BitstreamCursor::ReadBlockInfoBlock(bool ReadBlockInfoNames) {
     if (!CurBlockInfo)
      return std::nullopt;
     if (Error Err = ReadAbbrevRecord())
-      return std::move(Err);
+      return Err;
 
     // ReadAbbrevRecord installs the abbrev in CurAbbrevs. Move it to the
     // appropriate BlockInfo.
diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp
index 18506f39f6b57b..5a85b8e00c633a 100644
--- a/llvm/lib/Object/COFFObjectFile.cpp
+++ b/llvm/lib/Object/COFFObjectFile.cpp
@@ -294,7 +294,7 @@ COFFObjectFile::getSectionContents(DataRefImpl Ref) const {
   const coff_section *Sec = toSec(Ref);
   ArrayRef<uint8_t> Res;
   if (Error E = getSectionContents(Sec, Res))
-    return std::move(E);
+    return E;
   return Res;
 }
 
@@ -807,7 +807,7 @@ Expected<std::unique_ptr<COFFObjectFile>>
 COFFObjectFile::create(MemoryBufferRef Object) {
   std::unique_ptr<COFFObjectFile> Obj(new COFFObjectFile(std::move(Object)));
   if (Error E = Obj->initialize())
-    return std::move(E);
+    return E;
   return std::move(Obj);
 }
 
@@ -1959,7 +1959,7 @@ ResourceSectionRef::getContents(const coff_resource_data_entry &Entry) {
     uint64_t Offset = Entry.DataRVA + Sym->getValue();
     ArrayRef<uint8_t> Contents;
     if (Error E = Obj->getSectionContents(*Section, Contents))
-      return std::move(E);
+      return E;
     if (Offset + Entry.DataSize > Contents.size())
       return createStringError(object_error::parse_failed,
                                "data outside of section");
diff --git a/llvm/lib/Object/WindowsResource.cpp b/llvm/lib/Object/WindowsResource.cpp
index 983c8e30a9420d..306e8ec542068b 100644
--- a/llvm/lib/Object/WindowsResource.cpp
+++ b/llvm/lib/Object/WindowsResource.cpp
@@ -80,7 +80,7 @@ Expected<ResourceEntryRef>
 ResourceEntryRef::create(BinaryStreamRef BSR, const WindowsResource *Owner) {
   auto Ref = ResourceEntryRef(BSR, Owner);
   if (auto E = Ref.loadNext())
-    return std::move(E);
+    return E;
   return Ref;
 }
 
@@ -1006,7 +1006,7 @@ writeWindowsResourceCOFF(COFF::MachineTypes MachineType,
   Error E = Error::success();
   WindowsResourceCOFFWriter Writer(MachineType, Parser, E);
   if (E)
-    return std::move(E);
+    return E;
   return Writer.write(TimeDateStamp);
 }
 

From 0647d1035cb208195e002b38089b82004b6f7b92 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 15 May 2024 10:15:35 -0700
Subject: [PATCH 63/71] [RISCV] Remove unneeded casts from int64_t to uint64_t
 in RISCVMatInt.cpp. NFC

Most of these were to avoid undefined behavior if a shift left changed
the sign of the result. I don't think it's possible to change the sign
of the result here. We're shifting left by 12 after an arithmetic right
shift by more than 12. The bits we are shifting out with the left shift
are guaranteed to be sign bits.

Also use SignExtend64<32> to force the upper bits to all 1s instead of
an Or. We know the value satisfies isUInt<32> && !isInt<32>, which
means bit 31 is set.
---
 .../Target/RISCV/MCTargetDesc/RISCVMatInt.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 0a857eb96935ea..fca3362f9a8b27 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -115,30 +115,29 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI,
     Val >>= ShiftAmount;
 
     // If the remaining bits don't fit in 12 bits, we might be able to reduce
-    // the // shift amount in order to use LUI which will zero the lower 12
-    // bits.
+    // the shift amount in order to use LUI which will zero the lower 12 bits.
     if (ShiftAmount > 12 && !isInt<12>(Val)) {
-      if (isInt<32>((uint64_t)Val << 12)) {
+      if (isInt<32>(Val << 12)) {
         // Reduce the shift amount and add zeros to the LSBs so it will match
         // LUI.
ShiftAmount -= 12; - Val = (uint64_t)Val << 12; - } else if (isUInt<32>((uint64_t)Val << 12) && + Val = Val << 12; + } else if (isUInt<32>(Val << 12) && STI.hasFeature(RISCV::FeatureStdExtZba)) { // Reduce the shift amount and add zeros to the LSBs so it will match // LUI, then shift left with SLLI.UW to clear the upper 32 set bits. ShiftAmount -= 12; - Val = ((uint64_t)Val << 12) | (0xffffffffull << 32); + Val = SignExtend64<32>(Val << 12); Unsigned = true; } } // Try to use SLLI_UW for Val when it is uint32 but not int32. - if (isUInt<32>((uint64_t)Val) && !isInt<32>((uint64_t)Val) && + if (isUInt<32>(Val) && !isInt<32>(Val) && STI.hasFeature(RISCV::FeatureStdExtZba)) { - // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with + // Use LUI+ADDI(W) or LUI to compose, then clear the upper 32 bits with // SLLI_UW. - Val = ((uint64_t)Val) | (0xffffffffull << 32); + Val = SignExtend64<32>(Val); Unsigned = true; } } From ec36145f58d2cf93d86bc4e3be617ad7d7d8ace7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 15 May 2024 18:54:23 +0100 Subject: [PATCH 64/71] [LAA] Add tests with invariant dependences before strided ones. Add extra test coverage for loops with strided and invariant accesses to the same object. --- .../invariant-dependence-before.ll | 756 ++++++++++++++++++ 1 file changed, 756 insertions(+) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll new file mode 100644 index 00000000000000..2a210a5a445b94 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll @@ -0,0 +1,756 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +define void @test_invar_dependence_before_positive_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_positive_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_positive_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_1_different_access_sizes(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_1_different_access_sizes' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. 
Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i8 %t, ptr %gep, align 1 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + %t = trunc i32 %l to i8 + store i8 %t, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_positive_strided_access_1_different_access_sizes(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_positive_strided_access_1_different_access_sizes' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i64, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %t, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i64, ptr %a + %t = trunc i64 %l to i32 + store i32 %t, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_negative_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_negative_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 100 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_negative_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_before_negative_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 100 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + + +define void @test_invar_dependence_not_before_negative_strided_access_1(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_negative_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 99 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_negative_strided_access_2(ptr %a) { +; CHECK-LABEL: 'test_invar_dependence_not_before_negative_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i32, ptr %a, i32 99 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = sub i32 %iv, 1 + %ec = icmp eq i32 %iv.next, -100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_before_1(ptr %a) { +; CHECK-LABEL: 'test_both_invar_before_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep.off, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %a + store i32 %l, ptr %gep.off + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_before_2(ptr %a) { +; CHECK-LABEL: 'test_both_invar_before_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep.off, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %gep.off + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_not_before_1(ptr %a) { +; CHECK-LABEL: 'test_both_invar_not_before_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep.off, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %a + store i32 %l, ptr %gep.off + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_both_invar_not_before_2(ptr %a) { +; CHECK-LABEL: 'test_both_invar_not_before_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep.off, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 3 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i32, ptr %gep.off + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_via_loop_guard_positive_strided_access_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_via_loop_guard_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 4 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_via_loop_guard_positive_strided_access_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_via_loop_guard_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. 
Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 4 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +define void @test_invar_dependence_not_before_via_loop_guard_positive_strided_access_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_before_via_loop_guard_positive_strided_access_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %a, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %gep, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 3 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_before_via_loop_guard_positive_strided_access_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_before_via_loop_guard_positive_strided_access_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %l = load i32, ptr %gep, align 4 -> +; CHECK-NEXT: store i32 %l, ptr %a, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 %off + %c = icmp sge i32 %off, 3 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_via_loop_guard_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_via_loop_guard_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + %c = icmp sge i32 %off, 0 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_before_positive_strided_access_via_loop_guard_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_before_positive_strided_access_via_loop_guard_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + %c = icmp sge i32 %off, 0 + br i1 %c, label %loop, label %exit + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_1(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_1' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %a + store i32 %l, ptr %gep + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_2(ptr %a, i32 %off) { +; CHECK-LABEL: 'test_invar_dependence_not_known_beforepositive_strided_access_not_known_via_loop_guard_2' +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: could not determine number of loop iterations +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %gep.off = getelementptr i8, ptr %a, i32 4 + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i32, ptr %gep.off, i32 %iv + %l = load i32, ptr %gep + store i32 %l, ptr %a + %iv.next = add i32 %iv, %off + %ec = icmp eq i32 %iv.next, 100 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} From c19f2c773b0e23fd623502888894add822079f63 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 14 May 2024 18:11:53 -0700 Subject: [PATCH 65/71] Reapply "[ctx_profile] Profile reader and writer" (#92199) This reverts commit 03c7458a3603396d2d0e1dee43399d3d1664a264. One of the problems was addressed in #92208 The other problem: needed to add `BitstreamReader` to the list of link deps of `LLVMProfileData` --- .../llvm/ProfileData/PGOCtxProfReader.h | 92 +++++++ .../llvm/ProfileData/PGOCtxProfWriter.h | 91 +++++++ llvm/lib/ProfileData/CMakeLists.txt | 3 + llvm/lib/ProfileData/PGOCtxProfReader.cpp | 173 ++++++++++++ llvm/lib/ProfileData/PGOCtxProfWriter.cpp | 49 ++++ llvm/unittests/ProfileData/CMakeLists.txt | 1 + .../PGOCtxProfReaderWriterTest.cpp | 255 ++++++++++++++++++ 7 files changed, 664 insertions(+) create mode 100644 llvm/include/llvm/ProfileData/PGOCtxProfReader.h create mode 100644 llvm/include/llvm/ProfileData/PGOCtxProfWriter.h create mode 100644 llvm/lib/ProfileData/PGOCtxProfReader.cpp create mode 100644 llvm/lib/ProfileData/PGOCtxProfWriter.cpp create mode 100644 llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h new file mode 100644 index 00000000000000..a19b3f51d642de --- /dev/null +++ b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h @@ -0,0 +1,92 @@ +//===--- PGOCtxProfReader.h - Contextual profile reader ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// Reader for contextual iFDO profile, which comes in bitstream format. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PROFILEDATA_CTXINSTRPROFILEREADER_H +#define LLVM_PROFILEDATA_CTXINSTRPROFILEREADER_H + +#include "llvm/ADT/DenseSet.h" +#include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/ProfileData/PGOCtxProfWriter.h" +#include "llvm/Support/Error.h" +#include +#include + +namespace llvm { +/// The loaded contextual profile, suitable for mutation during IPO passes. We +/// generally expect a fraction of counters and of callsites to be populated. +/// We continue to model counters as vectors, but callsites are modeled as a map +/// of a map. The expectation is that, typically, there is a small number of +/// indirect targets (usually, 1 for direct calls); but potentially a large +/// number of callsites, and, as inlining progresses, the callsite count of a +/// caller will grow. +class PGOContextualProfile final { +public: + using CallTargetMapTy = std::map; + using CallsiteMapTy = DenseMap; + +private: + friend class PGOCtxProfileReader; + GlobalValue::GUID GUID = 0; + SmallVector Counters; + CallsiteMapTy Callsites; + + PGOContextualProfile(GlobalValue::GUID G, + SmallVectorImpl &&Counters) + : GUID(G), Counters(std::move(Counters)) {} + + Expected + getOrEmplace(uint32_t Index, GlobalValue::GUID G, + SmallVectorImpl &&Counters); + +public: + PGOContextualProfile(const PGOContextualProfile &) = delete; + PGOContextualProfile &operator=(const PGOContextualProfile &) = delete; + PGOContextualProfile(PGOContextualProfile &&) = default; + PGOContextualProfile &operator=(PGOContextualProfile &&) = default; + + GlobalValue::GUID guid() const { return GUID; } + const SmallVectorImpl &counters() const { return Counters; } + const CallsiteMapTy &callsites() const { return Callsites; } + CallsiteMapTy &callsites() { return Callsites; } + + bool hasCallsite(uint32_t I) const { + return Callsites.find(I) != Callsites.end(); + } + + const CallTargetMapTy &callsite(uint32_t I) const { + assert(hasCallsite(I) && "Callsite not found"); + return Callsites.find(I)->second; + } + void getContainedGuids(DenseSet &Guids) const; +}; + +class PGOCtxProfileReader final { + BitstreamCursor &Cursor; + Expected advance(); + Error readMetadata(); + Error wrongValue(const Twine &); + Error unsupported(const Twine &); + + Expected, PGOContextualProfile>> + readContext(bool ExpectIndex); + bool canReadContext(); + +public: + PGOCtxProfileReader(BitstreamCursor &Cursor) : Cursor(Cursor) {} + + Expected> loadContexts(); +}; +} // namespace llvm +#endif diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h new file mode 100644 index 00000000000000..15578c51a49578 --- /dev/null +++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h @@ -0,0 +1,91 @@ +//===- PGOCtxProfWriter.h - Contextual Profile Writer -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares a utility for writing a contextual profile to bitstream. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_ +#define LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_ + +#include "llvm/Bitstream/BitstreamWriter.h" +#include "llvm/ProfileData/CtxInstrContextNode.h" + +namespace llvm { +enum PGOCtxProfileRecords { Invalid = 0, Version, Guid, CalleeIndex, Counters }; + +enum PGOCtxProfileBlockIDs { + ProfileMetadataBlockID = 100, + ContextNodeBlockID = ProfileMetadataBlockID + 1 +}; + +/// Write one or more ContextNodes to the provided raw_fd_stream. +/// The caller must destroy the PGOCtxProfileWriter object before closing the +/// stream. +/// The design allows serializing a bunch of contexts embedded in some other +/// file. The overall format is: +/// +/// [... other data written to the stream...] +/// SubBlock(ProfileMetadataBlockID) +/// Version +/// SubBlock(ContextNodeBlockID) +/// [RECORDS] +/// SubBlock(ContextNodeBlockID) +/// [RECORDS] +/// [... more SubBlocks] +/// EndBlock +/// EndBlock +/// +/// The "RECORDS" are bitsream records. The IDs are in CtxProfileCodes (except) +/// for Version, which is just for metadata). All contexts will have Guid and +/// Counters, and all but the roots have CalleeIndex. The order in which the +/// records appear does not matter, but they must precede any subcontexts, +/// because that helps keep the reader code simpler. +/// +/// Subblock containment captures the context->subcontext relationship. The +/// "next()" relationship in the raw profile, between call targets of indirect +/// calls, are just modeled as peer subblocks where the callee index is the +/// same. +/// +/// Versioning: the writer may produce additional records not known by the +/// reader. The version number indicates a more structural change. +/// The current version, in particular, is set up to expect optional extensions +/// like value profiling - which would appear as additional records. For +/// example, value profiling would produce a new record with a new record ID, +/// containing the profiled values (much like the counters) +class PGOCtxProfileWriter final { + SmallVector Buff; + BitstreamWriter Writer; + + void writeCounters(const ctx_profile::ContextNode &Node); + void writeImpl(std::optional CallerIndex, + const ctx_profile::ContextNode &Node); + +public: + PGOCtxProfileWriter(raw_fd_stream &Out, + std::optional VersionOverride = std::nullopt) + : Writer(Buff, &Out, 0) { + Writer.EnterSubblock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID, + CodeLen); + const auto Version = VersionOverride ? *VersionOverride : CurrentVersion; + Writer.EmitRecord(PGOCtxProfileRecords::Version, + SmallVector({Version})); + } + + ~PGOCtxProfileWriter() { Writer.ExitBlock(); } + + void write(const ctx_profile::ContextNode &); + + // constants used in writing which a reader may find useful. 
+ static constexpr unsigned CodeLen = 2; + static constexpr uint32_t CurrentVersion = 1; + static constexpr unsigned VBREncodingBits = 6; +}; + +} // namespace llvm +#endif diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt index 408f9ff01ec87d..4fa1b76f0a062c 100644 --- a/llvm/lib/ProfileData/CMakeLists.txt +++ b/llvm/lib/ProfileData/CMakeLists.txt @@ -7,6 +7,8 @@ add_llvm_component_library(LLVMProfileData ItaniumManglingCanonicalizer.cpp MemProf.cpp MemProfReader.cpp + PGOCtxProfReader.cpp + PGOCtxProfWriter.cpp ProfileSummaryBuilder.cpp SampleProf.cpp SampleProfReader.cpp @@ -20,6 +22,7 @@ add_llvm_component_library(LLVMProfileData intrinsics_gen LINK_COMPONENTS + BitstreamReader Core Object Support diff --git a/llvm/lib/ProfileData/PGOCtxProfReader.cpp b/llvm/lib/ProfileData/PGOCtxProfReader.cpp new file mode 100644 index 00000000000000..3710f2e4b81858 --- /dev/null +++ b/llvm/lib/ProfileData/PGOCtxProfReader.cpp @@ -0,0 +1,173 @@ +//===- PGOCtxProfReader.cpp - Contextual Instrumentation profile reader ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Read a contextual profile into a datastructure suitable for maintenance +// throughout IPO +// +//===----------------------------------------------------------------------===// + +#include "llvm/ProfileData/PGOCtxProfReader.h" +#include "llvm/Bitstream/BitCodeEnums.h" +#include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/PGOCtxProfWriter.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" + +using namespace llvm; + +// FIXME(#92054) - these Error handling macros are (re-)invented in a few +// places. 
+#define EXPECT_OR_RET(LHS, RHS) \ + auto LHS = RHS; \ + if (!LHS) \ + return LHS.takeError(); + +#define RET_ON_ERR(EXPR) \ + if (auto Err = (EXPR)) \ + return Err; + +Expected +PGOContextualProfile::getOrEmplace(uint32_t Index, GlobalValue::GUID G, + SmallVectorImpl &&Counters) { + auto [Iter, Inserted] = Callsites[Index].insert( + {G, PGOContextualProfile(G, std::move(Counters))}); + if (!Inserted) + return make_error(instrprof_error::invalid_prof, + "Duplicate GUID for same callsite."); + return Iter->second; +} + +void PGOContextualProfile::getContainedGuids( + DenseSet &Guids) const { + Guids.insert(GUID); + for (const auto &[_, Callsite] : Callsites) + for (const auto &[_, Callee] : Callsite) + Callee.getContainedGuids(Guids); +} + +Expected PGOCtxProfileReader::advance() { + return Cursor.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs); +} + +Error PGOCtxProfileReader::wrongValue(const Twine &Msg) { + return make_error(instrprof_error::invalid_prof, Msg); +} + +Error PGOCtxProfileReader::unsupported(const Twine &Msg) { + return make_error(instrprof_error::unsupported_version, Msg); +} + +bool PGOCtxProfileReader::canReadContext() { + auto Blk = advance(); + if (!Blk) { + consumeError(Blk.takeError()); + return false; + } + return Blk->Kind == BitstreamEntry::SubBlock && + Blk->ID == PGOCtxProfileBlockIDs::ContextNodeBlockID; +} + +Expected, PGOContextualProfile>> +PGOCtxProfileReader::readContext(bool ExpectIndex) { + RET_ON_ERR(Cursor.EnterSubBlock(PGOCtxProfileBlockIDs::ContextNodeBlockID)); + + std::optional Guid; + std::optional> Counters; + std::optional CallsiteIndex; + + SmallVector RecordValues; + + // We don't prescribe the order in which the records come in, and we are ok + // if other unsupported records appear. We seek in the current subblock until + // we get all we know. + auto GotAllWeNeed = [&]() { + return Guid.has_value() && Counters.has_value() && + (!ExpectIndex || CallsiteIndex.has_value()); + }; + while (!GotAllWeNeed()) { + RecordValues.clear(); + EXPECT_OR_RET(Entry, advance()); + if (Entry->Kind != BitstreamEntry::Record) + return wrongValue( + "Expected records before encountering more subcontexts"); + EXPECT_OR_RET(ReadRecord, + Cursor.readRecord(bitc::UNABBREV_RECORD, RecordValues)); + switch (*ReadRecord) { + case PGOCtxProfileRecords::Guid: + if (RecordValues.size() != 1) + return wrongValue("The GUID record should have exactly one value"); + Guid = RecordValues[0]; + break; + case PGOCtxProfileRecords::Counters: + Counters = std::move(RecordValues); + if (Counters->empty()) + return wrongValue("Empty counters. At least the entry counter (one " + "value) was expected"); + break; + case PGOCtxProfileRecords::CalleeIndex: + if (!ExpectIndex) + return wrongValue("The root context should not have a callee index"); + if (RecordValues.size() != 1) + return wrongValue("The callee index should have exactly one value"); + CallsiteIndex = RecordValues[0]; + break; + default: + // OK if we see records we do not understand, like records (profile + // components) introduced later. 
+ break; + } + } + + PGOContextualProfile Ret(*Guid, std::move(*Counters)); + + while (canReadContext()) { + EXPECT_OR_RET(SC, readContext(true)); + auto &Targets = Ret.callsites()[*SC->first]; + auto [_, Inserted] = + Targets.insert({SC->second.guid(), std::move(SC->second)}); + if (!Inserted) + return wrongValue( + "Unexpected duplicate target (callee) at the same callsite."); + } + return std::make_pair(CallsiteIndex, std::move(Ret)); +} + +Error PGOCtxProfileReader::readMetadata() { + EXPECT_OR_RET(Blk, advance()); + if (Blk->Kind != BitstreamEntry::SubBlock) + return unsupported("Expected Version record"); + RET_ON_ERR( + Cursor.EnterSubBlock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID)); + EXPECT_OR_RET(MData, advance()); + if (MData->Kind != BitstreamEntry::Record) + return unsupported("Expected Version record"); + + SmallVector Ver; + EXPECT_OR_RET(Code, Cursor.readRecord(bitc::UNABBREV_RECORD, Ver)); + if (*Code != PGOCtxProfileRecords::Version) + return unsupported("Expected Version record"); + if (Ver.size() != 1 || Ver[0] > PGOCtxProfileWriter::CurrentVersion) + return unsupported("Version " + Twine(*Code) + + " is higher than supported version " + + Twine(PGOCtxProfileWriter::CurrentVersion)); + return Error::success(); +} + +Expected> +PGOCtxProfileReader::loadContexts() { + std::map Ret; + RET_ON_ERR(readMetadata()); + while (canReadContext()) { + EXPECT_OR_RET(E, readContext(false)); + auto Key = E->second.guid(); + if (!Ret.insert({Key, std::move(E->second)}).second) + return wrongValue("Duplicate roots"); + } + return Ret; +} diff --git a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp new file mode 100644 index 00000000000000..50817975644693 --- /dev/null +++ b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp @@ -0,0 +1,49 @@ +//===- PGOCtxProfWriter.cpp - Contextual Instrumentation profile writer ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Write a contextual profile to bitstream. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ProfileData/PGOCtxProfWriter.h" +#include "llvm/Bitstream/BitCodeEnums.h" + +using namespace llvm; +using namespace llvm::ctx_profile; + +void PGOCtxProfileWriter::writeCounters(const ContextNode &Node) { + Writer.EmitCode(bitc::UNABBREV_RECORD); + Writer.EmitVBR(PGOCtxProfileRecords::Counters, VBREncodingBits); + Writer.EmitVBR(Node.counters_size(), VBREncodingBits); + for (uint32_t I = 0U; I < Node.counters_size(); ++I) + Writer.EmitVBR64(Node.counters()[I], VBREncodingBits); +} + +// recursively write all the subcontexts. We do need to traverse depth first to +// model the context->subcontext implicitly, and since this captures call +// stacks, we don't really need to be worried about stack overflow and we can +// keep the implementation simple. 
+void PGOCtxProfileWriter::writeImpl(std::optional CallerIndex, + const ContextNode &Node) { + Writer.EnterSubblock(PGOCtxProfileBlockIDs::ContextNodeBlockID, CodeLen); + Writer.EmitRecord(PGOCtxProfileRecords::Guid, + SmallVector{Node.guid()}); + if (CallerIndex) + Writer.EmitRecord(PGOCtxProfileRecords::CalleeIndex, + SmallVector{*CallerIndex}); + writeCounters(Node); + for (uint32_t I = 0U; I < Node.callsites_size(); ++I) + for (const auto *Subcontext = Node.subContexts()[I]; Subcontext; + Subcontext = Subcontext->next()) + writeImpl(I, *Subcontext); + Writer.ExitBlock(); +} + +void PGOCtxProfileWriter::write(const ContextNode &RootNode) { + writeImpl(std::nullopt, RootNode); +} diff --git a/llvm/unittests/ProfileData/CMakeLists.txt b/llvm/unittests/ProfileData/CMakeLists.txt index ce3a0a45ccf18c..c92642ded82820 100644 --- a/llvm/unittests/ProfileData/CMakeLists.txt +++ b/llvm/unittests/ProfileData/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_unittest(ProfileDataTests InstrProfTest.cpp ItaniumManglingCanonicalizerTest.cpp MemProfTest.cpp + PGOCtxProfReaderWriterTest.cpp SampleProfTest.cpp SymbolRemappingReaderTest.cpp ) diff --git a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp new file mode 100644 index 00000000000000..d2cdbb28e2fce7 --- /dev/null +++ b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp @@ -0,0 +1,255 @@ +//===-------------- PGOCtxProfReadWriteTest.cpp ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Bitstream/BitstreamReader.h" +#include "llvm/ProfileData/CtxInstrContextNode.h" +#include "llvm/ProfileData/PGOCtxProfReader.h" +#include "llvm/ProfileData/PGOCtxProfWriter.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::ctx_profile; + +class PGOCtxProfRWTest : public ::testing::Test { + std::vector> Nodes; + std::map Roots; + +public: + ContextNode *createNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites, + ContextNode *Next = nullptr) { + auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites); + auto *Mem = Nodes.emplace_back(std::make_unique(AllocSize)).get(); + std::memset(Mem, 0, AllocSize); + auto *Ret = new (Mem) ContextNode(Guid, NrCounters, NrCallsites, Next); + return Ret; + } + + void SetUp() override { + // Root (guid 1) has 2 callsites, one used for an indirect call to either + // guid 2 or 4. + // guid 2 calls guid 5 + // guid 5 calls guid 2 + // there's also a second root, guid3. 
+    auto *Root1 = createNode(1, 2, 2);
+    Root1->counters()[0] = 10;
+    Root1->counters()[1] = 11;
+    Roots.insert({1, Root1});
+    auto *L1 = createNode(2, 1, 1);
+    L1->counters()[0] = 12;
+    Root1->subContexts()[1] = createNode(4, 3, 1, L1);
+    Root1->subContexts()[1]->counters()[0] = 13;
+    Root1->subContexts()[1]->counters()[1] = 14;
+    Root1->subContexts()[1]->counters()[2] = 15;
+
+    auto *L3 = createNode(5, 6, 3);
+    for (auto I = 0; I < 6; ++I)
+      L3->counters()[I] = 16 + I;
+    L1->subContexts()[0] = L3;
+    L3->subContexts()[2] = createNode(2, 1, 1);
+    L3->subContexts()[2]->counters()[0] = 30;
+    auto *Root2 = createNode(3, 1, 0);
+    Root2->counters()[0] = 40;
+    Roots.insert({3, Root2});
+  }
+
+  const std::map<GUID, const ContextNode *> &roots() const { return Roots; }
+};
+
+void checkSame(const ContextNode &Raw, const PGOContextualProfile &Profile) {
+  EXPECT_EQ(Raw.guid(), Profile.guid());
+  ASSERT_EQ(Raw.counters_size(), Profile.counters().size());
+  for (auto I = 0U; I < Raw.counters_size(); ++I)
+    EXPECT_EQ(Raw.counters()[I], Profile.counters()[I]);
+
+  for (auto I = 0U; I < Raw.callsites_size(); ++I) {
+    if (Raw.subContexts()[I] == nullptr)
+      continue;
+    EXPECT_TRUE(Profile.hasCallsite(I));
+    const auto &ProfileTargets = Profile.callsite(I);
+
+    std::map<GUID, const ContextNode *> Targets;
+    for (const auto *N = Raw.subContexts()[I]; N; N = N->next())
+      EXPECT_TRUE(Targets.insert({N->guid(), N}).second);
+
+    EXPECT_EQ(Targets.size(), ProfileTargets.size());
+    for (auto It : Targets) {
+      auto PIt = ProfileTargets.find(It.second->guid());
+      EXPECT_NE(PIt, ProfileTargets.end());
+      checkSame(*It.second, PIt->second);
+    }
+  }
+}
+
+TEST_F(PGOCtxProfRWTest, RoundTrip) {
+  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
+  {
+    std::error_code EC;
+    raw_fd_stream Out(ProfileFile.path(), EC);
+    ASSERT_FALSE(EC);
+    {
+      PGOCtxProfileWriter Writer(Out);
+      for (auto &[_, R] : roots())
+        Writer.write(*R);
+    }
+  }
+  {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+        MemoryBuffer::getFile(ProfileFile.path());
+    ASSERT_TRUE(!!MB);
+    ASSERT_NE(*MB, nullptr);
+    BitstreamCursor Cursor((*MB)->getBuffer());
+    PGOCtxProfileReader Reader(Cursor);
+    auto Expected = Reader.loadContexts();
+    ASSERT_TRUE(!!Expected);
+    auto &Ctxes = *Expected;
+    EXPECT_EQ(Ctxes.size(), roots().size());
+    EXPECT_EQ(Ctxes.size(), 2U);
+    for (auto &[G, R] : roots())
+      checkSame(*R, Ctxes.find(G)->second);
+  }
+}
+
+TEST_F(PGOCtxProfRWTest, InvalidCounters) {
+  auto *R = createNode(1, 0, 1);
+  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
+  {
+    std::error_code EC;
+    raw_fd_stream Out(ProfileFile.path(), EC);
+    ASSERT_FALSE(EC);
+    {
+      PGOCtxProfileWriter Writer(Out);
+      Writer.write(*R);
+    }
+  }
+  {
+    auto MB = MemoryBuffer::getFile(ProfileFile.path());
+    ASSERT_TRUE(!!MB);
+    ASSERT_NE(*MB, nullptr);
+    BitstreamCursor Cursor((*MB)->getBuffer());
+    PGOCtxProfileReader Reader(Cursor);
+    auto Expected = Reader.loadContexts();
+    EXPECT_FALSE(Expected);
+    consumeError(Expected.takeError());
+  }
+}
+
+TEST_F(PGOCtxProfRWTest, Empty) {
+  BitstreamCursor Cursor("");
+  PGOCtxProfileReader Reader(Cursor);
+  auto Expected = Reader.loadContexts();
+  EXPECT_FALSE(Expected);
+  consumeError(Expected.takeError());
+}
+
+TEST_F(PGOCtxProfRWTest, Invalid) {
+  BitstreamCursor Cursor("Surely this is not valid");
+  PGOCtxProfileReader Reader(Cursor);
+  auto Expected = Reader.loadContexts();
+  EXPECT_FALSE(Expected);
+  consumeError(Expected.takeError());
+}
+
+TEST_F(PGOCtxProfRWTest, ValidButEmpty) {
+  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
+  {
+    std::error_code EC;
+    raw_fd_stream Out(ProfileFile.path(), EC);
+    ASSERT_FALSE(EC);
+    {
+      PGOCtxProfileWriter Writer(Out);
+      // don't write anything - this will just produce the metadata subblock.
+    }
+  }
+  {
+    auto MB = MemoryBuffer::getFile(ProfileFile.path());
+    ASSERT_TRUE(!!MB);
+    ASSERT_NE(*MB, nullptr);
+    BitstreamCursor Cursor((*MB)->getBuffer());
+    PGOCtxProfileReader Reader(Cursor);
+    auto Expected = Reader.loadContexts();
+    EXPECT_TRUE(!!Expected);
+    EXPECT_TRUE(Expected->empty());
+  }
+}
+
+TEST_F(PGOCtxProfRWTest, WrongVersion) {
+  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
+  {
+    std::error_code EC;
+    raw_fd_stream Out(ProfileFile.path(), EC);
+    ASSERT_FALSE(EC);
+    {
+      PGOCtxProfileWriter Writer(Out, PGOCtxProfileWriter::CurrentVersion + 1);
+    }
+  }
+  {
+    auto MB = MemoryBuffer::getFile(ProfileFile.path());
+    ASSERT_TRUE(!!MB);
+    ASSERT_NE(*MB, nullptr);
+    BitstreamCursor Cursor((*MB)->getBuffer());
+    PGOCtxProfileReader Reader(Cursor);
+    auto Expected = Reader.loadContexts();
+    EXPECT_FALSE(Expected);
+    consumeError(Expected.takeError());
+  }
+}
+
+TEST_F(PGOCtxProfRWTest, DuplicateRoots) {
+  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
+  {
+    std::error_code EC;
+    raw_fd_stream Out(ProfileFile.path(), EC);
+    ASSERT_FALSE(EC);
+    {
+      PGOCtxProfileWriter Writer(Out);
+      Writer.write(*createNode(1, 1, 1));
+      Writer.write(*createNode(1, 1, 1));
+    }
+  }
+  {
+    auto MB = MemoryBuffer::getFile(ProfileFile.path());
+    ASSERT_TRUE(!!MB);
+    ASSERT_NE(*MB, nullptr);
+    BitstreamCursor Cursor((*MB)->getBuffer());
+    PGOCtxProfileReader Reader(Cursor);
+    auto Expected = Reader.loadContexts();
+    EXPECT_FALSE(Expected);
+    consumeError(Expected.takeError());
+  }
+}
+
+TEST_F(PGOCtxProfRWTest, DuplicateTargets) {
+  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
+  {
+    std::error_code EC;
+    raw_fd_stream Out(ProfileFile.path(), EC);
+    ASSERT_FALSE(EC);
+    {
+      auto *R = createNode(1, 1, 1);
+      auto *L1 = createNode(2, 1, 0);
+      auto *L2 = createNode(2, 1, 0, L1);
+      R->subContexts()[0] = L2;
+      PGOCtxProfileWriter Writer(Out);
+      Writer.write(*R);
+    }
+  }
+  {
+    auto MB = MemoryBuffer::getFile(ProfileFile.path());
+    ASSERT_TRUE(!!MB);
+    ASSERT_NE(*MB, nullptr);
+    BitstreamCursor Cursor((*MB)->getBuffer());
+    PGOCtxProfileReader Reader(Cursor);
+    auto Expected = Reader.loadContexts();
+    EXPECT_FALSE(Expected);
+    consumeError(Expected.takeError());
+  }
+}

From df5804aec48f99704ef26c740e19deaa4072fe27 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 15 May 2024 18:08:23 +0000
Subject: [PATCH 66/71] [gn build] Port c19f2c773b0e

---
 llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn       | 2 ++
 llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn | 1 +
 2 files changed, 3 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
index 9dbfe0f94c1db3..c6fa142b376645 100644
--- a/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/ProfileData/BUILD.gn
@@ -17,6 +17,8 @@ static_library("ProfileData") {
     "ItaniumManglingCanonicalizer.cpp",
     "MemProf.cpp",
     "MemProfReader.cpp",
+    "PGOCtxProfReader.cpp",
+    "PGOCtxProfWriter.cpp",
     "ProfileSummaryBuilder.cpp",
     "SampleProf.cpp",
     "SampleProfReader.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn
index 4919a808920976..f45542519173ea 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ProfileData/BUILD.gn
@@ -14,6 +14,7 @@ unittest("ProfileDataTests") {
    "InstrProfTest.cpp",
    "ItaniumManglingCanonicalizerTest.cpp",
    "MemProfTest.cpp",
+   "PGOCtxProfReaderWriterTest.cpp",
    "SampleProfTest.cpp",
    "SymbolRemappingReaderTest.cpp",
  ]

From 468357114c64633651ebcc5ef17161990da25a78 Mon Sep 17 00:00:00 2001
From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com>
Date: Wed, 15 May 2024 11:30:30 -0700
Subject: =?UTF-8?q?Revert=20"[flang]=20Initial=20debug=20inf?=
 =?UTF-8?q?o=20support=20for=20local=20variables.=20(#909=E2=80=A6=20(#923?=
 =?UTF-8?q?02)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…05)"

This reverts commit 61da6366d043792d7db280ce9edd2db62516e0e8.

Update #90905 was causing many tests to fail. See comments in #90905.
---
 .../include/flang/Optimizer/CodeGen/CGOps.td  | 34 -------
 .../flang/Optimizer/CodeGen/CGPasses.td       |  4 -
 .../include/flang/Optimizer/CodeGen/CodeGen.h |  6 +-
 flang/include/flang/Tools/CLOptions.inc       | 11 +--
 flang/lib/Optimizer/CodeGen/CGOps.cpp         |  2 +-
 .../flang => lib}/Optimizer/CodeGen/CGOps.h   |  1 -
 flang/lib/Optimizer/CodeGen/CodeGen.cpp       | 50 +++-------
 flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp  | 49 ++--------
 .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 56 +----------
 .../Transforms/DebugTypeGenerator.cpp         | 10 +-
 flang/test/Fir/declare-codegen.fir            | 22 ++---
 flang/test/Fir/dummy-scope-codegen.fir        | 11 +--
 flang/test/Transforms/debug-local-var-2.f90   | 91 -------------------
 flang/test/Transforms/debug-local-var.f90     | 54 -----------
 14 files changed, 47 insertions(+), 354 deletions(-)
 rename flang/{include/flang => lib}/Optimizer/CodeGen/CGOps.h (94%)
 delete mode 100644 flang/test/Transforms/debug-local-var-2.f90
 delete mode 100644 flang/test/Transforms/debug-local-var.f90

diff --git a/flang/include/flang/Optimizer/CodeGen/CGOps.td b/flang/include/flang/Optimizer/CodeGen/CGOps.td
index c375edee1fa77f..35e70fa2ffa3fb 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGOps.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGOps.td
@@ -16,8 +16,6 @@
 include "mlir/IR/SymbolInterfaces.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
-include "flang/Optimizer/Dialect/FIRAttr.td"
-include "mlir/IR/BuiltinAttributes.td"
 
 def fircg_Dialect : Dialect {
   let name = "fircg";
@@ -204,36 +202,4 @@ def fircg_XArrayCoorOp : fircg_Op<"ext_array_coor", [AttrSizedOperandSegments]>
   }];
 }
 
-// Extended Declare operation.
-def fircg_XDeclareOp : fircg_Op<"ext_declare", [AttrSizedOperandSegments]> {
-  let summary = "for internal conversion only";
-
-  let description = [{
-    Prior to lowering to LLVM IR dialect, a DeclareOp will
-    be converted to an extended DeclareOp.
-  }];
-
-  let arguments = (ins
-    AnyRefOrBox:$memref,
-    Variadic<AnyCoordinateType>:$shape,
-    Variadic<AnyCoordinateType>:$shift,
-    Variadic<AnyIntegerType>:$typeparams,
-    Optional<fir_DummyScopeType>:$dummy_scope,
-    Builtin_StringAttr:$uniq_name
-  );
-  let results = (outs AnyRefOrBox);
-
-  let assemblyFormat = [{
-    $memref (`(` $shape^ `)`)? (`origin` $shift^)? (`typeparams` $typeparams^)?
-    (`dummy_scope` $dummy_scope^)?
-    attr-dict `:` functional-type(operands, results)
-  }];
-
-  let extraClassDeclaration = [{
-    // Shape is optional, but if it exists, it will be at offset 1.
-    unsigned shapeOffset() { return 1; }
-    unsigned shiftOffset() { return shapeOffset() + getShape().size(); }
-  }];
-}
-
 #endif
diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td
index 565920e55e6a8d..f524fb42373444 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td
+++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td
@@ -47,10 +47,6 @@ def CodeGenRewrite : Pass<"cg-rewrite", "mlir::ModuleOp"> {
   let dependentDialects = [
     "fir::FIROpsDialect", "fir::FIRCodeGenDialect"
   ];
-  let options = [
-    Option<"preserveDeclare", "preserve-declare", "bool", /*default=*/"false",
-           "Preserve DeclareOp during pre codegen re-write.">
-  ];
   let statistics = [
     Statistic<"numDCE", "num-dce'd", "Number of operations eliminated">
   ];
diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h
index 4d2b191b46d088..26097dabf56c45 100644
--- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h
+++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h
@@ -30,8 +30,7 @@ struct NameUniquer;
 
 /// Prerequiste pass for code gen. Perform intermediate rewrites to perform
 /// the code gen (to LLVM-IR dialect) conversion.
-std::unique_ptr<mlir::Pass> createFirCodeGenRewritePass(
-    CodeGenRewriteOptions Options = CodeGenRewriteOptions{});
+std::unique_ptr<mlir::Pass> createFirCodeGenRewritePass();
 
 /// FirTargetRewritePass options.
 struct TargetRewriteOptions {
@@ -89,8 +88,7 @@ void populateFIRToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
                                          fir::FIRToLLVMPassOptions &options);
 
 /// Populate the pattern set with the PreCGRewrite patterns.
-void populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns,
-                                  bool preserveDeclare);
+void populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns);
 
 // declarative passes
 #define GEN_PASS_REGISTRATION
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 761315e0abc812..cc3431d5b71d29 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -169,11 +169,9 @@ inline void addMemoryAllocationOpt(mlir::PassManager &pm) {
 }
 
 #if !defined(FLANG_EXCLUDE_CODEGEN)
-inline void addCodeGenRewritePass(mlir::PassManager &pm, bool preserveDeclare) {
-  fir::CodeGenRewriteOptions options;
-  options.preserveDeclare = preserveDeclare;
-  addPassConditionally(pm, disableCodeGenRewrite,
-      [&]() { return fir::createFirCodeGenRewritePass(options); });
+inline void addCodeGenRewritePass(mlir::PassManager &pm) {
+  addPassConditionally(
+      pm, disableCodeGenRewrite, fir::createFirCodeGenRewritePass);
 }
 
 inline void addTargetRewritePass(mlir::PassManager &pm) {
@@ -355,8 +353,7 @@ inline void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm,
     MLIRToLLVMPassPipelineConfig config, llvm::StringRef inputFilename = {}) {
   fir::addBoxedProcedurePass(pm);
   addNestedPassToAllTopLevelOperations(pm, fir::createAbstractResultOpt);
-  fir::addCodeGenRewritePass(
-      pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo));
+  fir::addCodeGenRewritePass(pm);
  fir::addTargetRewritePass(pm);
   fir::addExternalNameConversionPass(pm, config.Underscoring);
   fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename);
diff --git a/flang/lib/Optimizer/CodeGen/CGOps.cpp b/flang/lib/Optimizer/CodeGen/CGOps.cpp
index 6b8ba74525556e..44d07d26dd2b68 100644
--- a/flang/lib/Optimizer/CodeGen/CGOps.cpp
+++ b/flang/lib/Optimizer/CodeGen/CGOps.cpp
@@ -10,7 +10,7 @@
 //
//===----------------------------------------------------------------------===//
 
-#include "flang/Optimizer/CodeGen/CGOps.h"
+#include "CGOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
diff --git a/flang/include/flang/Optimizer/CodeGen/CGOps.h b/flang/lib/Optimizer/CodeGen/CGOps.h
similarity index 94%
rename from flang/include/flang/Optimizer/CodeGen/CGOps.h
rename to flang/lib/Optimizer/CodeGen/CGOps.h
index df909d9ee81cb4..b5a6d5bb9a9e6f 100644
--- a/flang/include/flang/Optimizer/CodeGen/CGOps.h
+++ b/flang/lib/Optimizer/CodeGen/CGOps.h
@@ -13,7 +13,6 @@
 #ifndef OPTIMIZER_CODEGEN_CGOPS_H
 #define OPTIMIZER_CODEGEN_CGOPS_H
 
-#include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 72172f63888e1c..21154902d23f8f 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -12,7 +12,7 @@
 
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 
-#include "flang/Optimizer/CodeGen/CGOps.h"
+#include "CGOps.h"
 #include "flang/Optimizer/CodeGen/CodeGenOpenMP.h"
 #include "flang/Optimizer/CodeGen/FIROpPatterns.h"
 #include "flang/Optimizer/CodeGen/TypeConverter.h"
@@ -170,28 +170,6 @@ genAllocationScaleSize(OP op, mlir::Type ity,
   return nullptr;
 }
 
-namespace {
-struct DeclareOpConversion : public fir::FIROpConversion<fir::cg::XDeclareOp> {
-public:
-  using FIROpConversion::FIROpConversion;
-  mlir::LogicalResult
-  matchAndRewrite(fir::cg::XDeclareOp declareOp, OpAdaptor adaptor,
-                  mlir::ConversionPatternRewriter &rewriter) const override {
-    auto memRef = adaptor.getOperands()[0];
-    if (auto fusedLoc = mlir::dyn_cast<mlir::FusedLoc>(declareOp.getLoc())) {
-      if (auto varAttr =
-              mlir::dyn_cast_or_null<mlir::LLVM::DILocalVariableAttr>(
-                  fusedLoc.getMetadata())) {
-        rewriter.create<mlir::LLVM::DbgDeclareOp>(memRef.getLoc(), memRef,
-                                                  varAttr, nullptr);
-      }
-    }
-    rewriter.replaceOp(declareOp, memRef);
-    return mlir::success();
-  }
-};
-} // namespace
-
 namespace {
 /// convert to LLVM IR dialect `alloca`
 struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
@@ -3736,19 +3714,19 @@ void fir::populateFIRToLLVMConversionPatterns(
       BoxOffsetOpConversion, BoxProcHostOpConversion, BoxRankOpConversion,
       BoxTypeCodeOpConversion, BoxTypeDescOpConversion, CallOpConversion,
       CmpcOpConversion, ConstcOpConversion, ConvertOpConversion,
-      CoordinateOpConversion, DTEntryOpConversion, DeclareOpConversion,
-      DivcOpConversion, EmboxOpConversion, EmboxCharOpConversion,
-      EmboxProcOpConversion, ExtractValueOpConversion, FieldIndexOpConversion,
-      FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion,
-      GlobalOpConversion, HasValueOpConversion, InsertOnRangeOpConversion,
-      InsertValueOpConversion, IsPresentOpConversion, LenParamIndexOpConversion,
-      LoadOpConversion, MulcOpConversion, NegcOpConversion,
-      NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion,
-      SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion,
-      ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion,
-      StoreOpConversion, StringLitOpConversion, SubcOpConversion,
-      TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion,
-      UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion,
+      CoordinateOpConversion, DTEntryOpConversion, DivcOpConversion,
+      EmboxOpConversion, EmboxCharOpConversion, EmboxProcOpConversion,
+      ExtractValueOpConversion, FieldIndexOpConversion, FirEndOpConversion,
+      FreeMemOpConversion, GlobalLenOpConversion, GlobalOpConversion,
+      HasValueOpConversion, InsertOnRangeOpConversion, InsertValueOpConversion,
+      IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion,
+      MulcOpConversion, NegcOpConversion, NoReassocOpConversion,
+      SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion,
+      SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion,
+      ShiftOpConversion, SliceOpConversion, StoreOpConversion,
+      StringLitOpConversion, SubcOpConversion, TypeDescOpConversion,
+      TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion,
+      UndefOpConversion, UnreachableOpConversion,
       UnrealizedConversionCastOpConversion, XArrayCoorOpConversion,
       XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter,
                                                                 options);
diff --git a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
index c54a7457db7610..5bd3ec8d18450e 100644
--- a/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
+++ b/flang/lib/Optimizer/CodeGen/PreCGRewrite.cpp
@@ -12,8 +12,8 @@
 
 #include "flang/Optimizer/CodeGen/CodeGen.h"
 
+#include "CGOps.h"
 #include "flang/Optimizer/Builder/Todo.h" // remove when TODO's are done
-#include "flang/Optimizer/CodeGen/CGOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
@@ -270,43 +270,13 @@ class ArrayCoorConversion : public mlir::OpRewritePattern<fir::ArrayCoorOp> {
 };
 
 class DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
-  bool preserveDeclare;
-
 public:
   using OpRewritePattern::OpRewritePattern;
-  DeclareOpConversion(mlir::MLIRContext *ctx, bool preserveDecl)
-      : OpRewritePattern(ctx), preserveDeclare(preserveDecl) {}
   mlir::LogicalResult
   matchAndRewrite(fir::DeclareOp declareOp,
                   mlir::PatternRewriter &rewriter) const override {
-    if (!preserveDeclare) {
-      rewriter.replaceOp(declareOp, declareOp.getMemref());
-      return mlir::success();
-    }
-    auto loc = declareOp.getLoc();
-    llvm::SmallVector<mlir::Value> shapeOpers;
-    llvm::SmallVector<mlir::Value> shiftOpers;
-    if (auto shapeVal = declareOp.getShape()) {
-      if (auto shapeOp = mlir::dyn_cast<fir::ShapeOp>(shapeVal.getDefiningOp()))
-        populateShape(shapeOpers, shapeOp);
-      else if (auto shiftOp =
-                   mlir::dyn_cast<fir::ShapeShiftOp>(shapeVal.getDefiningOp()))
-        populateShapeAndShift(shapeOpers, shiftOpers, shiftOp);
-      else if (auto shiftOp =
-                   mlir::dyn_cast<fir::ShiftOp>(shapeVal.getDefiningOp()))
-        populateShift(shiftOpers, shiftOp);
-      else
-        return mlir::failure();
-    }
-    // FIXME: Add FortranAttrs and CudaAttrs
-    auto xDeclOp = rewriter.create<fir::cg::XDeclareOp>(
-        loc, declareOp.getType(), declareOp.getMemref(), shapeOpers, shiftOpers,
-        declareOp.getTypeparams(), declareOp.getDummyScope(),
-        declareOp.getUniqName());
-    LLVM_DEBUG(llvm::dbgs()
-               << "rewriting " << declareOp << " to " << xDeclOp << '\n');
-    rewriter.replaceOp(declareOp, xDeclOp.getOperation()->getResults());
+    rewriter.replaceOp(declareOp, declareOp.getMemref());
     return mlir::success();
   }
 };
@@ -327,7 +297,6 @@ class DummyScopeOpConversion
 class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 public:
-  CodeGenRewrite(fir::CodeGenRewriteOptions opts) : Base(opts) {}
   void runOnOperation() override final {
     mlir::ModuleOp mod = getOperation();
@@ -345,7 +314,7 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
               mlir::cast<fir::BaseBoxType>(embox.getType()).getEleTy()));
     });
     mlir::RewritePatternSet patterns(&context);
-    fir::populatePreCGRewritePatterns(patterns, preserveDeclare);
+    fir::populatePreCGRewritePatterns(patterns);
     if (mlir::failed(
             mlir::applyPartialConversion(mod, target, std::move(patterns)))) {
       mlir::emitError(mlir::UnknownLoc::get(&context),
@@ -361,14 +330,12 @@ class CodeGenRewrite : public fir::impl::CodeGenRewriteBase<CodeGenRewrite> {
 
 } // namespace
 
-std::unique_ptr<mlir::Pass>
-fir::createFirCodeGenRewritePass(fir::CodeGenRewriteOptions Options) {
-  return std::make_unique<CodeGenRewrite>(Options);
+std::unique_ptr<mlir::Pass> fir::createFirCodeGenRewritePass() {
+  return std::make_unique<CodeGenRewrite>();
 }
 
-void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns,
-                                       bool preserveDeclare) {
+void fir::populatePreCGRewritePatterns(mlir::RewritePatternSet &patterns) {
   patterns.insert<EmboxConversion, ArrayCoorConversion, ReboxConversion,
-                  DummyScopeOpConversion>(patterns.getContext());
-  patterns.add<DeclareOpConversion>(patterns.getContext(), preserveDeclare);
+                  DeclareOpConversion, DummyScopeOpConversion>(
+      patterns.getContext());
 }
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index cfad366cb5cb5a..908c8fc96f633e 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -15,7 +15,6 @@
 #include "flang/Common/Version.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
-#include "flang/Optimizer/CodeGen/CGOps.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
@@ -46,59 +45,13 @@ namespace fir {
 namespace {
 
 class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
-  void handleDeclareOp(fir::cg::XDeclareOp declOp,
-                       mlir::LLVM::DIFileAttr fileAttr,
-                       mlir::LLVM::DIScopeAttr scopeAttr,
-                       fir::DebugTypeGenerator &typeGen);
-
 public:
   AddDebugInfoPass(fir::AddDebugInfoOptions options) : Base(options) {}
   void runOnOperation() override;
 };
 
-static uint32_t getLineFromLoc(mlir::Location loc) {
-  uint32_t line = 1;
-  if (auto fileLoc = mlir::dyn_cast<mlir::FileLineColLoc>(loc))
-    line = fileLoc.getLine();
-  return line;
-}
-
 } // namespace
 
-void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
-                                       mlir::LLVM::DIFileAttr fileAttr,
-                                       mlir::LLVM::DIScopeAttr scopeAttr,
-                                       fir::DebugTypeGenerator &typeGen) {
-  mlir::MLIRContext *context = &getContext();
-  mlir::OpBuilder builder(context);
-  auto result = fir::NameUniquer::deconstruct(declOp.getUniqName());
-
-  if (result.first != fir::NameUniquer::NameKind::VARIABLE)
-    return;
-
-  // Only accept local variables.
-  if (result.second.procs.empty())
-    return;
-
-  // FIXME: There may be cases where an argument is processed a bit before
-  // DeclareOp is generated. In that case, DeclareOp may point to an
-  // intermediate op and not to BlockArgument. We need to find those cases and
-  // walk the chain to get to the actual argument.
-
-  unsigned argNo = 0;
-  if (auto Arg = llvm::dyn_cast<mlir::BlockArgument>(declOp.getMemref()))
-    argNo = Arg.getArgNumber() + 1;
-
-  auto tyAttr = typeGen.convertType(fir::unwrapRefType(declOp.getType()),
-                                    fileAttr, scopeAttr, declOp.getLoc());
-
-  auto localVarAttr = mlir::LLVM::DILocalVariableAttr::get(
-      context, scopeAttr, mlir::StringAttr::get(context, result.second.name),
-      fileAttr, getLineFromLoc(declOp.getLoc()), argNo, /* alignInBits*/ 0,
-      tyAttr);
-  declOp->setLoc(builder.getFusedLoc({declOp->getLoc()}, localVarAttr));
-}
-
 void AddDebugInfoPass::runOnOperation() {
   mlir::ModuleOp module = getOperation();
   mlir::MLIRContext *context = &getContext();
@@ -191,15 +144,14 @@ void AddDebugInfoPass::runOnOperation() {
       subprogramFlags =
           subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
     }
-    unsigned line = getLineFromLoc(l);
+    unsigned line = 1;
+    if (auto funcLoc = mlir::dyn_cast<mlir::FileLineColLoc>(l))
+      line = funcLoc.getLine();
+
     auto spAttr = mlir::LLVM::DISubprogramAttr::get(
         context, id, compilationUnit, fileAttr, funcName, fullName,
         funcFileAttr, line, line, subprogramFlags, subTypeAttr);
     funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
-
-    funcOp.walk([&](fir::cg::XDeclareOp declOp) {
-      handleDeclareOp(declOp, fileAttr, spAttr, typeGen);
-    });
   });
 }
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index 64c6547e06e0f9..e5b4050dfb2426 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -24,6 +24,11 @@ DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m)
   LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n");
 }
 
+static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
+  return mlir::LLVM::DIBasicTypeAttr::get(
+      context, llvm::dwarf::DW_TAG_base_type, "void", 32, 1);
+}
+
 static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context,
                                            mlir::StringAttr name,
                                            unsigned bitSize,
@@ -32,11 +37,6 @@ static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context,
       context, llvm::dwarf::DW_TAG_base_type, name, bitSize, decoding);
 }
 
-static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
-  return genBasicType(context, mlir::StringAttr::get(context, "integer"), 32,
-                      llvm::dwarf::DW_ATE_signed);
-}
-
 mlir::LLVM::DITypeAttr
 DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
                                 mlir::LLVM::DIScopeAttr scope,
diff --git a/flang/test/Fir/declare-codegen.fir b/flang/test/Fir/declare-codegen.fir
index c5879facb1572f..9d68d3b2f9d4de 100644
--- a/flang/test/Fir/declare-codegen.fir
+++ b/flang/test/Fir/declare-codegen.fir
@@ -1,7 +1,5 @@
 // Test rewrite of fir.declare. The result is replaced by the memref operand.
-// RUN: fir-opt --cg-rewrite="preserve-declare=true" %s -o - | FileCheck %s --check-prefixes DECL -// RUN: fir-opt --cg-rewrite="preserve-declare=false" %s -o - | FileCheck %s --check-prefixes NODECL -// RUN: fir-opt --cg-rewrite %s -o - | FileCheck %s --check-prefixes NODECL +// RUN: fir-opt --cg-rewrite %s -o - | FileCheck %s func.func @test(%arg0: !fir.ref>) { @@ -17,14 +15,9 @@ func.func @test(%arg0: !fir.ref>) { func.func private @bar(%arg0: !fir.ref>) -// NODECL-LABEL: func.func @test( -// NODECL-SAME: %[[arg0:.*]]: !fir.ref>) { -// NODECL-NEXT: fir.call @bar(%[[arg0]]) : (!fir.ref>) -> () - -// DECL-LABEL: func.func @test( -// DECL-SAME: %[[arg0:.*]]: !fir.ref>) { -// DECL: fircg.ext_declare - +// CHECK-LABEL: func.func @test( +// CHECK-SAME: %[[arg0:.*]]: !fir.ref>) { +// CHECK-NEXT: fir.call @bar(%[[arg0]]) : (!fir.ref>) -> () func.func @useless_shape_with_duplicate_extent_operand(%arg0: !fir.ref>) { %c3 = arith.constant 3 : index @@ -33,8 +26,5 @@ func.func @useless_shape_with_duplicate_extent_operand(%arg0: !fir.ref) { %scope = fir.dummy_scope : !fir.dscope %0 = fir.declare %arg0 dummy_scope %scope {uniq_name = "x"} : (!fir.ref, !fir.dscope) -> !fir.ref return } -// DECL-LABEL: func.func @dummy_scope( -// DECL: fircg.ext_declare - -// NODECL-LABEL: func.func @dummy_scope( -// NODECL-NEXT: return \ No newline at end of file +// CHECK-LABEL: func.func @dummy_scope( +// CHECK-NEXT: return diff --git a/flang/test/Transforms/debug-local-var-2.f90 b/flang/test/Transforms/debug-local-var-2.f90 deleted file mode 100644 index 15b9b148492e14..00000000000000 --- a/flang/test/Transforms/debug-local-var-2.f90 +++ /dev/null @@ -1,91 +0,0 @@ -! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s - -! This tests checks the debug information for local variables in llvm IR. - -! CHECK-LABEL: define void @_QQmain -! CHECK-DAG: %[[AL11:.*]] = alloca i32 -! CHECK-DAG: %[[AL12:.*]] = alloca i64 -! CHECK-DAG: %[[AL13:.*]] = alloca i8 -! CHECK-DAG: %[[AL14:.*]] = alloca i32 -! CHECK-DAG: %[[AL15:.*]] = alloca float -! CHECK-DAG: %[[AL16:.*]] = alloca double -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL11]], metadata ![[I4:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL12]], metadata ![[I8:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL13]], metadata ![[L1:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL14]], metadata ![[L4:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL15]], metadata ![[R4:.*]], metadata !DIExpression()) -! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL16]], metadata ![[R8:.*]], metadata !DIExpression()) -! CHECK-LABEL: } - -! CHECK-LABEL: define {{.*}}i64 @_QFPfn1 -! CHECK-SAME: (ptr %[[ARG1:.*]], ptr %[[ARG2:.*]], ptr %[[ARG3:.*]]) -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression()) -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression()) -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression()) -! CHECK-DAG: %[[AL2:.*]] = alloca i64 -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression()) -! CHECK-LABEL: } - -! CHECK-LABEL: define {{.*}}i32 @_QFPfn2 -! 
CHECK-SAME: (ptr %[[FN2ARG1:.*]], ptr %[[FN2ARG2:.*]], ptr %[[FN2ARG3:.*]]) -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression()) -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression()) -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression()) -! CHECK-DAG: %[[AL3:.*]] = alloca i32 -! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression()) -! CHECK-LABEL: } - -program mn -! CHECK-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "_QQmain", {{.*}}) - -! CHECK-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) -! CHECK-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) -! CHECK-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical", size: 8, encoding: DW_ATE_boolean) -! CHECK-DAG: ![[TYL32:.*]] = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean) -! CHECK-DAG: ![[TYR32:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float) -! CHECK-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real", size: 64, encoding: DW_ATE_float) - -! CHECK-DAG: ![[I4]] = !DILocalVariable(name: "i4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI32]]) -! CHECK-DAG: ![[I8]] = !DILocalVariable(name: "i8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI64]]) -! CHECK-DAG: ![[R4]] = !DILocalVariable(name: "r4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYR32]]) -! CHECK-DAG: ![[R8]] = !DILocalVariable(name: "r8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYR64]]) -! CHECK-DAG: ![[L1]] = !DILocalVariable(name: "l1", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYL8]]) -! CHECK-DAG: ![[L4]] = !DILocalVariable(name: "l4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYL32]]) - integer(kind=4) :: i4 - integer(kind=8) :: i8 - real(kind=4) :: r4 - real(kind=8) :: r8 - logical(kind=1) :: l1 - logical(kind=4) :: l4 - - i8 = fn1(i4, r8, l1) - i4 = fn2(i8, r4, l4) -contains -! CHECK-DAG: ![[FN1:.*]] = distinct !DISubprogram(name: "fn1", {{.*}}) -! CHECK-DAG: ![[A1]] = !DILocalVariable(name: "a1", arg: 1, scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI32]]) -! CHECK-DAG: ![[B1]] = !DILocalVariable(name: "b1", arg: 2, scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYR64]]) -! CHECK-DAG: ![[C1]] = !DILocalVariable(name: "c1", arg: 3, scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYL8]]) -! CHECK-DAG: ![[RES1]] = !DILocalVariable(name: "res1", scope: ![[FN1]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI64]]) - function fn1(a1, b1, c1) result (res1) - integer(kind=4), intent(in) :: a1 - real(kind=8), intent(in) :: b1 - logical(kind=1), intent(in) :: c1 - integer(kind=8) :: res1 - - res1 = a1 + b1 - end function - -! CHECK-DAG: ![[FN2:.*]] = distinct !DISubprogram(name: "fn2", {{.*}}) -! CHECK-DAG: ![[A2]] = !DILocalVariable(name: "a2", arg: 1, scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI64]]) -! CHECK-DAG: ![[B2]] = !DILocalVariable(name: "b2", arg: 2, scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYR32]]) -! CHECK-DAG: ![[C2]] = !DILocalVariable(name: "c2", arg: 3, scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYL32]]) -! 
CHECK-DAG: ![[RES2]] = !DILocalVariable(name: "res2", scope: ![[FN2]], file: !{{.*}}, line: [[@LINE+5]], type: ![[TYI32]]) - function fn2(a2, b2, c2) result (res2) - integer(kind=8), intent(in) :: a2 - real(kind=4), intent(in) :: b2 - logical(kind=4), intent(in) :: c2 - integer(kind=4) :: res2 - - res2 = a2 + b2 - end function -end program diff --git a/flang/test/Transforms/debug-local-var.f90 b/flang/test/Transforms/debug-local-var.f90 deleted file mode 100644 index 96dc111ad308e1..00000000000000 --- a/flang/test/Transforms/debug-local-var.f90 +++ /dev/null @@ -1,54 +0,0 @@ -! RUN: %flang_fc1 -emit-fir -debug-info-kind=standalone -mmlir --mlir-print-debuginfo %s -o - | \ -! RUN: fir-opt --cg-rewrite="preserve-declare=true" --mlir-print-debuginfo | fir-opt --add-debug-info --mlir-print-debuginfo | FileCheck %s - -! CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type -! CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type -! CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type -! CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type -! CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type -! CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type -! CHECK-DAG: #[[MAIN:.*]] = #llvm.di_subprogram<{{.*}}name = "_QQmain"{{.*}}> -! CHECK-DAG: #[[FN1:.*]] = #llvm.di_subprogram<{{.*}}name = "fn1"{{.*}}> -! CHECK-DAG: #[[FN2:.*]] = #llvm.di_subprogram<{{.*}}name = "fn2"{{.*}}> - -program mn -! CHECK-DAG: #[[I4:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[I8:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[R4:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[R8:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[L1:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[L4:.*]] = #llvm.di_local_variable - integer(kind=4) :: i4 - integer(kind=8) :: i8 - real(kind=4) :: r4 - real(kind=8) :: r8 - logical(kind=1) :: l1 - logical(kind=4) :: l4 - i8 = fn1(i4, r8, l1) - i4 = fn2(i8, r4, l4) -contains - function fn1(a1, b1, c1) result (res1) -! CHECK-DAG: #[[A1:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[B1:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[C1:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[RES1:.*]] = #llvm.di_local_variable - integer(kind=4), intent(in) :: a1 - real(kind=8), intent(in) :: b1 - logical(kind=1), intent(in) :: c1 - integer(kind=8) :: res1 - res1 = a1 + b1 - end function - - function fn2(a2, b2, c2) result (res2) - implicit none -! CHECK-DAG: #[[A2:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[B2:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[C2:.*]] = #llvm.di_local_variable -! CHECK-DAG: #[[RES2:.*]] = #llvm.di_local_variable - integer(kind=8), intent(in) :: a2 - real(kind=4), intent(in) :: b2 - logical(kind=4), intent(in) :: c2 - integer(kind=4) :: res2 - res2 = a2 + b2 - end function -end program From 411bf385ba27f15145c635c7d8ff2701fe8de5b9 Mon Sep 17 00:00:00 2001 From: Walter Erquinigo Date: Wed, 15 May 2024 20:44:12 +0200 Subject: [PATCH 68/71] [lldb-dap] Include npm install in the extension installation steps (#92028) Otherwise the build step fails due to missing dependencies. --- lldb/tools/lldb-dap/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 274b1519208a16..16ce4672be71c3 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ -46,6 +46,7 @@ Installing the plug-in is very straightforward and involves just a few steps. ```bash cd /path/to/lldb/tools/lldb-dap +npm install npm run package # This also compiles the extension. npm run vscode-install ``` @@ -69,6 +70,7 @@ no effect. 
 ```bash
 # Bump version in package.json
 cd /path/to/lldb/tools/lldb-dap
+npm install
 npm run package
 npm run vscode-install
 ```

From 2c54bf497f7d7aecd24f4b849ee08e37a3519611 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Wed, 15 May 2024 11:44:26 -0700
Subject: Revert "Reapply "[ctx_profile] Profile reader and writer" (#92199)"

This reverts commit c19f2c773b0e23fd623502888894add822079f63.

Broke the gcc-7 bot.
---
 .../llvm/ProfileData/PGOCtxProfReader.h       |  92 -------
 .../llvm/ProfileData/PGOCtxProfWriter.h       |  91 -------
 llvm/lib/ProfileData/CMakeLists.txt           |   3 -
 llvm/lib/ProfileData/PGOCtxProfReader.cpp     | 173 ------------
 llvm/lib/ProfileData/PGOCtxProfWriter.cpp     |  49 ----
 llvm/unittests/ProfileData/CMakeLists.txt     |   1 -
 .../PGOCtxProfReaderWriterTest.cpp            | 255 ------------------
 7 files changed, 664 deletions(-)
 delete mode 100644 llvm/include/llvm/ProfileData/PGOCtxProfReader.h
 delete mode 100644 llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
 delete mode 100644 llvm/lib/ProfileData/PGOCtxProfReader.cpp
 delete mode 100644 llvm/lib/ProfileData/PGOCtxProfWriter.cpp
 delete mode 100644 llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp

diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h b/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
deleted file mode 100644
index a19b3f51d642de..00000000000000
--- a/llvm/include/llvm/ProfileData/PGOCtxProfReader.h
+++ /dev/null
@@ -1,92 +0,0 @@
-//===--- PGOCtxProfReader.h - Contextual profile reader ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-///
-/// Reader for contextual iFDO profile, which comes in bitstream format.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_PROFILEDATA_CTXINSTRPROFILEREADER_H
-#define LLVM_PROFILEDATA_CTXINSTRPROFILEREADER_H
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/Bitstream/BitstreamReader.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/ProfileData/PGOCtxProfWriter.h"
-#include "llvm/Support/Error.h"
-#include <map>
-#include <vector>
-
-namespace llvm {
-/// The loaded contextual profile, suitable for mutation during IPO passes. We
-/// generally expect a fraction of counters and of callsites to be populated.
-/// We continue to model counters as vectors, but callsites are modeled as a map
-/// of a map. The expectation is that, typically, there is a small number of
-/// indirect targets (usually, 1 for direct calls); but potentially a large
-/// number of callsites, and, as inlining progresses, the callsite count of a
-/// caller will grow.
-class PGOContextualProfile final {
-public:
-  using CallTargetMapTy = std::map<GlobalValue::GUID, PGOContextualProfile>;
-  using CallsiteMapTy = DenseMap<uint32_t, CallTargetMapTy>;
-
-private:
-  friend class PGOCtxProfileReader;
-  GlobalValue::GUID GUID = 0;
-  SmallVector<uint64_t, 16> Counters;
-  CallsiteMapTy Callsites;
-
-  PGOContextualProfile(GlobalValue::GUID G,
-                       SmallVectorImpl<uint64_t> &&Counters)
-      : GUID(G), Counters(std::move(Counters)) {}
-
-  Expected<PGOContextualProfile &>
-  getOrEmplace(uint32_t Index, GlobalValue::GUID G,
-               SmallVectorImpl<uint64_t> &&Counters);
-
-public:
-  PGOContextualProfile(const PGOContextualProfile &) = delete;
-  PGOContextualProfile &operator=(const PGOContextualProfile &) = delete;
-  PGOContextualProfile(PGOContextualProfile &&) = default;
-  PGOContextualProfile &operator=(PGOContextualProfile &&) = default;
-
-  GlobalValue::GUID guid() const { return GUID; }
-  const SmallVectorImpl<uint64_t> &counters() const { return Counters; }
-  const CallsiteMapTy &callsites() const { return Callsites; }
-  CallsiteMapTy &callsites() { return Callsites; }
-
-  bool hasCallsite(uint32_t I) const {
-    return Callsites.find(I) != Callsites.end();
-  }
-
-  const CallTargetMapTy &callsite(uint32_t I) const {
-    assert(hasCallsite(I) && "Callsite not found");
-    return Callsites.find(I)->second;
-  }
-  void getContainedGuids(DenseSet<GlobalValue::GUID> &Guids) const;
-};
-
-class PGOCtxProfileReader final {
-  BitstreamCursor &Cursor;
-  Expected<BitstreamEntry> advance();
-  Error readMetadata();
-  Error wrongValue(const Twine &);
-  Error unsupported(const Twine &);
-
-  Expected<std::pair<std::optional<uint32_t>, PGOContextualProfile>>
-  readContext(bool ExpectIndex);
-  bool canReadContext();
-
-public:
-  PGOCtxProfileReader(BitstreamCursor &Cursor) : Cursor(Cursor) {}
-
-  Expected<std::map<GlobalValue::GUID, PGOContextualProfile>> loadContexts();
-};
-} // namespace llvm
-#endif
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
deleted file mode 100644
index 15578c51a49578..00000000000000
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ /dev/null
@@ -1,91 +0,0 @@
-//===- PGOCtxProfWriter.h - Contextual Profile Writer -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares a utility for writing a contextual profile to bitstream.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
-#define LLVM_PROFILEDATA_PGOCTXPROFWRITER_H_
-
-#include "llvm/Bitstream/BitstreamWriter.h"
-#include "llvm/ProfileData/CtxInstrContextNode.h"
-
-namespace llvm {
-enum PGOCtxProfileRecords { Invalid = 0, Version, Guid, CalleeIndex, Counters };
-
-enum PGOCtxProfileBlockIDs {
-  ProfileMetadataBlockID = 100,
-  ContextNodeBlockID = ProfileMetadataBlockID + 1
-};
-
-/// Write one or more ContextNodes to the provided raw_fd_stream.
-/// The caller must destroy the PGOCtxProfileWriter object before closing the
-/// stream.
-/// The design allows serializing a bunch of contexts embedded in some other
-/// file. The overall format is:
-///
-///  [... other data written to the stream...]
-///  SubBlock(ProfileMetadataBlockID)
-///   Version
-///   SubBlock(ContextNodeBlockID)
-///     [RECORDS]
-///     SubBlock(ContextNodeBlockID)
-///       [RECORDS]
-///       [... more SubBlocks]
-///     EndBlock
-///   EndBlock
-///
-/// The "RECORDS" are bitsream records. The IDs are in CtxProfileCodes (except)
-/// for Version, which is just for metadata). All contexts will have Guid and
-/// Counters, and all but the roots have CalleeIndex. The order in which the
-/// records appear does not matter, but they must precede any subcontexts,
-/// because that helps keep the reader code simpler.
-///
-/// Subblock containment captures the context->subcontext relationship. The
-/// "next()" relationship in the raw profile, between call targets of indirect
-/// calls, are just modeled as peer subblocks where the callee index is the
-/// same.
-///
-/// Versioning: the writer may produce additional records not known by the
-/// reader. The version number indicates a more structural change.
-/// The current version, in particular, is set up to expect optional extensions
-/// like value profiling - which would appear as additional records. For
-/// example, value profiling would produce a new record with a new record ID,
-/// containing the profiled values (much like the counters)
-class PGOCtxProfileWriter final {
-  SmallVector<char, 1 << 20> Buff;
-  BitstreamWriter Writer;
-
-  void writeCounters(const ctx_profile::ContextNode &Node);
-  void writeImpl(std::optional<uint32_t> CallerIndex,
-                 const ctx_profile::ContextNode &Node);
-
-public:
-  PGOCtxProfileWriter(raw_fd_stream &Out,
-                      std::optional<uint32_t> VersionOverride = std::nullopt)
-      : Writer(Buff, &Out, 0) {
-    Writer.EnterSubblock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID,
-                         CodeLen);
-    const auto Version = VersionOverride ? *VersionOverride : CurrentVersion;
-    Writer.EmitRecord(PGOCtxProfileRecords::Version,
-                      SmallVector<uint64_t, 1>({Version}));
-  }
-
-  ~PGOCtxProfileWriter() { Writer.ExitBlock(); }
-
-  void write(const ctx_profile::ContextNode &);
-
-  // constants used in writing which a reader may find useful.
-  static constexpr unsigned CodeLen = 2;
-  static constexpr uint32_t CurrentVersion = 1;
-  static constexpr unsigned VBREncodingBits = 6;
-};
-
-} // namespace llvm
-#endif
diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt
index 4fa1b76f0a062c..408f9ff01ec87d 100644
--- a/llvm/lib/ProfileData/CMakeLists.txt
+++ b/llvm/lib/ProfileData/CMakeLists.txt
@@ -7,8 +7,6 @@ add_llvm_component_library(LLVMProfileData
   ItaniumManglingCanonicalizer.cpp
   MemProf.cpp
   MemProfReader.cpp
-  PGOCtxProfReader.cpp
-  PGOCtxProfWriter.cpp
   ProfileSummaryBuilder.cpp
   SampleProf.cpp
   SampleProfReader.cpp
@@ -22,7 +20,6 @@ add_llvm_component_library(LLVMProfileData
   intrinsics_gen
 
   LINK_COMPONENTS
-  BitstreamReader
   Core
   Object
   Support
diff --git a/llvm/lib/ProfileData/PGOCtxProfReader.cpp b/llvm/lib/ProfileData/PGOCtxProfReader.cpp
deleted file mode 100644
index 3710f2e4b81858..00000000000000
--- a/llvm/lib/ProfileData/PGOCtxProfReader.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-//===- PGOCtxProfReader.cpp - Contextual Instrumentation profile reader ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Read a contextual profile into a datastructure suitable for maintenance
-// throughout IPO
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ProfileData/PGOCtxProfReader.h"
-#include "llvm/Bitstream/BitCodeEnums.h"
-#include "llvm/Bitstream/BitstreamReader.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/ProfileData/PGOCtxProfWriter.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/Error.h"
-
-using namespace llvm;
-
-// FIXME(#92054) - these Error handling macros are (re-)invented in a few
-// places.
-#define EXPECT_OR_RET(LHS, RHS)                                                \
-  auto LHS = RHS;                                                              \
-  if (!LHS)                                                                    \
-    return LHS.takeError();
-
-#define RET_ON_ERR(EXPR)                                                       \
-  if (auto Err = (EXPR))                                                       \
-    return Err;
-
-Expected<PGOContextualProfile &>
-PGOContextualProfile::getOrEmplace(uint32_t Index, GlobalValue::GUID G,
-                                   SmallVectorImpl<uint64_t> &&Counters) {
-  auto [Iter, Inserted] = Callsites[Index].insert(
-      {G, PGOContextualProfile(G, std::move(Counters))});
-  if (!Inserted)
-    return make_error<InstrProfError>(instrprof_error::invalid_prof,
-                                      "Duplicate GUID for same callsite.");
-  return Iter->second;
-}
-
-void PGOContextualProfile::getContainedGuids(
-    DenseSet<GlobalValue::GUID> &Guids) const {
-  Guids.insert(GUID);
-  for (const auto &[_, Callsite] : Callsites)
-    for (const auto &[_, Callee] : Callsite)
-      Callee.getContainedGuids(Guids);
-}
-
-Expected<BitstreamEntry> PGOCtxProfileReader::advance() {
-  return Cursor.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
-}
-
-Error PGOCtxProfileReader::wrongValue(const Twine &Msg) {
-  return make_error<InstrProfError>(instrprof_error::invalid_prof, Msg);
-}
-
-Error PGOCtxProfileReader::unsupported(const Twine &Msg) {
-  return make_error<InstrProfError>(instrprof_error::unsupported_version, Msg);
-}
-
-bool PGOCtxProfileReader::canReadContext() {
-  auto Blk = advance();
-  if (!Blk) {
-    consumeError(Blk.takeError());
-    return false;
-  }
-  return Blk->Kind == BitstreamEntry::SubBlock &&
-         Blk->ID == PGOCtxProfileBlockIDs::ContextNodeBlockID;
-}
-
-Expected<std::pair<std::optional<uint32_t>, PGOContextualProfile>>
-PGOCtxProfileReader::readContext(bool ExpectIndex) {
-  RET_ON_ERR(Cursor.EnterSubBlock(PGOCtxProfileBlockIDs::ContextNodeBlockID));
-
-  std::optional<GlobalValue::GUID> Guid;
-  std::optional<SmallVector<uint64_t>> Counters;
-  std::optional<uint32_t> CallsiteIndex;
-
-  SmallVector<uint64_t> RecordValues;
-
-  // We don't prescribe the order in which the records come in, and we are ok
-  // if other unsupported records appear. We seek in the current subblock until
-  // we get all we know.
-  auto GotAllWeNeed = [&]() {
-    return Guid.has_value() && Counters.has_value() &&
-           (!ExpectIndex || CallsiteIndex.has_value());
-  };
-  while (!GotAllWeNeed()) {
-    RecordValues.clear();
-    EXPECT_OR_RET(Entry, advance());
-    if (Entry->Kind != BitstreamEntry::Record)
-      return wrongValue(
-          "Expected records before encountering more subcontexts");
-    EXPECT_OR_RET(ReadRecord,
-                  Cursor.readRecord(bitc::UNABBREV_RECORD, RecordValues));
-    switch (*ReadRecord) {
-    case PGOCtxProfileRecords::Guid:
-      if (RecordValues.size() != 1)
-        return wrongValue("The GUID record should have exactly one value");
-      Guid = RecordValues[0];
-      break;
-    case PGOCtxProfileRecords::Counters:
-      Counters = std::move(RecordValues);
-      if (Counters->empty())
-        return wrongValue("Empty counters. At least the entry counter (one "
-                          "value) was expected");
-      break;
-    case PGOCtxProfileRecords::CalleeIndex:
-      if (!ExpectIndex)
-        return wrongValue("The root context should not have a callee index");
-      if (RecordValues.size() != 1)
-        return wrongValue("The callee index should have exactly one value");
-      CallsiteIndex = RecordValues[0];
-      break;
-    default:
-      // OK if we see records we do not understand, like records (profile
-      // components) introduced later.
-      break;
-    }
-  }
-
-  PGOContextualProfile Ret(*Guid, std::move(*Counters));
-
-  while (canReadContext()) {
-    EXPECT_OR_RET(SC, readContext(true));
-    auto &Targets = Ret.callsites()[*SC->first];
-    auto [_, Inserted] =
-        Targets.insert({SC->second.guid(), std::move(SC->second)});
-    if (!Inserted)
-      return wrongValue(
-          "Unexpected duplicate target (callee) at the same callsite.");
-  }
-  return std::make_pair(CallsiteIndex, std::move(Ret));
-}
-
-Error PGOCtxProfileReader::readMetadata() {
-  EXPECT_OR_RET(Blk, advance());
-  if (Blk->Kind != BitstreamEntry::SubBlock)
-    return unsupported("Expected Version record");
-  RET_ON_ERR(
-      Cursor.EnterSubBlock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID));
-  EXPECT_OR_RET(MData, advance());
-  if (MData->Kind != BitstreamEntry::Record)
-    return unsupported("Expected Version record");
-
-  SmallVector<uint64_t, 1> Ver;
-  EXPECT_OR_RET(Code, Cursor.readRecord(bitc::UNABBREV_RECORD, Ver));
-  if (*Code != PGOCtxProfileRecords::Version)
-    return unsupported("Expected Version record");
-  if (Ver.size() != 1 || Ver[0] > PGOCtxProfileWriter::CurrentVersion)
-    return unsupported("Version " + Twine(*Code) +
-                       " is higher than supported version " +
-                       Twine(PGOCtxProfileWriter::CurrentVersion));
-  return Error::success();
-}
-
-Expected<std::map<GlobalValue::GUID, PGOContextualProfile>>
-PGOCtxProfileReader::loadContexts() {
-  std::map<GlobalValue::GUID, PGOContextualProfile> Ret;
-  RET_ON_ERR(readMetadata());
-  while (canReadContext()) {
-    EXPECT_OR_RET(E, readContext(false));
-    auto Key = E->second.guid();
-    if (!Ret.insert({Key, std::move(E->second)}).second)
-      return wrongValue("Duplicate roots");
-  }
-  return Ret;
-}
diff --git a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp b/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
deleted file mode 100644
index 50817975644693..00000000000000
--- a/llvm/lib/ProfileData/PGOCtxProfWriter.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- PGOCtxProfWriter.cpp - Contextual Instrumentation profile writer ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Write a contextual profile to bitstream.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ProfileData/PGOCtxProfWriter.h"
-#include "llvm/Bitstream/BitCodeEnums.h"
-
-using namespace llvm;
-using namespace llvm::ctx_profile;
-
-void PGOCtxProfileWriter::writeCounters(const ContextNode &Node) {
-  Writer.EmitCode(bitc::UNABBREV_RECORD);
-  Writer.EmitVBR(PGOCtxProfileRecords::Counters, VBREncodingBits);
-  Writer.EmitVBR(Node.counters_size(), VBREncodingBits);
-  for (uint32_t I = 0U; I < Node.counters_size(); ++I)
-    Writer.EmitVBR64(Node.counters()[I], VBREncodingBits);
-}
-
-// recursively write all the subcontexts. We do need to traverse depth first to
-// model the context->subcontext implicitly, and since this captures call
-// stacks, we don't really need to be worried about stack overflow and we can
-// keep the implementation simple.
-void PGOCtxProfileWriter::writeImpl(std::optional<uint32_t> CallerIndex,
-                                    const ContextNode &Node) {
-  Writer.EnterSubblock(PGOCtxProfileBlockIDs::ContextNodeBlockID, CodeLen);
-  Writer.EmitRecord(PGOCtxProfileRecords::Guid,
-                    SmallVector<uint64_t, 1>{Node.guid()});
-  if (CallerIndex)
-    Writer.EmitRecord(PGOCtxProfileRecords::CalleeIndex,
-                      SmallVector<uint64_t, 1>{*CallerIndex});
-  writeCounters(Node);
-  for (uint32_t I = 0U; I < Node.callsites_size(); ++I)
-    for (const auto *Subcontext = Node.subContexts()[I]; Subcontext;
-         Subcontext = Subcontext->next())
-      writeImpl(I, *Subcontext);
-  Writer.ExitBlock();
-}
-
-void PGOCtxProfileWriter::write(const ContextNode &RootNode) {
-  writeImpl(std::nullopt, RootNode);
-}
diff --git a/llvm/unittests/ProfileData/CMakeLists.txt b/llvm/unittests/ProfileData/CMakeLists.txt
index c92642ded82820..ce3a0a45ccf18c 100644
--- a/llvm/unittests/ProfileData/CMakeLists.txt
+++ b/llvm/unittests/ProfileData/CMakeLists.txt
@@ -13,7 +13,6 @@ add_llvm_unittest(ProfileDataTests
   InstrProfTest.cpp
   ItaniumManglingCanonicalizerTest.cpp
   MemProfTest.cpp
-  PGOCtxProfReaderWriterTest.cpp
   SampleProfTest.cpp
   SymbolRemappingReaderTest.cpp
   )
diff --git a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp b/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp
deleted file mode 100644
index d2cdbb28e2fce7..00000000000000
--- a/llvm/unittests/ProfileData/PGOCtxProfReaderWriterTest.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-//===-------------- PGOCtxProfReadWriteTest.cpp ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Bitstream/BitstreamReader.h"
-#include "llvm/ProfileData/CtxInstrContextNode.h"
-#include "llvm/ProfileData/PGOCtxProfReader.h"
-#include "llvm/ProfileData/PGOCtxProfWriter.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace llvm::ctx_profile;
-
-class PGOCtxProfRWTest : public ::testing::Test {
-  std::vector<std::unique_ptr<char[]>> Nodes;
-  std::map<GUID, const ContextNode *> Roots;
-
-public:
-  ContextNode *createNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
-                          ContextNode *Next = nullptr) {
-    auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
-    auto *Mem = Nodes.emplace_back(std::make_unique<char[]>(AllocSize)).get();
-    std::memset(Mem, 0, AllocSize);
-    auto *Ret = new (Mem) ContextNode(Guid, NrCounters, NrCallsites, Next);
-    return Ret;
-  }
-
-  void SetUp() override {
-    // Root (guid 1) has 2 callsites, one used for an indirect call to either
-    // guid 2 or 4.
-    // guid 2 calls guid 5
-    // guid 5 calls guid 2
-    // there's also a second root, guid3.
- auto *Root1 = createNode(1, 2, 2); - Root1->counters()[0] = 10; - Root1->counters()[1] = 11; - Roots.insert({1, Root1}); - auto *L1 = createNode(2, 1, 1); - L1->counters()[0] = 12; - Root1->subContexts()[1] = createNode(4, 3, 1, L1); - Root1->subContexts()[1]->counters()[0] = 13; - Root1->subContexts()[1]->counters()[1] = 14; - Root1->subContexts()[1]->counters()[2] = 15; - - auto *L3 = createNode(5, 6, 3); - for (auto I = 0; I < 6; ++I) - L3->counters()[I] = 16 + I; - L1->subContexts()[0] = L3; - L3->subContexts()[2] = createNode(2, 1, 1); - L3->subContexts()[2]->counters()[0] = 30; - auto *Root2 = createNode(3, 1, 0); - Root2->counters()[0] = 40; - Roots.insert({3, Root2}); - } - - const std::map &roots() const { return Roots; } -}; - -void checkSame(const ContextNode &Raw, const PGOContextualProfile &Profile) { - EXPECT_EQ(Raw.guid(), Profile.guid()); - ASSERT_EQ(Raw.counters_size(), Profile.counters().size()); - for (auto I = 0U; I < Raw.counters_size(); ++I) - EXPECT_EQ(Raw.counters()[I], Profile.counters()[I]); - - for (auto I = 0U; I < Raw.callsites_size(); ++I) { - if (Raw.subContexts()[I] == nullptr) - continue; - EXPECT_TRUE(Profile.hasCallsite(I)); - const auto &ProfileTargets = Profile.callsite(I); - - std::map Targets; - for (const auto *N = Raw.subContexts()[I]; N; N = N->next()) - EXPECT_TRUE(Targets.insert({N->guid(), N}).second); - - EXPECT_EQ(Targets.size(), ProfileTargets.size()); - for (auto It : Targets) { - auto PIt = ProfileTargets.find(It.second->guid()); - EXPECT_NE(PIt, ProfileTargets.end()); - checkSame(*It.second, PIt->second); - } - } -} - -TEST_F(PGOCtxProfRWTest, RoundTrip) { - llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true); - { - std::error_code EC; - raw_fd_stream Out(ProfileFile.path(), EC); - ASSERT_FALSE(EC); - { - PGOCtxProfileWriter Writer(Out); - for (auto &[_, R] : roots()) - Writer.write(*R); - } - } - { - ErrorOr> MB = - MemoryBuffer::getFile(ProfileFile.path()); - ASSERT_TRUE(!!MB); - ASSERT_NE(*MB, nullptr); - BitstreamCursor Cursor((*MB)->getBuffer()); - PGOCtxProfileReader Reader(Cursor); - auto Expected = Reader.loadContexts(); - ASSERT_TRUE(!!Expected); - auto &Ctxes = *Expected; - EXPECT_EQ(Ctxes.size(), roots().size()); - EXPECT_EQ(Ctxes.size(), 2U); - for (auto &[G, R] : roots()) - checkSame(*R, Ctxes.find(G)->second); - } -} - -TEST_F(PGOCtxProfRWTest, InvalidCounters) { - auto *R = createNode(1, 0, 1); - llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true); - { - std::error_code EC; - raw_fd_stream Out(ProfileFile.path(), EC); - ASSERT_FALSE(EC); - { - PGOCtxProfileWriter Writer(Out); - Writer.write(*R); - } - } - { - auto MB = MemoryBuffer::getFile(ProfileFile.path()); - ASSERT_TRUE(!!MB); - ASSERT_NE(*MB, nullptr); - BitstreamCursor Cursor((*MB)->getBuffer()); - PGOCtxProfileReader Reader(Cursor); - auto Expected = Reader.loadContexts(); - EXPECT_FALSE(Expected); - consumeError(Expected.takeError()); - } -} - -TEST_F(PGOCtxProfRWTest, Empty) { - BitstreamCursor Cursor(""); - PGOCtxProfileReader Reader(Cursor); - auto Expected = Reader.loadContexts(); - EXPECT_FALSE(Expected); - consumeError(Expected.takeError()); -} - -TEST_F(PGOCtxProfRWTest, Invalid) { - BitstreamCursor Cursor("Surely this is not valid"); - PGOCtxProfileReader Reader(Cursor); - auto Expected = Reader.loadContexts(); - EXPECT_FALSE(Expected); - consumeError(Expected.takeError()); -} - -TEST_F(PGOCtxProfRWTest, ValidButEmpty) { - llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true); - { - 
-    std::error_code EC;
-    raw_fd_stream Out(ProfileFile.path(), EC);
-    ASSERT_FALSE(EC);
-    {
-      PGOCtxProfileWriter Writer(Out);
-      // don't write anything - this will just produce the metadata subblock.
-    }
-  }
-  {
-    auto MB = MemoryBuffer::getFile(ProfileFile.path());
-    ASSERT_TRUE(!!MB);
-    ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
-    auto Expected = Reader.loadContexts();
-    EXPECT_TRUE(!!Expected);
-    EXPECT_TRUE(Expected->empty());
-  }
-}
-
-TEST_F(PGOCtxProfRWTest, WrongVersion) {
-  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
-  {
-    std::error_code EC;
-    raw_fd_stream Out(ProfileFile.path(), EC);
-    ASSERT_FALSE(EC);
-    {
-      PGOCtxProfileWriter Writer(Out, PGOCtxProfileWriter::CurrentVersion + 1);
-    }
-  }
-  {
-    auto MB = MemoryBuffer::getFile(ProfileFile.path());
-    ASSERT_TRUE(!!MB);
-    ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
-    auto Expected = Reader.loadContexts();
-    EXPECT_FALSE(Expected);
-    consumeError(Expected.takeError());
-  }
-}
-
-TEST_F(PGOCtxProfRWTest, DuplicateRoots) {
-  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
-  {
-    std::error_code EC;
-    raw_fd_stream Out(ProfileFile.path(), EC);
-    ASSERT_FALSE(EC);
-    {
-      PGOCtxProfileWriter Writer(Out);
-      Writer.write(*createNode(1, 1, 1));
-      Writer.write(*createNode(1, 1, 1));
-    }
-  }
-  {
-    auto MB = MemoryBuffer::getFile(ProfileFile.path());
-    ASSERT_TRUE(!!MB);
-    ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
-    auto Expected = Reader.loadContexts();
-    EXPECT_FALSE(Expected);
-    consumeError(Expected.takeError());
-  }
-}
-
-TEST_F(PGOCtxProfRWTest, DuplicateTargets) {
-  llvm::unittest::TempFile ProfileFile("ctx_profile", "", "", /*Unique*/ true);
-  {
-    std::error_code EC;
-    raw_fd_stream Out(ProfileFile.path(), EC);
-    ASSERT_FALSE(EC);
-    {
-      auto *R = createNode(1, 1, 1);
-      auto *L1 = createNode(2, 1, 0);
-      auto *L2 = createNode(2, 1, 0, L1);
-      R->subContexts()[0] = L2;
-      PGOCtxProfileWriter Writer(Out);
-      Writer.write(*R);
-    }
-  }
-  {
-    auto MB = MemoryBuffer::getFile(ProfileFile.path());
-    ASSERT_TRUE(!!MB);
-    ASSERT_NE(*MB, nullptr);
-    BitstreamCursor Cursor((*MB)->getBuffer());
-    PGOCtxProfileReader Reader(Cursor);
-    auto Expected = Reader.loadContexts();
-    EXPECT_FALSE(Expected);
-    consumeError(Expected.takeError());
-  }
-}

From 9ae2177843f681c70ad89506155a2cb83eeebfd4 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 16 May 2024 02:41:48 +0800
Subject: [PATCH 70/71] [RISCV] Handle undef AVLs in RISCVInsertVSETVLI

Before #91440 a VSETVLIInfo would have had an IMPLICIT_DEF defining
instruction, but now we look up a VNInfo that doesn't exist, which
triggers an assertion failure. Mark these undef AVLs as AVLIsIgnored.

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp  | 20 +++++++++-------
 llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll | 24 +++++++++++++++++++
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 1c815424bdfa62..363007d7b68b12 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -48,15 +48,13 @@ static cl::opt<bool> DisableInsertVSETVLPHIOpt(
 
 namespace {
 
 /// Given a virtual register \p Reg, return the corresponding VNInfo for it.
-/// This should never return nullptr.
+/// This will return nullptr if the virtual register is an implicit_def.
 static VNInfo *getVNInfoFromReg(Register Reg, const MachineInstr &MI,
                                 const LiveIntervals *LIS) {
   assert(Reg.isVirtual());
   auto &LI = LIS->getInterval(Reg);
   SlotIndex SI = LIS->getSlotIndexes()->getInstructionIndex(MI);
-  VNInfo *VNI = LI.getVNInfoBefore(SI);
-  assert(VNI);
-  return VNI;
+  return LI.getVNInfoBefore(SI);
 }
 
 static unsigned getVLOpNum(const MachineInstr &MI) {
@@ -894,8 +892,12 @@ static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI,
         "Can't handle X0, X0 vsetvli yet");
     if (AVLReg == RISCV::X0)
       NewInfo.setAVLVLMAX();
-    else
-      NewInfo.setAVLRegDef(getVNInfoFromReg(AVLReg, MI, LIS), AVLReg);
+    else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS))
+      NewInfo.setAVLRegDef(VNI, AVLReg);
+    else {
+      assert(MI.getOperand(1).isUndef());
+      NewInfo.setAVLIgnored();
+    }
   }
   NewInfo.setVTYPE(MI.getOperand(2).getImm());
 
@@ -966,9 +968,11 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
       } else
         InstrInfo.setAVLImm(Imm);
+    } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) {
+      InstrInfo.setAVLRegDef(VNI, VLOp.getReg());
     } else {
-      InstrInfo.setAVLRegDef(getVNInfoFromReg(VLOp.getReg(), MI, LIS),
-                             VLOp.getReg());
+      assert(VLOp.isUndef());
+      InstrInfo.setAVLIgnored();
     }
   } else {
     assert(isScalarExtractInstr(MI));
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 12bb4d27b0f979..da0c1cfb50097c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -699,3 +699,27 @@ declare <vscale x 2 x i1> @llvm.riscv.vmsgt.nxv2i32.i32.i64(<vscale x 2 x i32>,
 declare <vscale x 2 x i1> @llvm.riscv.vmor.nxv2i1.i64(<vscale x 2 x i1>, <vscale x 2 x i1>, i64)
 declare void @llvm.riscv.vse.mask.nxv2i32.i64(<vscale x 2 x i32>, ptr nocapture, <vscale x 2 x i1>, i64)
 declare void @llvm.riscv.vse.nxv2i32.i64(<vscale x 2 x i32>, ptr nocapture, i64)
+
+define <vscale x 2 x i32> @avl_undef1(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>) {
+; CHECK-LABEL: avl_undef1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v9, v10
+; CHECK-NEXT:    ret
+  %a = call <vscale x 2 x i32> @llvm.riscv.vadd.nxv2i32.nxv2i32(
+    <vscale x 2 x i32> %0,
+    <vscale x 2 x i32> %1,
+    <vscale x 2 x i32> %2,
+    i64 undef
+  )
+  ret <vscale x 2 x i32> %a
+}
+
+define i64 @avl_undef2() {
+; CHECK-LABEL: avl_undef2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    ret
+  %1 = tail call i64 @llvm.riscv.vsetvli(i64 poison, i64 2, i64 7)
+  ret i64 %1
+}

From 239d79c47e4cf502f1fc7dfe744d6ede9a50ae1e Mon Sep 17 00:00:00 2001
From: Jacob Lambert
Date: Wed, 15 May 2024 09:08:06 -0700
Subject: [PATCH 71/71] [Comgr] Shift name_expression_map test to HIP tests

The name_expression_map test compiles/processes HIP source code, so it
should be run alongside the other HIP tests.

Change-Id: Icfa4f9488d90f22841f590d4c3b7ef986eff2d99
---
 amd/comgr/test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/amd/comgr/test/CMakeLists.txt b/amd/comgr/test/CMakeLists.txt
index f0bf636fd3c7ce..39372323ed21e4 100644
--- a/amd/comgr/test/CMakeLists.txt
+++ b/amd/comgr/test/CMakeLists.txt
@@ -246,7 +246,6 @@ add_comgr_test(lookup_code_object_test c)
 add_comgr_test(symbolize_test c)
 add_comgr_test(mangled_names_test c)
 add_comgr_test(multithread_test cpp)
-add_comgr_test(name_expression_map_test c)
 add_comgr_test(nested_kernel_test c)
 add_comgr_test(map_elf_virtual_address_test c)
 add_comgr_test(compile_source_to_executable c)
@@ -264,4 +263,5 @@ if (DEFINED HIP_COMPILER AND "${HIP_COMPILER}" STREQUAL "clang")
   add_comgr_test(unbundle_hip_test c)
   add_comgr_test(compile_hip_to_relocatable c)
   add_comgr_test(mangled_names_hip_test c)
+  add_comgr_test(name_expression_map_test c)
 endif()
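
The final patch relies on a common CMake idiom: tests whose sources need a
particular toolchain are registered inside a guard, so configurations without
that toolchain never build or run them. A minimal sketch of the idiom follows;
add_example_test and the test names are hypothetical stand-ins for Comgr's
real add_comgr_test helper, used here only for illustration.

  # Hypothetical helper standing in for add_comgr_test: builds the test
  # executable from <name>.<lang> and registers it with CTest.
  function(add_example_test name lang)
    add_executable(${name} ${name}.${lang})
    add_test(NAME ${name} COMMAND ${name})
  endfunction()

  # Registered unconditionally: buildable with any C compiler.
  add_example_test(always_built_test c)

  # Registered only when the build was configured with a clang-based HIP
  # compiler; non-HIP configurations skip this block entirely.
  if (DEFINED HIP_COMPILER AND "${HIP_COMPILER}" STREQUAL "clang")
    add_example_test(hip_dependent_test c)
  endif()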